aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-05-28 15:24:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-05-28 15:24:36 -0700
commit1b98f357dadd6ea613a435fbaef1a5dd7b35fd21 (patch)
tree32a7195aead30f4dcadf3c3f897df2b4611b88b8 /net
parentMerge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux (diff)
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net (diff)
downloadwireguard-linux-1b98f357dadd6ea613a435fbaef1a5dd7b35fd21.tar.xz
wireguard-linux-1b98f357dadd6ea613a435fbaef1a5dd7b35fd21.zip
Merge tag 'net-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Implement the Device Memory TCP transmit path, allowing zero-copy data transmission on top of TCP from e.g. GPU memory to the wire. - Move all the IPv6 routing tables management outside the RTNL scope, under its own lock and RCU. The route control path is now 3x faster. - Convert queue related netlink ops to instance lock, reducing again the scope of the RTNL lock. This improves the control plane scalability. - Refactor the software crc32c implementation, removing unneeded abstraction layers and improving significantly the related micro-benchmarks. - Optimize the GRO engine for UDP-tunneled traffic, for a 10% performance improvement in related stream tests. - Cover more per-CPU storage with local nested BH locking; this is prep work to remove the current per-CPU lock in local_bh_disable() on PREEMPT_RT. - Introduce and use nlmsg_payload helper, combining buffer bounds verification with accessing payload carried by netlink messages. Netfilter: - Rewrite the procfs conntrack table implementation, improving considerably the dump performance. A lot of user-space tools still use this interface. - Implement support for wildcard netdevice in netdev basechain and flowtables. - Integrate conntrack information into nft trace infrastructure. - Export set count and backend name to userspace, for better introspection. BPF: - BPF qdisc support: BPF-qdisc can be implemented with BPF struct_ops programs and can be controlled in a similar way to traditional qdiscs using the "tc qdisc" command. - Refactor the UDP socket iterator, addressing long standing issues WRT duplicate hits or missed sockets. Protocols: - Improve TCP receive buffer auto-tuning and increase the default upper bound for the receive buffer; overall this improves the single flow maximum throughput on 200Gbs link by over 60%. - Add AFS GSSAPI security class to AF_RXRPC; it provides transport security for connections to the AFS fileserver and VL server. 
- Improve TCP multipath routing, so that the source address always matches the nexthop device. - Introduce SO_PASSRIGHTS for AF_UNIX, to allow disabling SCM_RIGHTS, and thus preventing DoS caused by passing around problematic FDs. - Retire DCCP socket. DCCP only receives updates for bugs, and major distros disable it by default. Its removal allows for better organisation of TCP fields to reduce the number of cache lines hit in the fast path. - Extend TCP drop-reason support to cover PAWS checks. Driver API: - Reorganize PTP ioctl flag support to require an explicit opt-in for the drivers, avoiding the problem of drivers not rejecting new unsupported flags. - Converted several device drivers to timestamping APIs. - Introduce per-PHY ethtool dump helpers, improving the support for dump operations targeting PHYs. Tests and tooling: - Add support for classic netlink in user space C codegen, so that ynl-c can now read, create and modify links, routes, addresses and qdisc layer configuration. - Add ynl sub-types for binary attributes, allowing ynl-c to output known struct instead of raw binary data, clarifying the classic netlink output. - Extend MPTCP selftests to improve the code-coverage. - Add tests for XDP tail adjustment in AF_XDP. New hardware / drivers: - OpenVPN virtual driver: offload OpenVPN data channels processing to the kernel-space, increasing the data transfer throughput WRT the user-space implementation. - Renesas glue driver for the gigabit ethernet RZ/V2H(P) SoC. - Broadcom asp-v3.0 ethernet driver. - AMD Renoir ethernet device. - RealTek MT9888 2.5G ethernet PHY driver. - Aeonsemi 10G C45 PHYs driver. 
Drivers: - Ethernet high-speed NICs: - nVidia/Mellanox (mlx5): - refactor the steering table handling to significantly reduce the amount of memory used - add support for complex matches in H/W flow steering - improve flow steering error handling - convert to netdev instance locking - Intel (100G, ice, igb, ixgbe, idpf): - ice: add switchdev support for LLDP traffic over VF - ixgbe: add firmware manipulation and regions devlink support - igb: introduce support for frame transmission preemption - igb: adds persistent NAPI configuration - idpf: introduce RDMA support - idpf: add initial PTP support - Meta (fbnic): - extend hardware stats coverage - add devlink dev flash support - Broadcom (bnxt): - add support for RX-side device memory TCP - Wangxun (txgbe): - implement support for udp tunnel offload - complete PTP and SRIOV support for AML 25G/10G devices - Ethernet NICs embedded and virtual: - Google (gve): - add device memory TCP TX support - Amazon (ena): - support persistent per-NAPI config - Airoha: - add H/W support for L2 traffic offload - add per flow stats for flow offloading - RealTek (rtl8211): add support for WoL magic packet - Synopsys (stmmac): - dwmac-socfpga 1000BaseX support - add Loongson-2K3000 support - introduce support for hardware-accelerated VLAN stripping - Broadcom (bcmgenet): - expose more H/W stats - Freescale (enetc, dpaa2-eth): - enetc: add MAC filter, VLAN filter RSS and loopback support - dpaa2-eth: convert to H/W timestamping APIs - vxlan: convert FDB table to rhashtable, for better scalability - veth: apply qdisc backpressure on full ring to reduce TX drops - Ethernet switches: - Microchip (ksz88x3): add ETS scheduler support - Ethernet PHYs: - RealTek (rtl8211): - add support for WoL magic packet - add support for PHY LEDs - CAN: - Adds RZ/G3E CANFD support to the rcar_canfd driver. - Preparatory work for CAN-XL support. - Add self-tests framework with support for CAN physical interfaces. 
- WiFi: - mac80211: - scan improvements with multi-link operation (MLO) - Qualcomm (ath12k): - enable AHB support for IPQ5332 - add monitor interface support to QCN9274 - add multi-link operation support to WCN7850 - add 802.11d scan offload support to WCN7850 - monitor mode for WCN7850, better 6 GHz regulatory - Qualcomm (ath11k): - restore hibernation support - MediaTek (mt76): - WiFi-7 improvements - implement support for mt7990 - Intel (iwlwifi): - enhanced multi-link single-radio (EMLSR) support on 5 GHz links - rework device configuration - RealTek (rtw88): - improve throughput for RTL8814AU - RealTek (rtw89): - add multi-link operation support - STA/P2P concurrency improvements - support different SAR configs by antenna - Bluetooth: - introduce HCI Driver protocol - btintel_pcie: do not generate coredump for diagnostic events - btusb: add HCI Drv commands for configuring altsetting - btusb: add RTL8851BE device 0x0bda:0xb850 - btusb: add new VID/PID 13d3/3584 for MT7922 - btusb: add new VID/PID 13d3/3630 and 13d3/3613 for MT7925 - btnxpuart: implement host-wakeup feature" * tag 'net-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1611 commits) selftests/bpf: Fix bpf selftest build warning selftests: netfilter: Fix skip of wildcard interface test net: phy: mscc: Stop clearing the the UDPv4 checksum for L2 frames net: openvswitch: Fix the dead loop of MPLS parse calipso: Don't call calipso functions for AF_INET sk. 
selftests/tc-testing: Add a test for HFSC eltree double add with reentrant enqueue behaviour on netem net_sched: hfsc: Address reentrant enqueue adding class to eltree twice octeontx2-pf: QOS: Refactor TC_HTB_LEAF_DEL_LAST callback octeontx2-pf: QOS: Perform cache sync on send queue teardown net: mana: Add support for Multi Vports on Bare metal net: devmem: ncdevmem: remove unused variable net: devmem: ksft: upgrade rx test to send 1K data net: devmem: ksft: add 5 tuple FS support net: devmem: ksft: add exit_wait to make rx test pass net: devmem: ksft: add ipv4 support net: devmem: preserve sockc_err page_pool: fix ugly page_pool formatting net: devmem: move list_add to net_devmem_bind_dmabuf. selftests: netfilter: nft_queue.sh: include file transfer duration in log message net: phy: mscc: Fix memory leak when using one step timestamping ...
Diffstat (limited to 'net')
-rw-r--r--net/802/Makefile5
-rw-r--r--net/802/p8022.c64
-rw-r--r--net/8021q/vlan.c1
-rw-r--r--net/Kconfig7
-rw-r--r--net/Makefile1
-rw-r--r--net/batman-adv/main.c4
-rw-r--r--net/batman-adv/main.h3
-rw-r--r--net/batman-adv/mesh-interface.c15
-rw-r--r--net/batman-adv/send.c4
-rw-r--r--net/batman-adv/translation-table.c2
-rw-r--r--net/bluetooth/Makefile3
-rw-r--r--net/bluetooth/af_bluetooth.c87
-rw-r--r--net/bluetooth/hci_conn.c79
-rw-r--r--net/bluetooth/hci_core.c45
-rw-r--r--net/bluetooth/hci_drv.c105
-rw-r--r--net/bluetooth/hci_event.c40
-rw-r--r--net/bluetooth/hci_sock.c12
-rw-r--r--net/bluetooth/hci_sync.c63
-rw-r--r--net/bluetooth/iso.c30
-rw-r--r--net/bluetooth/mgmt.c3
-rw-r--r--net/bluetooth/mgmt_util.c2
-rw-r--r--net/bridge/br.c22
-rw-r--r--net/bridge/br_arp_nd_proxy.c7
-rw-r--r--net/bridge/br_input.c3
-rw-r--r--net/bridge/br_mdb.c28
-rw-r--r--net/bridge/br_mst.c4
-rw-r--r--net/bridge/br_multicast.c103
-rw-r--r--net/bridge/br_private.h41
-rw-r--r--net/bridge/br_switchdev.c13
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c12
-rw-r--r--net/core/datagram.c90
-rw-r--r--net/core/dev.c183
-rw-r--r--net/core/dev.h22
-rw-r--r--net/core/dev_api.c11
-rw-r--r--net/core/dev_ioctl.c6
-rw-r--r--net/core/devmem.c132
-rw-r--r--net/core/devmem.h84
-rw-r--r--net/core/dst_cache.c30
-rw-r--r--net/core/fib_rules.c14
-rw-r--r--net/core/lock_debug.c6
-rw-r--r--net/core/lwtunnel.c15
-rw-r--r--net/core/neighbour.c16
-rw-r--r--net/core/net-procfs.c9
-rw-r--r--net/core/net_namespace.c171
-rw-r--r--net/core/netdev-genl-gen.c13
-rw-r--r--net/core/netdev-genl-gen.h1
-rw-r--r--net/core/netdev-genl.c157
-rw-r--r--net/core/netmem_priv.h33
-rw-r--r--net/core/page_pool.c89
-rw-r--r--net/core/pktgen.c103
-rw-r--r--net/core/rtnetlink.c63
-rw-r--r--net/core/scm.c122
-rw-r--r--net/core/secure_seq.c42
-rw-r--r--net/core/skbuff.c214
-rw-r--r--net/core/sock.c104
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/core/sysctl_net_core.c6
-rw-r--r--net/core/utils.c8
-rw-r--r--net/core/xdp.c72
-rw-r--r--net/dccp/Kconfig46
-rw-r--r--net/dccp/Makefile30
-rw-r--r--net/dccp/ackvec.c403
-rw-r--r--net/dccp/ackvec.h136
-rw-r--r--net/dccp/ccid.c219
-rw-r--r--net/dccp/ccid.h262
-rw-r--r--net/dccp/ccids/Kconfig55
-rw-r--r--net/dccp/ccids/ccid2.c794
-rw-r--r--net/dccp/ccids/ccid2.h121
-rw-r--r--net/dccp/ccids/ccid3.c866
-rw-r--r--net/dccp/ccids/ccid3.h148
-rw-r--r--net/dccp/ccids/lib/loss_interval.c184
-rw-r--r--net/dccp/ccids/lib/loss_interval.h69
-rw-r--r--net/dccp/ccids/lib/packet_history.c439
-rw-r--r--net/dccp/ccids/lib/packet_history.h142
-rw-r--r--net/dccp/ccids/lib/tfrc.c46
-rw-r--r--net/dccp/ccids/lib/tfrc.h73
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c702
-rw-r--r--net/dccp/dccp.h483
-rw-r--r--net/dccp/diag.c85
-rw-r--r--net/dccp/feat.c1581
-rw-r--r--net/dccp/feat.h133
-rw-r--r--net/dccp/input.c739
-rw-r--r--net/dccp/ipv4.c1101
-rw-r--r--net/dccp/ipv6.c1174
-rw-r--r--net/dccp/ipv6.h27
-rw-r--r--net/dccp/minisocks.c266
-rw-r--r--net/dccp/options.c609
-rw-r--r--net/dccp/output.c708
-rw-r--r--net/dccp/proto.c1293
-rw-r--r--net/dccp/qpolicy.c136
-rw-r--r--net/dccp/sysctl.c107
-rw-r--r--net/dccp/timer.c272
-rw-r--r--net/dccp/trace.h82
-rw-r--r--net/devlink/dev.c2
-rw-r--r--net/devlink/health.c52
-rw-r--r--net/devlink/netlink_gen.c29
-rw-r--r--net/devlink/param.c46
-rw-r--r--net/dsa/port.c10
-rw-r--r--net/dsa/user.c41
-rw-r--r--net/ethtool/common.c29
-rw-r--r--net/ethtool/ioctl.c99
-rw-r--r--net/ethtool/mm.c279
-rw-r--r--net/ethtool/netlink.c217
-rw-r--r--net/ethtool/netlink.h4
-rw-r--r--net/ethtool/phy.c342
-rw-r--r--net/ethtool/tsinfo.c23
-rw-r--r--net/hsr/hsr_device.c5
-rw-r--r--net/hsr/hsr_main.c9
-rw-r--r--net/hsr/hsr_main.h1
-rw-r--r--net/hsr/hsr_slave.c2
-rw-r--r--net/ieee802154/nl-phy.c6
-rw-r--r--net/ipv4/Kconfig2
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/devinet.c4
-rw-r--r--net/ipv4/fib_frontend.c8
-rw-r--r--net/ipv4/fib_semantics.c50
-rw-r--r--net/ipv4/gre_demux.c2
-rw-r--r--net/ipv4/inet_connection_sock.c23
-rw-r--r--net/ipv4/inet_diag.c4
-rw-r--r--net/ipv4/inet_hashtables.c36
-rw-r--r--net/ipv4/inet_timewait_sock.c4
-rw-r--r--net/ipv4/ip_gre.c27
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/ip_tunnel.c29
-rw-r--r--net/ipv4/ip_vti.c9
-rw-r--r--net/ipv4/ipip.c9
-rw-r--r--net/ipv4/ipmr.c8
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c11
-rw-r--r--net/ipv4/nexthop.c38
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c26
-rw-r--r--net/ipv4/tcp.c53
-rw-r--r--net/ipv4/tcp_fastopen.c1
-rw-r--r--net/ipv4/tcp_input.c110
-rw-r--r--net/ipv4/tcp_ipv4.c7
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c5
-rw-r--r--net/ipv4/udp.c227
-rw-r--r--net/ipv4/udp_offload.c172
-rw-r--r--net/ipv4/udp_tunnel_core.c15
-rw-r--r--net/ipv6/addrconf.c12
-rw-r--r--net/ipv6/addrlabel.c8
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/inet6_connection_sock.c2
-rw-r--r--net/ipv6/ioam6_iptunnel.c76
-rw-r--r--net/ipv6/ip6_fib.c115
-rw-r--r--net/ipv6/ip6_gre.c22
-rw-r--r--net/ipv6/ip6_output.c5
-rw-r--r--net/ipv6/ip6_tunnel.c24
-rw-r--r--net/ipv6/ip6_vti.c27
-rw-r--r--net/ipv6/netfilter.c12
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c6
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c17
-rw-r--r--net/ipv6/route.c424
-rw-r--r--net/ipv6/seg6_hmac.c13
-rw-r--r--net/ipv6/sit.c23
-rw-r--r--net/ipv6/tcp_ipv6.c5
-rw-r--r--net/ipv6/udp.c2
-rw-r--r--net/ipv6/udp_offload.c5
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/mac80211/cfg.c78
-rw-r--r--net/mac80211/chan.c3
-rw-r--r--net/mac80211/debugfs_sta.c6
-rw-r--r--net/mac80211/ibss.c19
-rw-r--r--net/mac80211/ieee80211_i.h16
-rw-r--r--net/mac80211/iface.c87
-rw-r--r--net/mac80211/link.c90
-rw-r--r--net/mac80211/mesh.c10
-rw-r--r--net/mac80211/mesh_hwmp.c6
-rw-r--r--net/mac80211/mesh_pathtbl.c2
-rw-r--r--net/mac80211/mesh_plink.c10
-rw-r--r--net/mac80211/mlme.c4
-rw-r--r--net/mac80211/parse.c3
-rw-r--r--net/mac80211/rate.c12
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c13
-rw-r--r--net/mac80211/scan.c18
-rw-r--r--net/mac80211/spectmgmt.c55
-rw-r--r--net/mac80211/sta_info.c28
-rw-r--r--net/mac80211/sta_info.h11
-rw-r--r--net/mac80211/tdls.c4
-rw-r--r--net/mac80211/tx.c35
-rw-r--r--net/mac80211/util.c25
-rw-r--r--net/mctp/device.c4
-rw-r--r--net/mctp/neigh.c5
-rw-r--r--net/mpls/af_mpls.c8
-rw-r--r--net/mptcp/mib.c1
-rw-r--r--net/mptcp/mib.h1
-rw-r--r--net/mptcp/pm.c5
-rw-r--r--net/mptcp/protocol.c12
-rw-r--r--net/mptcp/protocol.h10
-rw-r--r--net/mptcp/sched.c35
-rw-r--r--net/mptcp/subflow.c12
-rw-r--r--net/ncsi/internal.h23
-rw-r--r--net/ncsi/ncsi-pkt.h23
-rw-r--r--net/ncsi/ncsi-rsp.c39
-rw-r--r--net/netfilter/Kconfig6
-rw-r--r--net/netfilter/core.c3
-rw-r--r--net/netfilter/ipvs/Kconfig2
-rw-r--r--net/netfilter/nf_conntrack_core.c10
-rw-r--r--net/netfilter/nf_conntrack_standalone.c88
-rw-r--r--net/netfilter/nf_dup_netdev.c22
-rw-r--r--net/netfilter/nf_tables_api.c428
-rw-r--r--net/netfilter/nf_tables_offload.c51
-rw-r--r--net/netfilter/nf_tables_trace.c54
-rw-r--r--net/netfilter/nfnetlink.c1
-rw-r--r--net/netfilter/nft_chain_filter.c94
-rw-r--r--net/netfilter/nft_flow_offload.c2
-rw-r--r--net/netfilter/nft_inner.c18
-rw-r--r--net/netfilter/nft_quota.c20
-rw-r--r--net/netfilter/nft_set_pipapo.c64
-rw-r--r--net/netfilter/nft_tunnel.c8
-rw-r--r--net/netfilter/xt_IDLETIMER.c12
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c4
-rw-r--r--net/netfilter/xt_cgroup.c26
-rw-r--r--net/netfilter/xt_mark.c2
-rw-r--r--net/netlabel/netlabel_kapi.c3
-rw-r--r--net/netlink/policy.c5
-rw-r--r--net/openvswitch/Kconfig2
-rw-r--r--net/openvswitch/actions.c86
-rw-r--r--net/openvswitch/datapath.c33
-rw-r--r--net/openvswitch/datapath.h52
-rw-r--r--net/openvswitch/flow.c2
-rw-r--r--net/openvswitch/flow_netlink.c3
-rw-r--r--net/packet/af_packet.c21
-rw-r--r--net/packet/internal.h1
-rw-r--r--net/rds/connection.c6
-rw-r--r--net/rds/page.c25
-rw-r--r--net/rxrpc/Kconfig23
-rw-r--r--net/rxrpc/Makefile6
-rw-r--r--net/rxrpc/af_rxrpc.c130
-rw-r--r--net/rxrpc/ar-internal.h83
-rw-r--r--net/rxrpc/call_accept.c34
-rw-r--r--net/rxrpc/call_object.c24
-rw-r--r--net/rxrpc/conn_event.c134
-rw-r--r--net/rxrpc/conn_object.c2
-rw-r--r--net/rxrpc/insecure.c13
-rw-r--r--net/rxrpc/io_thread.c12
-rw-r--r--net/rxrpc/key.c187
-rw-r--r--net/rxrpc/oob.c379
-rw-r--r--net/rxrpc/output.c60
-rw-r--r--net/rxrpc/peer_object.c22
-rw-r--r--net/rxrpc/protocol.h20
-rw-r--r--net/rxrpc/recvmsg.c132
-rw-r--r--net/rxrpc/rxgk.c1371
-rw-r--r--net/rxrpc/rxgk_app.c286
-rw-r--r--net/rxrpc/rxgk_common.h139
-rw-r--r--net/rxrpc/rxgk_kdf.c288
-rw-r--r--net/rxrpc/rxkad.c296
-rw-r--r--net/rxrpc/rxperf.c78
-rw-r--r--net/rxrpc/security.c3
-rw-r--r--net/rxrpc/sendmsg.c25
-rw-r--r--net/rxrpc/server_key.c42
-rw-r--r--net/rxrpc/txbuf.c8
-rw-r--r--net/sched/Kconfig14
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_api.c16
-rw-r--r--net/sched/act_mirred.c28
-rw-r--r--net/sched/bpf_qdisc.c475
-rw-r--r--net/sched/sch_api.c11
-rw-r--r--net/sched/sch_frag.c10
-rw-r--r--net/sched/sch_generic.c7
-rw-r--r--net/sched/sch_hfsc.c9
-rw-r--r--net/sctp/Kconfig2
-rw-r--r--net/sctp/associola.c18
-rw-r--r--net/sctp/offload.c1
-rw-r--r--net/sctp/sm_make_chunk.c8
-rw-r--r--net/sctp/socket.c9
-rw-r--r--net/strparser/strparser.c13
-rw-r--r--net/tipc/crypto.c2
-rw-r--r--net/tipc/link.c2
-rw-r--r--net/tipc/node.c2
-rw-r--r--net/unix/af_unix.c104
-rw-r--r--net/vmw_vsock/af_vsock.c33
-rw-r--r--net/vmw_vsock/virtio_transport_common.c52
-rw-r--r--net/wireless/nl80211.c42
-rw-r--r--net/xdp/xsk_buff_pool.c6
-rw-r--r--net/xfrm/xfrm_device.c18
-rw-r--r--net/xfrm/xfrm_interface_core.c34
-rw-r--r--net/xfrm/xfrm_nat_keepalive.c30
-rw-r--r--net/xfrm/xfrm_policy.c4
-rw-r--r--net/xfrm/xfrm_state.c46
-rw-r--r--net/xfrm/xfrm_user.c77
285 files changed, 9399 insertions, 16402 deletions
diff --git a/net/802/Makefile b/net/802/Makefile
index bfed80221b8b..99abc29d537c 100644
--- a/net/802/Makefile
+++ b/net/802/Makefile
@@ -3,12 +3,11 @@
# Makefile for the Linux 802.x protocol layers.
#
-# Check the p8022 selections against net/core/Makefile.
-obj-$(CONFIG_LLC) += p8022.o psnap.o
+obj-$(CONFIG_LLC) += psnap.o
obj-$(CONFIG_NET_FC) += fc.o
obj-$(CONFIG_FDDI) += fddi.o
obj-$(CONFIG_HIPPI) += hippi.o
-obj-$(CONFIG_ATALK) += p8022.o psnap.o
+obj-$(CONFIG_ATALK) += psnap.o
obj-$(CONFIG_STP) += stp.o
obj-$(CONFIG_GARP) += garp.o
obj-$(CONFIG_MRP) += mrp.o
diff --git a/net/802/p8022.c b/net/802/p8022.c
deleted file mode 100644
index 78c25168d7c9..000000000000
--- a/net/802/p8022.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * NET3: Support for 802.2 demultiplexing off Ethernet
- *
- * Demultiplex 802.2 encoded protocols. We match the entry by the
- * SSAP/DSAP pair and then deliver to the registered datalink that
- * matches. The control byte is ignored and handling of such items
- * is up to the routine passed the frame.
- *
- * Unlike the 802.3 datalink we have a list of 802.2 entries as
- * there are multiple protocols to demux. The list is currently
- * short (3 or 4 entries at most). The current demux assumes this.
- */
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <net/datalink.h>
-#include <linux/mm.h>
-#include <linux/in.h>
-#include <linux/init.h>
-#include <net/llc.h>
-#include <net/p8022.h>
-
-static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
- const unsigned char *dest)
-{
- llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
- return 0;
-}
-
-struct datalink_proto *register_8022_client(unsigned char type,
- int (*func)(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev))
-{
- struct datalink_proto *proto;
-
- proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
- if (proto) {
- proto->type[0] = type;
- proto->header_length = 3;
- proto->request = p8022_request;
- proto->sap = llc_sap_open(type, func);
- if (!proto->sap) {
- kfree(proto);
- proto = NULL;
- }
- }
- return proto;
-}
-
-void unregister_8022_client(struct datalink_proto *proto)
-{
- llc_sap_put(proto->sap);
- kfree(proto);
-}
-
-EXPORT_SYMBOL(register_8022_client);
-EXPORT_SYMBOL(unregister_8022_client);
-
-MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet");
-MODULE_LICENSE("GPL");
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 41be38264493..06908e37c3d9 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -23,7 +23,6 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
-#include <net/p8022.h>
#include <net/arp.h>
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
diff --git a/net/Kconfig b/net/Kconfig
index c3fca69a7c83..ebc80a98fc91 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -68,13 +68,17 @@ config SKB_EXTENSIONS
config NET_DEVMEM
def_bool y
+ select GENERIC_ALLOCATOR
depends on DMA_SHARED_BUFFER
- depends on GENERIC_ALLOCATOR
depends on PAGE_POOL
config NET_SHAPER
bool
+config NET_CRC32C
+ bool
+ select CRC32
+
menu "Networking options"
source "net/packet/Kconfig"
@@ -245,7 +249,6 @@ source "net/bridge/netfilter/Kconfig"
endif
-source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/rds/Kconfig"
source "net/tipc/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 60ed5190eda8..aac960c41db6 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/
ifneq ($(CONFIG_VLAN_8021Q),)
obj-y += 8021q/
endif
-obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
obj-$(CONFIG_RDS) += rds/
obj-$(CONFIG_WIRELESS) += wireless/
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index a08132888a3d..c0bc75513355 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -11,7 +11,7 @@
#include <linux/build_bug.h>
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
-#include <linux/crc32c.h>
+#include <linux/crc32.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/gfp.h>
@@ -69,8 +69,6 @@ unsigned int batadv_hardif_generation;
static int (*batadv_rx_handler[256])(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
-unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
struct workqueue_struct *batadv_event_workqueue;
static void batadv_recv_handler_init(void);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 67af435ee04e..692109be2210 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2025.1"
+#define BATADV_SOURCE_VERSION "2025.2"
#endif
/* B.A.T.M.A.N. parameters */
@@ -235,7 +235,6 @@ static inline int batadv_print_vid(unsigned short vid)
extern struct list_head batadv_hardif_list;
extern unsigned int batadv_hardif_generation;
-extern unsigned char batadv_broadcast_addr[];
extern struct workqueue_struct *batadv_event_workqueue;
int batadv_mesh_init(struct net_device *mesh_iface);
diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c
index 59e7b5aacbc9..5bbc366f974d 100644
--- a/net/batman-adv/mesh-interface.c
+++ b/net/batman-adv/mesh-interface.c
@@ -36,7 +36,6 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <net/net_namespace.h>
#include <net/netlink.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -77,18 +76,6 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
return 0;
}
-static int batadv_interface_open(struct net_device *dev)
-{
- netif_start_queue(dev);
- return 0;
-}
-
-static int batadv_interface_release(struct net_device *dev)
-{
- netif_stop_queue(dev);
- return 0;
-}
-
/**
* batadv_sum_counter() - Sum the cpu-local counters for index 'idx'
* @bat_priv: the bat priv with all the mesh interface information
@@ -890,8 +877,6 @@ out:
static const struct net_device_ops batadv_netdev_ops = {
.ndo_init = batadv_meshif_init_late,
- .ndo_open = batadv_interface_open,
- .ndo_stop = batadv_interface_release,
.ndo_get_stats = batadv_interface_stats,
.ndo_vlan_rx_add_vid = batadv_interface_add_vid,
.ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 735ac8077821..9d72f4f15b3d 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -124,7 +124,9 @@ send_skb_err:
int batadv_send_broadcast_skb(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface)
{
- return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+ static const u8 broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+ return batadv_send_skb_packet(skb, hard_iface, broadcast_addr);
}
/**
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 4a3165920de1..8d0e04e770cb 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -14,7 +14,7 @@
#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
-#include <linux/crc32c.h>
+#include <linux/crc32.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 5a3835b7dfcd..a7eede7616d8 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o
bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \
- ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o
+ ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o \
+ hci_drv.o
bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 0b4d0a8bd361..6ad2f72f53f4 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -34,6 +34,9 @@
#include <net/bluetooth/bluetooth.h>
#include <linux/proc_fs.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
#include "leds.h"
#include "selftest.h"
@@ -563,6 +566,86 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(bt_sock_poll);
+static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index,
+ void __user *useraddr)
+{
+ struct ethtool_ts_info info;
+ struct kernel_ethtool_ts_info ts_info = {};
+ int ret;
+
+ ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info);
+ if (ret == -ENODEV)
+ return ret;
+ else if (ret < 0)
+ return -EIO;
+
+ memset(&info, 0, sizeof(info));
+
+ info.cmd = ETHTOOL_GET_TS_INFO;
+ info.so_timestamping = ts_info.so_timestamping;
+ info.phc_index = ts_info.phc_index;
+ info.tx_types = ts_info.tx_types;
+ info.rx_filters = ts_info.rx_filters;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bt_ethtool(struct sock *sk, const struct ifreq *ifr,
+ void __user *useraddr)
+{
+ unsigned int index;
+ u32 ethcmd;
+ int n;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 ||
+ n != strlen(ifr->ifr_name))
+ return -ENODEV;
+
+ switch (ethcmd) {
+ case ETHTOOL_GET_TS_INFO:
+ return bt_ethtool_get_ts_info(sk, index, useraddr);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
+{
+ struct sock *sk = sock->sk;
+ struct ifreq ifr = {};
+ void __user *data;
+ char *colon;
+ int ret = -ENOIOCTLCMD;
+
+ if (get_user_ifreq(&ifr, &data, arg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ switch (cmd) {
+ case SIOCETHTOOL:
+ ret = bt_ethtool(sk, &ifr, data);
+ break;
+ }
+
+ if (colon)
+ *colon = ':';
+
+ if (put_user_ifreq(&ifr, arg))
+ return -EFAULT;
+
+ return ret;
+}
+
int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
@@ -595,6 +678,10 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = put_user(amount, (int __user *)arg);
break;
+ case SIOCETHTOOL:
+ err = bt_dev_ioctl(sock, cmd, (void __user *)arg);
+ break;
+
default:
err = -ENOIOCTLCMD;
break;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 946d2ae551f8..99efeed6a766 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -785,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c
d->sync_handle = conn->sync_handle;
if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
- hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK,
+ hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK,
HCI_CONN_PA_SYNC, d);
if (!d->count)
@@ -795,7 +795,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c
}
if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) {
- hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK,
+ hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK,
HCI_CONN_BIG_SYNC, d);
if (!d->count)
@@ -885,9 +885,11 @@ static void cis_cleanup(struct hci_conn *conn)
/* Check if ISO connection is a CIS and remove CIG if there are
* no other connections using it.
*/
- hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_BOUND, &d);
- hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &d);
- hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT,
+ &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED,
+ &d);
if (d.count)
return;
@@ -910,7 +912,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
if (!hdev->acl_mtu)
return ERR_PTR(-ECONNREFUSED);
break;
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
if (hdev->iso_mtu)
/* Dedicated ISO Buffer exists */
break;
@@ -974,7 +977,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
break;
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
/* conn->src should reflect the local identity address */
hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
@@ -1071,7 +1075,8 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
if (HCI_CONN_HANDLE_UNSET(conn->handle))
hci_conn_failed(conn, reason);
break;
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
if ((conn->state != BT_CONNECTED &&
!test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) ||
test_bit(HCI_CONN_BIG_CREATED, &conn->flags))
@@ -1146,7 +1151,8 @@ void hci_conn_del(struct hci_conn *conn)
hdev->acl_cnt += conn->sent;
} else {
/* Unacked ISO frames */
- if (conn->type == ISO_LINK) {
+ if (conn->type == CIS_LINK ||
+ conn->type == BIS_LINK) {
if (hdev->iso_pkts)
hdev->iso_cnt += conn->sent;
else if (hdev->le_pkts)
@@ -1532,7 +1538,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
memcmp(conn->le_per_adv_data, base, base_len)))
return ERR_PTR(-EADDRINUSE);
- conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+ conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER);
if (IS_ERR(conn))
return conn;
@@ -1740,7 +1746,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
data.count = 0;
/* Create a BIS for each bound connection */
- hci_conn_hash_list_state(hdev, bis_list, ISO_LINK,
+ hci_conn_hash_list_state(hdev, bis_list, BIS_LINK,
BT_BOUND, &data);
cp.handle = qos->bcast.big;
@@ -1829,12 +1835,12 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
for (data.cig = 0x00; data.cig < 0xf0; data.cig++) {
data.count = 0;
- hci_conn_hash_list_state(hdev, find_cis, ISO_LINK,
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
BT_CONNECT, &data);
if (data.count)
continue;
- hci_conn_hash_list_state(hdev, find_cis, ISO_LINK,
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
BT_CONNECTED, &data);
if (!data.count)
break;
@@ -1884,7 +1890,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
qos->ucast.cis);
if (!cis) {
- cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
+ cis = hci_conn_add_unset(hdev, CIS_LINK, dst,
+ HCI_ROLE_MASTER);
if (IS_ERR(cis))
return cis;
cis->cleanup = cis_cleanup;
@@ -1976,7 +1983,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
int hci_conn_check_create_cis(struct hci_conn *conn)
{
- if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY))
+ if (conn->type != CIS_LINK)
return -EINVAL;
if (!conn->parent || conn->parent->state != BT_CONNECTED ||
@@ -2070,7 +2077,9 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
{
struct hci_conn *conn;
- conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE);
+ bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid);
+
+ conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_SLAVE);
if (IS_ERR(conn))
return conn;
@@ -2219,7 +2228,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
* the start periodic advertising and create BIG commands have
* been queued
*/
- hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK,
+ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
BT_BOUND, &data);
/* Queue start periodic advertising and create BIG */
@@ -2951,7 +2960,8 @@ void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb)
* TODO: SCO support without flowctl (needs to be done in drivers)
*/
switch (conn->type) {
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
case ACL_LINK:
case LE_LINK:
break;
@@ -3047,3 +3057,36 @@ u8 *hci_conn_key_enc_size(struct hci_conn *conn)
return NULL;
}
+/* Fill @info with the timestamping capabilities reported for the controller
+ * with index @index and sockets of protocol @sk_proto.
+ *
+ * Returns 0 on success, or -ENODEV when hci_dev_get() finds no device for
+ * @index.
+ */
+int hci_ethtool_ts_info(unsigned int index, int sk_proto,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hci_dev *hdev;
+
+ hdev = hci_dev_get(index);
+ if (!hdev)
+ return -ENODEV;
+ /* Baseline for every protocol: software RX timestamping, phc_index of -1 and only the TX_OFF / FILTER_NONE hardware modes */
+ info->so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ info->tx_types = BIT(HWTSTAMP_TX_OFF);
+ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE);
+ /* TX capabilities depend on the socket protocol; protocols not listed below keep the baseline */
+ switch (sk_proto) {
+ case BTPROTO_ISO:
+ case BTPROTO_L2CAP:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ case BTPROTO_SCO:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) /* TX completion is only advertised for SCO when HCI_SCO_FLOWCTL is set */
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ }
+ /* Pairs with the hci_dev_get() above */
+ hci_dev_put(hdev);
+ return 0;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 5eb0600bbd03..3b49828160b7 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2898,12 +2898,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
break;
case HCI_ACLDATA_PKT:
/* Detect if ISO packet has been sent as ACL */
- if (hci_conn_num(hdev, ISO_LINK)) {
+ if (hci_conn_num(hdev, CIS_LINK) ||
+ hci_conn_num(hdev, BIS_LINK)) {
__u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle);
__u8 type;
type = hci_conn_lookup_type(hdev, hci_handle(handle));
- if (type == ISO_LINK)
+ if (type == CIS_LINK || type == BIS_LINK)
hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
}
break;
@@ -2911,6 +2912,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
break;
case HCI_ISODATA_PKT:
break;
+ case HCI_DRV_PKT:
+ break;
default:
kfree_skb(skb);
return -EINVAL;
@@ -3019,6 +3022,15 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
return -EINVAL;
}
+ if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) {
+ /* Intercept HCI Drv packet here and don't go with hdev->send
+ * callback.
+ */
+ err = hci_drv_process_cmd(hdev, skb);
+ kfree_skb(skb);
+ return err;
+ }
+
err = hdev->send(hdev, skb);
if (err < 0) {
bt_dev_err(hdev, "sending frame failed (%d)", err);
@@ -3345,7 +3357,8 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
case LE_LINK:
cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
break;
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
cnt = hdev->iso_mtu ? hdev->iso_cnt :
hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
break;
@@ -3359,7 +3372,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
}
static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
- int *quote)
+ __u8 type2, int *quote)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *conn = NULL, *c;
@@ -3371,7 +3384,8 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != type || skb_queue_empty(&c->data_q))
+ if ((c->type != type && c->type != type2) ||
+ skb_queue_empty(&c->data_q))
continue;
if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
@@ -3579,7 +3593,7 @@ static void hci_sched_sco(struct hci_dev *hdev, __u8 type)
else
cnt = &hdev->sco_cnt;
- while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
+ while (*cnt && (conn = hci_low_sent(hdev, type, type, &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
hci_send_conn_frame(hdev, conn, skb);
@@ -3707,12 +3721,14 @@ static void hci_sched_iso(struct hci_dev *hdev)
BT_DBG("%s", hdev->name);
- if (!hci_conn_num(hdev, ISO_LINK))
+ if (!hci_conn_num(hdev, CIS_LINK) &&
+ !hci_conn_num(hdev, BIS_LINK))
return;
cnt = hdev->iso_pkts ? &hdev->iso_cnt :
hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
- while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, &quote))) {
+ while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK,
+ &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
hci_send_conn_frame(hdev, conn, skb);
@@ -4057,10 +4073,13 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
return;
}
- err = hci_send_frame(hdev, skb);
- if (err < 0) {
- hci_cmd_sync_cancel_sync(hdev, -err);
- return;
+ if (hci_skb_opcode(skb) != HCI_OP_NOP) {
+ err = hci_send_frame(hdev, skb);
+ if (err < 0) {
+ hci_cmd_sync_cancel_sync(hdev, -err);
+ return;
+ }
+ atomic_dec(&hdev->cmd_cnt);
}
if (hdev->req_status == HCI_REQ_PEND &&
@@ -4068,8 +4087,6 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(hdev->req_skb);
hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
}
-
- atomic_dec(&hdev->cmd_cnt);
}
static void hci_cmd_work(struct work_struct *work)
diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c
new file mode 100644
index 000000000000..3dd2d8a006b9
--- /dev/null
+++ b/net/bluetooth/hci_drv.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google Corporation
+ */
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_drv.h>
+/* Build a HCI_DRV_EV_CMD_STATUS event answering command @cmd with @status
+ * and pass it to hci_recv_frame() as a HCI_DRV_PKT.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated, otherwise whatever
+ * hci_recv_frame() returns.
+ */
+int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_status *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ /* Event header: event opcode and payload length, both little-endian */
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS);
+ hdr->len = __cpu_to_le16(sizeof(*ev));
+ /* Payload: opcode of the command being answered and its status */
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_status);
+/* Build a HCI_DRV_EV_CMD_COMPLETE event answering command @cmd with @status
+ * followed by @rp_len bytes of return parameters copied from @rp, and pass
+ * it to hci_recv_frame() as a HCI_DRV_PKT.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated, otherwise whatever
+ * hci_recv_frame() returns.
+ */
+int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp,
+ size_t rp_len)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_complete *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ /* Event header: the length covers the fixed part plus the return parameters */
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE);
+ hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len);
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+ /* Return parameters are appended right after the fixed part */
+ skb_put_data(skb, rp, rp_len);
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_complete);
+/* Parse the HCI driver command held in @skb and dispatch it to the matching
+ * handler from hdev->hci_drv.
+ *
+ * Returns -EILSEQ if the command header cannot be pulled or its length field
+ * differs from the remaining skb length. Unknown opcodes and payloads whose
+ * size differs from the handler's data_len are answered through
+ * hci_drv_cmd_status() and that call's result is returned. Otherwise the
+ * handler's own return value is returned.
+ */
+int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_drv_cmd_hdr *hdr;
+ const struct hci_drv_handler *handler = NULL;
+ u16 opcode, len, ogf, ocf;
+
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr)
+ return -EILSEQ;
+
+ opcode = __le16_to_cpu(hdr->opcode);
+ len = __le16_to_cpu(hdr->len);
+ if (len != skb->len)
+ return -EILSEQ;
+
+ ogf = hci_opcode_ogf(opcode);
+ ocf = hci_opcode_ocf(opcode);
+ /* Without hdev->hci_drv there is no handler table: every command is unknown */
+ if (!hdev->hci_drv)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+ /* Common handlers are indexed by the full opcode, driver specific ones by the OCF */
+ if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) {
+ if (opcode < hdev->hci_drv->common_handler_count)
+ handler = &hdev->hci_drv->common_handlers[opcode];
+ } else {
+ if (ocf < hdev->hci_drv->specific_handler_count)
+ handler = &hdev->hci_drv->specific_handlers[ocf];
+ }
+
+ if (!handler || !handler->func)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+ /* A handler only accepts a payload of exactly data_len bytes */
+ if (len != handler->data_len)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_INVALID_PARAMETERS);
+
+ return handler->func(hdev, skb->data, len);
+}
+EXPORT_SYMBOL(hci_drv_process_cmd);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c38ada69c3d7..66052d6aaa1d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3804,7 +3804,7 @@ static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status)
lockdep_assert_held(&hdev->lock);
list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) {
- if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) ||
+ if (conn->type != CIS_LINK ||
conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig)
continue;
@@ -4467,7 +4467,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
break;
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
if (hdev->iso_pkts) {
hdev->iso_cnt += count;
if (hdev->iso_cnt > hdev->iso_pkts)
@@ -6351,6 +6352,17 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
info->secondary_phy &= 0x1f;
}
+ /* Check if PA Sync is pending and if the hci_conn SID has not
+ * been set update it.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (conn && conn->sid == HCI_SID_INVALID)
+ conn->sid = info->sid;
+ }
+
if (legacy_evt_type != LE_ADV_INVALID) {
process_adv_report(hdev, legacy_evt_type, &info->bdaddr,
info->bdaddr_type, NULL, 0,
@@ -6402,7 +6414,8 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
conn->sync_handle = le16_to_cpu(ev->handle);
conn->sid = HCI_SID_INVALID;
- mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags);
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, BIS_LINK,
+ &flags);
if (!(mask & HCI_LM_ACCEPT)) {
hci_le_pa_term_sync(hdev, ev->handle);
goto unlock;
@@ -6412,7 +6425,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
goto unlock;
/* Add connection to indicate PA sync event */
- pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
+ pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY,
HCI_ROLE_SLAVE);
if (IS_ERR(pa_sync))
@@ -6443,7 +6456,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
- mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags);
if (!(mask & HCI_LM_ACCEPT))
goto unlock;
@@ -6727,7 +6740,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
goto unlock;
}
- if (conn->type != ISO_LINK) {
+ if (conn->type != CIS_LINK) {
bt_dev_err(hdev,
"Invalid connection link type handle 0x%4.4x",
handle);
@@ -6845,7 +6858,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
if (!acl)
goto unlock;
- mask = hci_proto_connect_ind(hdev, &acl->dst, ISO_LINK, &flags);
+ mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags);
if (!(mask & HCI_LM_ACCEPT)) {
hci_le_reject_cis(hdev, ev->cis_handle);
goto unlock;
@@ -6853,8 +6866,8 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
if (!cis) {
- cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE,
- cis_handle);
+ cis = hci_conn_add(hdev, CIS_LINK, &acl->dst,
+ HCI_ROLE_SLAVE, cis_handle);
if (IS_ERR(cis)) {
hci_le_reject_cis(hdev, ev->cis_handle);
goto unlock;
@@ -6969,7 +6982,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
bt_dev_dbg(hdev, "ignore too large handle %u", handle);
continue;
}
- bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
+ bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY,
HCI_ROLE_SLAVE, handle);
if (IS_ERR(bis))
continue;
@@ -7025,7 +7038,7 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
- mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags);
if (!(mask & HCI_LM_ACCEPT))
goto unlock;
@@ -7155,7 +7168,8 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
/* Only match event if command OGF is for LE */
if (hdev->req_skb &&
- hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 &&
+ (hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 ||
+ hci_skb_opcode(hdev->req_skb) == HCI_OP_NOP) &&
hci_skb_event(hdev->req_skb) == ev->subevent) {
*opcode = hci_skb_opcode(hdev->req_skb);
hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete,
@@ -7511,8 +7525,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
goto done;
}
+ hci_dev_lock(hdev);
kfree_skb(hdev->recv_event);
hdev->recv_event = skb_clone(skb, GFP_KERNEL);
+ hci_dev_unlock(hdev);
event = hdr->evt;
if (!event) {
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 022b86797acd..428ee5c7de7e 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -234,7 +234,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_ISODATA_PKT)
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT)
continue;
} else {
/* Don't send frame to other channel types */
@@ -391,6 +392,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
else
opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT);
break;
+ case HCI_DRV_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT);
+ break;
case HCI_DIAG_PKT:
opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG);
break;
@@ -1860,7 +1867,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT) {
err = -EINVAL;
goto drop;
}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e56b1cbedab9..62d1ff951ebe 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2860,7 +2860,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
if (sent) {
struct hci_conn *conn;
- conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK,
+ conn = hci_conn_hash_lookup_ba(hdev, BIS_LINK,
&sent->bdaddr);
if (conn) {
struct bt_iso_qos *qos = &conn->iso_qos;
@@ -5477,7 +5477,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn,
if (conn->type == LE_LINK)
return hci_le_connect_cancel_sync(hdev, conn, reason);
- if (conn->type == ISO_LINK) {
+ if (conn->type == CIS_LINK) {
/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
* page 1857:
*
@@ -5490,9 +5490,10 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn,
return hci_disconnect_sync(hdev, conn, reason);
/* CIS with no Create CIS sent have nothing to cancel */
- if (bacmp(&conn->dst, BDADDR_ANY))
- return HCI_ERROR_LOCAL_HOST_TERM;
+ return HCI_ERROR_LOCAL_HOST_TERM;
+ }
+ if (conn->type == BIS_LINK) {
/* There is no way to cancel a BIS without terminating the BIG
* which is done later on connection cleanup.
*/
@@ -5554,9 +5555,12 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
{
struct hci_cp_reject_conn_req cp;
- if (conn->type == ISO_LINK)
+ if (conn->type == CIS_LINK)
return hci_le_reject_cis_sync(hdev, conn, reason);
+ if (conn->type == BIS_LINK)
+ return -EINVAL;
+
if (conn->type == SCO_LINK || conn->type == ESCO_LINK)
return hci_reject_sco_sync(hdev, conn, reason);
@@ -6898,20 +6902,37 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
{
+ struct hci_conn *conn = data;
+ struct hci_conn *pa_sync;
+
bt_dev_dbg(hdev, "err %d", err);
- if (!err)
+ if (err == -ECANCELED)
return;
+ hci_dev_lock(hdev);
+
hci_dev_clear_flag(hdev, HCI_PA_SYNC);
- if (err == -ECANCELED)
- return;
+ if (!hci_conn_valid(hdev, conn))
+ clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
- hci_dev_lock(hdev);
+ if (!err)
+ goto unlock;
- hci_update_passive_scan_sync(hdev);
+ /* Add connection to indicate PA sync error */
+ pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, bt_status(err));
+
+unlock:
hci_dev_unlock(hdev);
}
@@ -6925,9 +6946,23 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
if (!hci_conn_valid(hdev, conn))
return -ECANCELED;
+ if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID)
+ return -EINVAL;
+
if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
return -EBUSY;
+ /* Stop scanning if SID has not been set and active scanning is enabled
+ * so we use passive scanning which will be scanning using the allow
+ * list programmed to contain only the connection address.
+ */
+ if (conn->sid == HCI_SID_INVALID &&
+ hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_scan_disable_sync(hdev);
+ hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ }
+
/* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can
* program the address in the allow list so PA advertisements can be
* received.
@@ -6936,6 +6971,14 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
hci_update_passive_scan_sync(hdev);
+ /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update
+ * it.
+ */
+ if (conn->sid == HCI_SID_INVALID)
+ __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+ HCI_EV_LE_EXT_ADV_REPORT,
+ conn->conn_timeout, NULL);
+
memset(&cp, 0, sizeof(cp));
cp.options = qos->bcast.options;
cp.sid = conn->sid;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 2819cda616bc..6e2c752aaa8f 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -941,7 +941,7 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr,
iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type;
- if (sa->iso_bc->bc_sid > 0x0f)
+ if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID)
return -EINVAL;
iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid;
@@ -1330,6 +1330,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
{
struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
struct sock *sk = sock->sk;
+ int len = sizeof(struct sockaddr_iso);
BT_DBG("sock %p, sk %p", sock, sk);
@@ -1338,12 +1339,20 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
if (peer) {
bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst);
sa->iso_bdaddr_type = iso_pi(sk)->dst_type;
+
+ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+ sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid;
+ sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis;
+ memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis,
+ ISO_MAX_NUM_BIS);
+ len += sizeof(struct sockaddr_iso_bc);
+ }
} else {
bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src);
sa->iso_bdaddr_type = iso_pi(sk)->src_type;
}
- return sizeof(struct sockaddr_iso);
+ return len;
}
static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg,
@@ -1988,11 +1997,13 @@ static void iso_conn_ready(struct iso_conn *conn)
hcon->dst_type = iso_pi(parent)->dst_type;
}
- if (ev3) {
+ if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
iso_pi(sk)->qos = iso_pi(parent)->qos;
hcon->iso_qos = iso_pi(sk)->qos;
+ iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid;
iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis;
- memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS);
+ memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis,
+ ISO_MAX_NUM_BIS);
set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
}
@@ -2029,6 +2040,9 @@ static bool iso_match_sid(struct sock *sk, void *data)
{
struct hci_ev_le_pa_sync_established *ev = data;
+ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+ return true;
+
return ev->sid == iso_pi(sk)->bc_sid;
}
@@ -2075,8 +2089,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
if (ev1) {
sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
iso_match_sid, ev1);
- if (sk && !ev1->status)
+ if (sk && !ev1->status) {
iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
+ iso_pi(sk)->bc_sid = ev1->sid;
+ }
goto done;
}
@@ -2203,7 +2219,7 @@ done:
static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
{
- if (hcon->type != ISO_LINK) {
+ if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) {
if (hcon->type != LE_LINK)
return;
@@ -2244,7 +2260,7 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
{
- if (hcon->type != ISO_LINK)
+ if (hcon->type != CIS_LINK && hcon->type != BIS_LINK)
return;
BT_DBG("hcon %p reason %d", hcon, reason);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 46b22708dfbd..261926dccc7e 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3221,7 +3221,8 @@ failed:
static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
{
switch (link_type) {
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
case LE_LINK:
switch (addr_type) {
case ADDR_LE_DEV_PUBLIC:
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index e5ff65e424b5..3713ff490c65 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -304,7 +304,7 @@ void mgmt_mesh_foreach(struct hci_dev *hdev,
{
struct mgmt_mesh_tx *mesh_tx, *tmp;
- list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) {
+ list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) {
if (!sk || mesh_tx->sk == sk)
cb(mesh_tx, data);
}
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 183fcb362f9e..0adeafe11a36 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -284,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
case BR_BOOLOPT_MST_ENABLE:
err = br_mst_set_enabled(br, on, extack);
break;
+ case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+ br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on);
+ break;
default:
/* shouldn't be called with unsupported options */
WARN_ON(1);
@@ -302,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED);
case BR_BOOLOPT_MST_ENABLE:
return br_opt_get(br, BROPT_MST_ENABLED);
+ case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+ return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION);
default:
/* shouldn't be called with unsupported options */
WARN_ON(1);
@@ -363,21 +368,20 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
clear_bit(opt, &br->options);
}
-static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
+static void __net_exit br_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
struct net_device *dev;
- struct net *net;
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- for_each_netdev(net, dev)
- if (netif_is_bridge_master(dev))
- br_dev_delete(dev, dev_to_kill);
+ ASSERT_RTNL_NET(net);
+
+ for_each_netdev(net, dev)
+ if (netif_is_bridge_master(dev))
+ br_dev_delete(dev, dev_to_kill);
}
static struct pernet_operations br_net_ops = {
- .exit_batch_rtnl = br_net_exit_batch_rtnl,
+ .exit_rtnl = br_net_exit_rtnl,
};
static const struct stp_proto br_stp_proto = {
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 115a23054a58..1e2b51769eec 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -160,6 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
if (br_is_neigh_suppress_enabled(p, vid))
return;
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ parp->ar_op == htons(ARPOP_REQUEST))
+ return;
if (parp->ar_op != htons(ARPOP_RREQUEST) &&
parp->ar_op != htons(ARPOP_RREPLY) &&
(ipv4_is_zeronet(sip) || sip == tip)) {
@@ -410,6 +413,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (br_is_neigh_suppress_enabled(p, vid))
return;
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
+ return;
+
if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
!msg->icmph.icmp6_solicited) {
/* prevent flooding to neigh suppress ports */
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 232133a0fd21..5f6ac9bf1527 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -189,7 +189,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) {
if ((mdst && mdst->host_joined) ||
- br_multicast_is_router(brmctx, skb)) {
+ br_multicast_is_router(brmctx, skb) ||
+ br->dev->flags & IFF_ALLMULTI) {
local_rcv = true;
DEV_STATS_INC(br->dev, multicast);
}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 722203b98ff7..400eb872b403 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
e->flags |= MDB_FLAGS_STAR_EXCL;
if (flags & MDB_PG_FLAGS_BLOCKED)
e->flags |= MDB_FLAGS_BLOCKED;
+ if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED)
+ e->flags |= MDB_FLAGS_OFFLOAD_FAILED;
}
static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip,
@@ -517,16 +519,17 @@ static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg)
rtnl_mdb_nlmsg_pg_size(pg);
}
-void br_mdb_notify(struct net_device *dev,
- struct net_bridge_mdb_entry *mp,
- struct net_bridge_port_group *pg,
- int type)
+static void __br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type, bool notify_switchdev)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- br_switchdev_mdb_notify(dev, mp, pg, type);
+ if (notify_switchdev)
+ br_switchdev_mdb_notify(dev, mp, pg, type);
skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC);
if (!skb)
@@ -544,6 +547,21 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
+void br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+ __br_mdb_notify(dev, mp, pg, type, true); /* true: br_switchdev_mdb_notify() is called as well */
+}
+/* Send an RTM_NEWMDB notification for @mp / @pg through __br_mdb_notify()
+ * with notify_switchdev set to false, so br_switchdev_mdb_notify() is not
+ * called.
+ */
+void br_mdb_flag_change_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg)
+{
+ __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false);
+}
+
static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
struct net_device *dev,
int ifindex, u16 vid, u32 pid,
diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c
index 1820f09ff59c..3f24b4ee49c2 100644
--- a/net/bridge/br_mst.c
+++ b/net/bridge/br_mst.c
@@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg,
if (br_vlan_get_state(v) == state)
return;
- br_vlan_set_state(v, state);
-
if (v->vid == vg->pvid)
br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
}
int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index dc331b59b965..fb6f7f2001c9 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -2105,12 +2105,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
}
}
-void br_multicast_enable_port(struct net_bridge_port *port)
+static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
{
- struct net_bridge *br = port->br;
+ struct net_bridge *br = pmctx->port->br;
spin_lock_bh(&br->multicast_lock);
- __br_multicast_enable_port_ctx(&port->multicast_ctx);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+ __br_multicast_enable_port_ctx(pmctx);
spin_unlock_bh(&br->multicast_lock);
}
@@ -2137,11 +2142,67 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
br_multicast_rport_del_notify(pmctx, del);
}
+static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+
+ __br_multicast_disable_port_ctx(pmctx);
+ spin_unlock_bh(&br->multicast_lock);
+}
+/* Enable (@on true) or disable (@on false) multicast handling for @port.
+ *
+ * With CONFIG_BRIDGE_VLAN_FILTERING and BROPT_MCAST_VLAN_SNOOPING_ENABLED
+ * the per-vlan contexts of the port are toggled; otherwise the port's own
+ * multicast_ctx is toggled.
+ */
+static void br_multicast_toggle_port(struct net_bridge_port *port, bool on)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ rcu_read_lock();
+ vg = nbp_vlan_group_rcu(port);
+ if (!vg) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* walk every vlan of the port and toggle its multicast context */
+ list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast_port *pmctx =
+ &vlan->port_mcast_ctx;
+ u8 state = br_vlan_get_state(vlan);
+ /* enable vlan multicast context when state is
+ * LEARNING or FORWARDING; in every other case,
+ * including @on being false, disable it
+ */
+ if (on && br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(pmctx);
+ else
+ br_multicast_disable_port_ctx(pmctx);
+ }
+ rcu_read_unlock();
+ return;
+ }
+#endif
+ /* toggle port multicast context when vlan snooping is disabled */
+ if (on)
+ br_multicast_enable_port_ctx(&port->multicast_ctx);
+ else
+ br_multicast_disable_port_ctx(&port->multicast_ctx);
+}
+/* Turn multicast handling on for @port; br_multicast_toggle_port() decides
+ * whether the port context or the per-vlan contexts are affected.
+ */
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+ br_multicast_toggle_port(port, true);
+}
+
void br_multicast_disable_port(struct net_bridge_port *port)
{
- spin_lock_bh(&port->br->multicast_lock);
- __br_multicast_disable_port_ctx(&port->multicast_ctx);
- spin_unlock_bh(&port->br->multicast_lock);
+ br_multicast_toggle_port(port, false);
}
static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
@@ -4211,6 +4272,32 @@ static void __br_multicast_stop(struct net_bridge_mcast *brmctx)
#endif
}
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ struct net_bridge *br;
+
+ if (!br_vlan_should_use(v))
+ return;
+
+ if (br_vlan_is_master(v))
+ return;
+
+ br = v->port->br;
+
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return;
+
+ if (br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(&v->port_mcast_ctx);
+
+ /* Multicast is not disabled for the vlan when it goes in
+ * blocking state because the timers will expire and stop by
+ * themselves without sending more queries.
+ */
+#endif
+}
+
void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on)
{
struct net_bridge *br;
@@ -4304,9 +4391,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
__br_multicast_open(&br->multicast_ctx);
list_for_each_entry(p, &br->port_list, list) {
if (on)
- br_multicast_disable_port(p);
+ br_multicast_disable_port_ctx(&p->multicast_ctx);
else
- br_multicast_enable_port(p);
+ br_multicast_enable_port_ctx(&p->multicast_ctx);
}
list_for_each_entry(vlan, &vg->vlan_list, vlist)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 4715a8d6dc32..b159aae594c0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc {
u16 vlan_id;
};
-#define MDB_PG_FLAGS_PERMANENT BIT(0)
-#define MDB_PG_FLAGS_OFFLOAD BIT(1)
-#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
-#define MDB_PG_FLAGS_STAR_EXCL BIT(3)
-#define MDB_PG_FLAGS_BLOCKED BIT(4)
+#define MDB_PG_FLAGS_PERMANENT BIT(0)
+#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define MDB_PG_FLAGS_STAR_EXCL BIT(3)
+#define MDB_PG_FLAGS_BLOCKED BIT(4)
+#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5)
#define PG_SRC_ENT_LIMIT 32
@@ -483,6 +484,7 @@ enum net_bridge_opts {
BROPT_VLAN_BRIDGE_BINDING,
BROPT_MCAST_VLAN_SNOOPING_ENABLED,
BROPT_MST_ENABLED,
+ BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
};
struct net_bridge {
@@ -1003,6 +1005,8 @@ int br_mdb_hash_init(struct net_bridge *br);
void br_mdb_hash_fini(struct net_bridge *br);
void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
struct net_bridge_port_group *pg, int type);
+void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg);
void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
int type);
void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
@@ -1052,6 +1056,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port,
struct net_bridge_vlan *vlan,
struct net_bridge_mcast_port *pmctx);
void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx);
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state);
void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
struct netlink_ext_ack *extack);
@@ -1343,6 +1348,22 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx)
return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx));
}
+
+static inline void
+br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p,
+ bool offloaded)
+{
+ p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED);
+ p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD :
+ MDB_PG_FLAGS_OFFLOAD_FAILED);
+}
+
+static inline bool
+br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags)
+{
+ return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) &&
+ (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED);
+}
#else
static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
struct net_bridge_mcast_port **pmctx,
@@ -1502,6 +1523,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm
{
}
+static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v,
+ u8 state)
+{
+}
+
static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan,
bool on)
{
@@ -1862,7 +1888,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
const struct net_bridge_vlan *v_opts);
-/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */
+/* vlan state manipulation helpers using *_ONCE to annotate lock-free access,
+ * while br_vlan_set_state() may access data protected by multicast_lock.
+ */
static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
{
return READ_ONCE(v->state);
@@ -1871,6 +1899,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state)
{
WRITE_ONCE(v->state, state);
+ br_multicast_update_vlan_mcast_ctx(v, state);
}
static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg)
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 7b41ee8740cb..95d7355a0407 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -504,9 +504,10 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri
struct net_bridge_mdb_entry *mp;
struct net_bridge_port *port = data->port;
struct net_bridge *br = port->br;
+ u8 old_flags;
- if (err)
- goto err;
+ if (err == -EOPNOTSUPP)
+ goto out_free;
spin_lock_bh(&br->multicast_lock);
mp = br_mdb_ip_get(br, &data->ip);
@@ -516,11 +517,15 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri
pp = &p->next) {
if (p->key.port != port)
continue;
- p->flags |= MDB_PG_FLAGS_OFFLOAD;
+
+ old_flags = p->flags;
+ br_multicast_set_pg_offload_flags(p, !err);
+ if (br_mdb_should_notify(br, old_flags ^ p->flags))
+ br_mdb_flag_change_notify(br->dev, mp, p);
}
out:
spin_unlock_bh(&br->multicast_lock);
-err:
+out_free:
kfree(priv);
}
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 816bb0fde718..6482de4d8750 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -60,19 +60,19 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
struct ip_fraglist_iter iter;
struct sk_buff *frag;
- if (first_len - hlen > mtu ||
- skb_headroom(skb) < ll_rs)
+ if (first_len - hlen > mtu)
goto blackhole;
- if (skb_cloned(skb))
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < ll_rs)
goto slow_path;
skb_walk_frags(skb, frag) {
- if (frag->len > mtu ||
- skb_headroom(frag) < hlen + ll_rs)
+ if (frag->len > mtu)
goto blackhole;
- if (skb_shared(frag))
+ if (skb_shared(frag) ||
+ skb_headroom(frag) < hlen + ll_rs)
goto slow_path;
}
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f0693707aece..94cc4705e91d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -52,6 +52,7 @@
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -61,7 +62,8 @@
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
-#include <crypto/hash.h>
+
+#include "devmem.h"
/*
* Is a socket 'connection oriented' ?
@@ -163,8 +165,7 @@ done:
return skb;
}
-struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
- struct sk_buff_head *queue,
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
unsigned int flags,
int *off, int *err,
struct sk_buff **last)
@@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ skb = __skb_try_recv_from_queue(queue, flags, off, &error,
last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
@@ -482,41 +483,37 @@ short_copy:
return 0;
}
-static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
- struct iov_iter *i)
+#ifdef CONFIG_NET_CRC32C
+static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
+ void *_crcp, struct iov_iter *i)
{
-#ifdef CONFIG_CRYPTO_HASH
- struct ahash_request *hash = hashp;
- struct scatterlist sg;
+ u32 *crcp = _crcp;
size_t copied;
copied = copy_to_iter(addr, bytes, i);
- sg_init_one(&sg, addr, copied);
- ahash_request_set_crypt(hash, &sg, NULL, copied);
- crypto_ahash_update(hash);
+ *crcp = crc32c(*crcp, addr, copied);
return copied;
-#else
- return 0;
-#endif
}
/**
- * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
- * and update a hash.
+ * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
+ * and update a CRC32C value.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
- * @hash: hash request to update
+ * @crcp: pointer to CRC32C value to update
+ *
+ * Return: 0 on success, -EFAULT if there was a fault during copy.
*/
-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len,
- struct ahash_request *hash)
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, u32 *crcp)
{
return __skb_datagram_iter(skb, offset, to, len, true,
- hash_and_copy_to_iter, hash);
+ crc32c_and_copy_to_iter, crcp);
}
-EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
+#endif /* CONFIG_NET_CRC32C */
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
void *data __always_unused, struct iov_iter *i)
@@ -692,9 +689,50 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
return 0;
}
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC &&
+ iov_iter_type(from) != ITER_UBUF)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
+ }
+
+ return 0;
+}
+
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
- size_t length)
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
{
unsigned long orig_size = skb->truesize;
unsigned long truesize;
@@ -702,6 +740,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
if (msg && msg->msg_ubuf && msg->sg_from_iter)
ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
else
ret = zerocopy_fill_skb_from_iter(skb, from, length);
@@ -735,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0d891634c692..2b514d95c528 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -462,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#ifdef CONFIG_LOCKDEP
/*
@@ -828,7 +830,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
dev_hold(dev);
rcu_read_unlock();
- dev = __netdev_put_lock(dev);
+ dev = __netdev_put_lock(dev, net);
if (!dev)
return NULL;
@@ -1039,10 +1041,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)
* This helper is intended for locking net_device after it has been looked up
* using a lockless lookup helper. Lock prevents the instance from going away.
*/
-struct net_device *__netdev_put_lock(struct net_device *dev)
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
{
netdev_lock(dev);
- if (dev->reg_state > NETREG_REGISTERED) {
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
netdev_unlock(dev);
dev_put(dev);
return NULL;
@@ -1051,6 +1054,20 @@ struct net_device *__netdev_put_lock(struct net_device *dev)
return dev;
}
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
/**
* netdev_get_by_index_lock() - find a device by its ifindex
* @net: the applicable net namespace
@@ -1070,7 +1087,19 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
if (!dev)
return NULL;
- return __netdev_put_lock(dev);
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
}
struct net_device *
@@ -1090,7 +1119,32 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev,
dev_hold(dev);
rcu_read_unlock();
- dev = __netdev_put_lock(dev);
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
if (dev)
return dev;
@@ -3542,9 +3596,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+#ifdef CONFIG_NET_CRC32C
int skb_crc32c_csum_help(struct sk_buff *skb)
{
- __le32 crc32c_csum;
+ u32 crc;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -3572,15 +3627,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
if (ret)
goto out;
- crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
- skb->len - start, ~(__u32)0,
- crc32c_csum_stub));
- *(__le32 *)(skb->data + offset) = crc32c_csum;
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
skb_reset_csum_not_inet(skb);
out:
return ret;
}
EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -3844,12 +3898,42 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
- if (!skb_frags_readable(skb))
- goto out_kfree_skb;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
@@ -4731,6 +4815,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
@@ -4946,7 +5031,8 @@ static void rps_trigger_softirq(void *data)
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
- sd->received_rps++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}
#endif /* CONFIG_RPS */
@@ -5031,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
- new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
@@ -5042,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
- fl->count++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock();
return true;
}
@@ -5238,7 +5325,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
struct sk_buff *skb = *pskb;
int err, hroom, troom;
- if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
return 0;
/* In case we have to go down the path and also linearize,
@@ -7387,9 +7477,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
work = __napi_poll(n, &do_repoll);
- if (do_repoll)
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
list_add_tail(&n->poll_list, repoll);
-
+ }
netpoll_poll_unlock(have);
return work;
@@ -7515,7 +7610,8 @@ start:
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
- sd->time_squeeze++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
@@ -9188,8 +9284,16 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
@@ -9565,7 +9669,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);
-int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -9573,15 +9677,15 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ err = dev_pre_changeaddr_notify(dev, ss->__data, extack);
if (err)
return err;
- if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
- err = ops->ndo_set_mac_address(dev, sa);
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
if (err)
return err;
}
@@ -9593,6 +9697,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
DECLARE_RWSEM(dev_addr_sem);
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data_min);
@@ -11047,8 +11152,7 @@ int register_netdevice(struct net_device *dev)
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
@@ -11971,8 +12075,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0,
portid, nlh);
@@ -12146,7 +12249,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
- netdev_unlock_ops(dev);
+
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
synchronize_net();
@@ -12184,7 +12291,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
move_netdevice_notifiers_dev_net(dev, net);
/* Actually switch the network namespace */
+ netdev_lock(dev);
dev_net_set(dev, net);
+ netdev_unlock(dev);
dev->ifindex = new_ifindex;
if (new_name[0]) {
@@ -12210,7 +12319,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
- netdev_lock_ops(dev);
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
/* Add the device back in the hashes */
list_netdevice(dev);
/* Notify protocols, that a new device appeared. */
@@ -12621,7 +12734,7 @@ static int net_page_pool_create(int cpuid)
return err;
}
- per_cpu(system_page_pool, cpuid) = pp_ptr;
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
return 0;
}
@@ -12751,13 +12864,13 @@ out:
for_each_possible_cpu(i) {
struct page_pool *pp_ptr;
- pp_ptr = per_cpu(system_page_pool, i);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
if (!pp_ptr)
continue;
xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
- per_cpu(system_page_pool, i) = NULL;
+ per_cpu(system_page_pool.pool, i) = NULL;
}
}
diff --git a/net/core/dev.h b/net/core/dev.h
index 7ee203395d8e..e93f36b7ddf3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -15,8 +15,9 @@ struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
struct sd_flow_limit {
- u64 count;
- unsigned int num_buckets;
+ struct rcu_head rcu;
+ unsigned int count;
+ u8 log_buckets;
unsigned int history_head;
u16 history[FLOW_LIMIT_HISTORY];
u8 buckets[];
@@ -29,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
-struct net_device *__netdev_put_lock(struct net_device *dev);
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
unsigned long *index);
@@ -41,6 +42,21 @@ DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
(var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
ifindex++)
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
+ if (_T) netdev_unlock_ops_compat(_T));
+
+#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
+ (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \
+ &ifindex)); \
+ ifindex++)
+
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index f9a160ab596f..1bf0153195f2 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -84,14 +84,15 @@ void dev_set_group(struct net_device *dev, int new_group)
netdev_unlock_ops(dev);
}
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+int dev_set_mac_address_user(struct net_device *dev,
+ struct sockaddr_storage *ss,
struct netlink_ext_ack *extack)
{
int ret;
down_write(&dev_addr_sem);
netdev_lock_ops(dev);
- ret = netif_set_mac_address(dev, sa, extack);
+ ret = netif_set_mac_address(dev, ss, extack);
netdev_unlock_ops(dev);
up_write(&dev_addr_sem);
@@ -319,20 +320,20 @@ EXPORT_SYMBOL(dev_set_allmulti);
/**
* dev_set_mac_address() - change Media Access Control Address
* @dev: device
- * @sa: new address
+ * @ss: new address
* @extack: netlink extended ack
*
* Change the hardware (MAC) address of the device
*
* Return: 0 on success, -errno on failure.
*/
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
struct netlink_ext_ack *extack)
{
int ret;
netdev_lock_ops(dev);
- ret = netif_set_mac_address(dev, sa, extack);
+ ret = netif_set_mac_address(dev, ss, extack);
netdev_unlock_ops(dev);
return ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index fff13a8b48f1..616479e71466 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -572,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
- if (dev->addr_len > sizeof(struct sockaddr))
+ if (dev->addr_len > sizeof(ifr->ifr_hwaddr))
return -EINVAL;
- return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev,
+ (struct sockaddr_storage *)&ifr->ifr_hwaddr,
+ NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 2db428ab6b8b..b3a62ca0df65 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -16,6 +16,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
#include <trace/events/page_pool.h>
#include "devmem.h"
@@ -30,7 +31,7 @@ static const struct memory_provider_ops dmabuf_devmem_ops;
bool net_is_devmem_iov(struct net_iov *niov)
{
- return niov->pp->mp_ops == &dmabuf_devmem_ops;
+ return niov->type == NET_IOV_DMABUF;
}
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
@@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
size_t size, avail;
gen_pool_for_each_chunk(binding->chunk_pool,
@@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
kfree(binding);
}
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
unsigned long xa_idx;
unsigned int rxq_idx;
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
if (binding->list.next)
list_del(&binding->list);
@@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
net_devmem_dmabuf_binding_put(binding);
}
@@ -166,7 +176,9 @@ err_close_rxq:
}
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
{
struct net_devmem_dmabuf_binding *binding;
@@ -189,13 +201,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
}
binding->dev = dev;
-
- err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
- binding, xa_limit_32b, &id_alloc_next,
- GFP_KERNEL);
- if (err < 0)
- goto err_free_binding;
-
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
refcount_set(&binding->ref, 1);
@@ -208,26 +213,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_id;
+ goto err_free_binding;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
- DMA_FROM_DEVICE);
+ direction);
if (IS_ERR(binding->sgt)) {
err = PTR_ERR(binding->sgt);
NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
goto err_detach;
}
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
/* For simplicity we expect to make PAGE_SIZE allocations, but the
* binding can be much more flexible than that. We may be able to
* allocate MTU sized chunks here. Leave that for future work...
*/
- binding->chunk_pool =
- gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
if (!binding->chunk_pool) {
err = -ENOMEM;
- goto err_unmap;
+ goto err_tx_vec;
}
virtual = 0;
@@ -268,27 +283,38 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
for (i = 0; i < owner->area.num_niovs; i++) {
niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
niov->owner = &owner->area;
page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
}
virtual += len;
}
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
+ list_add(&binding->list, &priv->bindings);
+
return binding;
err_free_chunks:
gen_pool_for_each_chunk(binding->chunk_pool,
net_devmem_dmabuf_free_chunk_owner, NULL);
gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
err_unmap:
dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
DMA_FROM_DEVICE);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
-err_free_id:
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
kfree(binding);
err_put_dmabuf:
@@ -296,6 +322,74 @@ err_put_dmabuf:
return ERR_PTR(err);
}
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
+
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ /* The dma-addrs in this binding are only reachable by the corresponding
+ * net_device.
+ */
+ if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+ err = -ENODEV;
+ goto out_err;
+ }
+
+ return binding;
+
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
/*** "Dmabuf devmem memory provider" ***/
int mp_dmabuf_devmem_init(struct page_pool *pool)
diff --git a/net/core/devmem.h b/net/core/devmem.h
index a1aabc9685cc..e7ba77050b8f 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -11,6 +11,7 @@
#define _NET_DEVMEM_H
#include <net/netmem.h>
+#include <net/netdev_netlink.h>
struct netlink_ext_ack;
@@ -25,12 +26,20 @@ struct net_devmem_dmabuf_binding {
/* The user holds a ref (via the netlink API) for as long as they want
* the binding to remain alive. Each page pool using this binding holds
- * a ref to keep the binding alive. Each allocated net_iov holds a
- * ref.
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
*
* The binding undoes itself and unmaps the underlying dmabuf once all
* those refs are dropped and the binding is no longer desired or in
* use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
*/
refcount_t ref;
@@ -46,6 +55,14 @@ struct net_devmem_dmabuf_binding {
* active.
*/
u32 id;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ struct work_struct unbind_w;
};
#if defined(CONFIG_NET_DEVMEM)
@@ -62,14 +79,18 @@ struct dmabuf_genpool_chunk_owner {
dma_addr_t base_dma_addr;
};
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *binding,
struct netlink_ext_ack *extack);
+void net_devmem_bind_tx_release(struct sock *sk);
static inline struct dmabuf_genpool_chunk_owner *
net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
@@ -98,10 +119,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
}
-static inline void
+static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- refcount_inc(&binding->ref);
+ return refcount_inc_not_zero(&binding->ref);
}
static inline void
@@ -110,30 +131,58 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
if (!refcount_dec_and_test(&binding->ref))
return;
- __net_devmem_dmabuf_binding_free(binding);
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
}
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
#else
struct net_devmem_dmabuf_binding;
static inline void
-__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
}
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+ enum dma_data_direction direction,
+ struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
{
return ERR_PTR(-EOPNOTSUPP);
}
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
static inline void
net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
@@ -172,6 +221,25 @@ static inline bool net_is_devmem_iov(struct net_iov *niov)
{
return false;
}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
#endif
#endif /* _NET_DEVMEM_H */
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 70c634b9e7b0..93a04d18e505 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -17,6 +17,7 @@
struct dst_cache_pcpu {
unsigned long refresh_ts;
struct dst_entry *dst;
+ local_lock_t bh_lock;
u32 cookie;
union {
struct in_addr in_saddr;
@@ -65,10 +66,15 @@ fail:
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
+ struct dst_entry *dst;
+
if (!dst_cache->cache)
return NULL;
- return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get);
@@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in_saddr.s_addr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst, 0);
idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
@@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst,
rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
@@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
+ unsigned int i;
+
dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
gfp | __GFP_ZERO);
if (!dst_cache->cache)
return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
dst_cache_reset(dst_cache);
return 0;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 7af302080a66..8ca634964e36 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -874,13 +874,14 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack, bool rtnl_held)
{
struct fib_rule *rule = NULL, *r, *last = NULL;
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
int err = -EINVAL, unresolved = 0;
struct fib_rules_ops *ops = NULL;
struct nlattr *tb[FRA_MAX + 1];
bool user_priority = false;
+ struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -1002,13 +1003,14 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack, bool rtnl_held)
{
struct fib_rule *rule = NULL, *nlrule = NULL;
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
struct nlattr *tb[FRA_MAX+1];
bool user_priority = false;
+ struct fib_rule_hdr *frh;
int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -1260,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
{
struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
return -EINVAL;
}
- frh = nlmsg_data(nlh);
if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
frh->res1 || frh->res2 || frh->action || frh->flags) {
NL_SET_ERR_MSG(extack,
diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c
index 941e26c1343d..9e9fb25314b9 100644
--- a/net/core/lock_debug.c
+++ b/net/core/lock_debug.c
@@ -18,9 +18,12 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event,
/* Keep enum and don't add default to trigger -Werror=switch */
switch (cmd) {
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_assert_locked(dev);
+ fallthrough;
+ case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_UP:
- case NETDEV_CHANGE:
netdev_ops_assert_locked(dev);
fallthrough;
case NETDEV_DOWN:
@@ -58,7 +61,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event,
case NETDEV_OFFLOAD_XSTATS_DISABLE:
case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
- case NETDEV_XDP_FEAT_CHANGE:
ASSERT_RTNL();
break;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 60f27cb4e54f..f9d76d85d04f 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -149,8 +149,7 @@ int lwtunnel_build_state(struct net *net, u16 encap_type,
}
EXPORT_SYMBOL_GPL(lwtunnel_build_state);
-int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack,
- bool rtnl_is_held)
+int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
{
const struct lwtunnel_encap_ops *ops;
int ret = -EINVAL;
@@ -167,12 +166,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack,
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- if (rtnl_is_held)
- __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
- if (rtnl_is_held)
- rtnl_lock();
-
ops = rcu_access_pointer(lwtun_encaps[encap_type]);
}
}
@@ -186,8 +180,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack,
EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type);
int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
- struct netlink_ext_ack *extack,
- bool rtnl_is_held)
+ struct netlink_ext_ack *extack)
{
struct rtnexthop *rtnh = (struct rtnexthop *)attr;
struct nlattr *nla_entype;
@@ -208,9 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
}
encap_type = nla_get_u16(nla_entype);
- if (lwtunnel_valid_encap_type(encap_type,
- extack,
- rtnl_is_held) != 0)
+ if (lwtunnel_valid_encap_type(encap_type, extack))
return -EOPNOTSUPP;
}
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a07249b59ae1..a6e2c91ec3e7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1517,7 +1517,7 @@ out:
return rc;
out_kfree_skb:
rc = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
@@ -1541,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
}
return err;
}
@@ -2430,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
{
struct ndtmsg *ndtm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) {
+ ndtm = nlmsg_payload(nlh, sizeof(*ndtm));
+ if (!ndtm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
return -EINVAL;
}
- ndtm = nlmsg_data(nlh);
if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
return -EINVAL;
@@ -2747,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ndmsg *ndm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
ndm->ndm_state || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
@@ -2855,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 3e92bf0f9060..4f0f0709a1cb 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
+ /* Pairs with WRITE_ONCE() in skb_flow_limit() */
if (fl)
- flow_limit_count = fl->count;
+ flow_limit_count = READ_ONCE(fl->count);
rcu_read_unlock();
#endif
@@ -144,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- sd->processed, atomic_read(&sd->dropped),
- sd->time_squeeze, 0,
+ READ_ONCE(sd->processed), atomic_read(&sd->dropped),
+ READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count,
+ READ_ONCE(sd->received_rps), flow_limit_count,
input_qlen + process_qlen, (int)seq->index,
input_qlen, process_qlen);
return 0;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0dfdf791ece..42ee7fce3d95 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -163,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops,
}
}
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ const struct pernet_operations *saved_ops = ops;
+ LIST_HEAD(dev_kill_list);
+ struct net *net;
+
+ rtnl_lock();
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ __rtnl_net_lock(net);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ if (ops->exit_rtnl)
+ ops->exit_rtnl(net, &dev_kill_list);
+ }
+
+ __rtnl_net_unlock(net);
+ }
+
+ unregister_netdevice_many(&dev_kill_list);
+
+ rtnl_unlock();
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
- struct net *net;
if (ops->exit) {
+ struct net *net;
+
list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
cond_resched();
}
}
+
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
@@ -188,6 +217,56 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}
+static void ops_undo_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list,
+ bool expedite_rcu)
+{
+ const struct pernet_operations *saved_ops;
+ bool hold_rtnl = false;
+
+ if (!ops)
+ ops = list_entry(ops_list, typeof(*ops), list);
+
+ saved_ops = ops;
+
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ hold_rtnl |= !!ops->exit_rtnl;
+ ops_pre_exit_list(ops, net_exit_list);
+ }
+
+ /* Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so the
+ * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
+ */
+ if (expedite_rcu)
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+
+ if (hold_rtnl)
+ ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_exit_list(ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ LIST_HEAD(ops_list);
+
+ list_add(&ops->list, &ops_list);
+ ops_undo_list(&ops_list, NULL, net_exit_list, false);
+ list_del(&ops->list);
+}
+
/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
@@ -351,9 +430,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
- const struct pernet_operations *ops, *saved_ops;
+ const struct pernet_operations *ops;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
int error = 0;
preempt_disable();
@@ -376,29 +454,7 @@ out_undo:
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
- saved_ops = ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- synchronize_rcu();
-
- ops = saved_ops;
- rtnl_lock();
- list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
-
+ ops_undo_list(&pernet_list, ops, &net_exit_list, false);
rcu_barrier();
goto out;
}
@@ -594,11 +650,9 @@ struct task_struct *cleanup_net_task;
static void cleanup_net(struct work_struct *work)
{
- const struct pernet_operations *ops;
- struct net *net, *tmp, *last;
struct llist_node *net_kill_list;
+ struct net *net, *tmp, *last;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
cleanup_net_task = current;
@@ -629,33 +683,7 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
- /* Run all of the network namespace pre_exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- /*
- * Another CPU might be rcu-iterating the list, wait for it.
- * This needs to be before calling the exit() notifiers, so
- * the rcu_barrier() below isn't sufficient alone.
- * Also the pre_exit() and exit() methods need this barrier.
- */
- synchronize_rcu_expedited();
-
- rtnl_lock();
- list_for_each_entry_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- /* Run all of the network namespace exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- /* Free the net generic variables */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
+ ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
up_read(&pernet_ops_rwsem);
@@ -1239,31 +1267,13 @@ void __init net_ns_init(void)
rtnl_register_many(net_ns_rtnl_msg_handlers);
}
-static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
-{
- ops_pre_exit_list(ops, net_exit_list);
- synchronize_rcu();
-
- if (ops->exit_batch_rtnl) {
- LIST_HEAD(dev_kill_list);
-
- rtnl_lock();
- ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
- ops_exit_list(ops, net_exit_list);
-
- ops_free_list(ops, net_exit_list);
-}
-
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ LIST_HEAD(net_exit_list);
struct net *net;
int error;
- LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
if (ops->init || ops->id) {
@@ -1282,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- struct net *net;
LIST_HEAD(net_exit_list);
+ struct net *net;
- list_del(&ops->list);
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
}
#else
@@ -1318,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
} else {
LIST_HEAD(net_exit_list);
+
list_add(&init_net.exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
}
}
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 739f7b6506a6..4fc44587f493 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE
[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
};
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 17d39fd64c94..cf3fad74511f 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index a877693fecd6..2afa7b2141aa 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -38,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
u64 xdp_rx_meta = 0;
void *hdr;
+ netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */
+
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
@@ -122,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
- err = netdev_nl_dev_fill(netdev, rsp, info);
- else
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
err = -ENODEV;
+ goto err_free_msg;
+ }
- rtnl_unlock();
+ err = netdev_nl_dev_fill(netdev, rsp, info);
+ netdev_unlock(netdev);
if (err)
goto err_free_msg;
@@ -146,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
struct net *net = sock_net(skb->sk);
- struct net_device *netdev;
- int err = 0;
+ int err;
- rtnl_lock();
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
if (err < 0)
- break;
+ return err;
}
- rtnl_unlock();
- return err;
+ return 0;
}
static int
@@ -481,18 +479,15 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info),
+ ifindex);
if (netdev) {
err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
- netdev_unlock(netdev);
+ netdev_unlock_ops_compat(netdev);
} else {
err = -ENODEV;
}
- rtnl_unlock();
-
if (err)
goto err_free_msg;
@@ -541,17 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = netdev_get_by_index_lock(net, ifindex);
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
if (netdev) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
- netdev_unlock(netdev);
+ netdev_unlock_ops_compat(netdev);
} else {
err = -ENODEV;
}
} else {
- for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_ops_compat_scoped(net, netdev,
+ ctx->ifindex) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
@@ -559,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
ctx->txq_idx = 0;
}
}
- rtnl_unlock();
return err;
}
@@ -832,26 +826,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev && netdev->stat_ops) {
- err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
- info, ctx);
- } else {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (!netdev) {
NL_SET_BAD_ATTR(info->extack,
info->attrs[NETDEV_A_QSTATS_IFINDEX]);
- err = netdev ? -EOPNOTSUPP : -ENODEV;
+ return -ENODEV;
}
- } else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ if (netdev->stat_ops) {
err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
info, ctx);
- if (err < 0)
- break;
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = -EOPNOTSUPP;
}
+ netdev_unlock_ops_compat(netdev);
+ return err;
+ }
+
+ for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ if (err < 0)
+ break;
}
- rtnl_unlock();
return err;
}
@@ -908,7 +907,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+ binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+ priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock;
@@ -943,8 +943,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unbind;
}
- list_add(&binding->list, &priv->bindings);
-
nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
genlmsg_end(rsp, hdr);
@@ -969,6 +967,81 @@ err_genlmsg_free:
return err;
}
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
+ info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
INIT_LIST_HEAD(&priv->bindings);
@@ -1009,10 +1082,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb,
switch (event) {
case NETDEV_REGISTER:
+ netdev_lock_ops_to_full(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ netdev_unlock_full_to_ops(netdev);
break;
case NETDEV_UNREGISTER:
+ netdev_lock(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ netdev_unlock(netdev);
break;
case NETDEV_XDP_FEAT_CHANGE:
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index 7eadb8393e00..cd95394399b4 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -5,7 +5,7 @@
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
{
- return __netmem_clear_lsb(netmem)->pp_magic;
+ return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
@@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
+ WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+
__netmem_clear_lsb(netmem)->pp_magic = 0;
}
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
{
__netmem_clear_lsb(netmem)->pp = pool;
@@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
{
__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
}
+
+static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return 0;
+
+ magic = __netmem_clear_lsb(netmem)->pp_magic;
+
+ return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
+}
+
+static inline void netmem_set_dma_index(netmem_ref netmem,
+ unsigned long id)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return;
+
+ magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
+ __netmem_clear_lsb(netmem)->pp_magic = magic;
+}
#endif
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 7745ad924ae2..4011eb305cee 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool,
/* Driver calling page_pool_create() also call page_pool_destroy() */
refcount_set(&pool->user_cnt, 1);
- if (pool->dma_map)
- get_device(pool->p.dev);
+ xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
netdev_assert_locked(pool->slow.netdev);
@@ -320,9 +319,7 @@ free_ptr_ring:
static void page_pool_uninit(struct page_pool *pool)
{
ptr_ring_cleanup(&pool->ring, NULL);
-
- if (pool->dma_map)
- put_device(pool->p.dev);
+ xa_destroy(&pool->dma_mapped);
#ifdef CONFIG_PAGE_POOL_STATS
if (!pool->system)
@@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
netmem_ref netmem,
u32 dma_sync_size)
{
- if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
- __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
+ rcu_read_lock();
+ /* re-check under rcu_read_lock() to sync with page_pool_scrub() */
+ if (pool->dma_sync)
+ __page_pool_dma_sync_for_device(pool, netmem,
+ dma_sync_size);
+ rcu_read_unlock();
+ }
}
-static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
dma_addr_t dma;
+ int err;
+ u32 id;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
@@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
if (dma_mapping_error(pool->p.dev, dma))
return false;
- if (page_pool_set_dma_addr_netmem(netmem, dma))
+ if (page_pool_set_dma_addr_netmem(netmem, dma)) {
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
goto unmap_failed;
+ }
+ if (in_softirq())
+ err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ else
+ err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ if (err) {
+ WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ goto unset_failed;
+ }
+
+ netmem_set_dma_index(netmem, id);
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
+unset_failed:
+ page_pool_set_dma_addr_netmem(netmem, 0);
unmap_failed:
- WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
@@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
- if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
put_page(page);
return NULL;
}
@@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
*/
for (i = 0; i < nr_pages; i++) {
netmem = pool->alloc.cache[i];
- if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
put_page(netmem_to_page(netmem));
continue;
}
@@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem)
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
netmem_ref netmem)
{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
dma_addr_t dma;
if (!pool->dma_map)
@@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
*/
return;
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return;
+
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return;
+
dma = page_pool_get_dma_addr_netmem(netmem);
/* When page is unmapped, it cannot be returned to our pool */
@@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr_netmem(netmem, 0);
+ netmem_set_dma_index(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -805,6 +839,10 @@ static bool page_pool_napi_local(const struct page_pool *pool)
const struct napi_struct *napi;
u32 cpuid;
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
if (unlikely(!in_softirq()))
return false;
@@ -829,8 +867,8 @@ void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
if (!allow_direct)
allow_direct = page_pool_napi_local(pool);
- netmem =
- __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
+ netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
+ allow_direct);
if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
/* Cache full, fallback to free pages */
recycle_stat_inc(pool, ring_full);
@@ -1080,8 +1118,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
static void page_pool_scrub(struct page_pool *pool)
{
+ unsigned long id;
+ void *ptr;
+
page_pool_empty_alloc_cache_once(pool);
- pool->destroy_cnt++;
+ if (!pool->destroy_cnt++ && pool->dma_map) {
+ if (pool->dma_sync) {
+ /* Disable page_pool_dma_sync_for_device() */
+ pool->dma_sync = false;
+
+ /* Make sure all concurrent returns that may see the old
+ * value of dma_sync (and thus perform a sync) have
+ * finished before doing the unmapping below. Skip the
+ * wait if the device doesn't actually need syncing, or
+ * if there are no outstanding mapped pages.
+ */
+ if (dma_dev_need_sync(pool->p.dev) &&
+ !xa_empty(&pool->dma_mapped))
+ synchronize_net();
+ }
+
+ xa_for_each(&pool->dma_mapped, id, ptr)
+ __page_pool_release_page_dma(pool, page_to_netmem(ptr));
+ }
/* No more consumers should exist, but producers could still
* be in-flight.
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fe7fdefab994..0ebe5461d4d9 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -177,7 +177,7 @@
#define MAX_IMIX_ENTRIES 20
#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
-#define func_enter() pr_debug("entering %s\n", __func__);
+#define func_enter() pr_debug("entering %s\n", __func__)
#define PKT_FLAGS \
pf(IPV6) /* Interface in IPV6 Mode */ \
@@ -227,12 +227,12 @@ static char *pkt_flag_names[] = {
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
-#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) mutex_lock(&(t->if_lock));
-#define if_unlock(t) mutex_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock))
+#define if_unlock(t) mutex_unlock(&(t->if_lock))
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -283,7 +283,8 @@ struct pktgen_dev {
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
+ * removal by worker thread
+ */
struct page *page;
u64 delay; /* nano-seconds */
@@ -346,10 +347,12 @@ struct pktgen_dev {
__u16 udp_dst_max; /* exclusive, dest UDP port */
/* DSCP + ECN */
- __u8 tos; /* six MSB of (former) IPv4 TOS
- are for dscp codepoint */
- __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
- (see RFC 3260, sec. 4) */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ * are for dscp codepoint
+ */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ * (see RFC 3260, sec. 4)
+ */
/* IMIX */
unsigned int n_imix_entries;
@@ -389,12 +392,12 @@ struct pktgen_dev {
__u8 hh[14];
/* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
+ * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+ *
+ * We fill in SRC address later
+ * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ * 0x08, 0x00
+ * };
*/
__u16 pad; /* pad out the hh struct to an even 16 bytes */
@@ -458,7 +461,8 @@ struct pktgen_thread {
char result[512];
/* Field for thread to receive "posted" events terminate,
- stop ifs etc. */
+ * stop ifs etc.
+ */
u32 control;
int cpu;
@@ -472,8 +476,7 @@ struct pktgen_thread {
#define FIND 0
static const char version[] =
- "Packet Generator for packet performance testing. "
- "Version: " VERSION "\n";
+ "Packet Generator for packet performance testing. Version: " VERSION "\n";
static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
@@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
- " udp_src_min: %d udp_src_max: %d"
- " udp_dst_min: %d udp_dst_max: %d\n",
+ " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
pkt_dev->udp_src_min, pkt_dev->udp_src_max,
pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
@@ -754,6 +756,7 @@ static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen,
for (; i < maxlen; i++) {
int value;
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
value = hex_to_bin(c);
@@ -773,6 +776,7 @@ static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen)
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -799,6 +803,7 @@ static ssize_t num_arg(const char __user *user_buffer, size_t maxlen,
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
if ((c >= '0') && (c <= '9')) {
@@ -816,6 +821,7 @@ static ssize_t strn_len(const char __user *user_buffer, size_t maxlen)
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -974,8 +980,8 @@ static __u32 pktgen_read_flag(const char *f, bool *disable)
}
static ssize_t pktgen_if_write(struct file *file,
- const char __user * user_buffer, size_t count,
- loff_t * offset)
+ const char __user *user_buffer, size_t count,
+ loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_dev *pkt_dev = seq->private;
@@ -1307,9 +1313,9 @@ static ssize_t pktgen_if_write(struct file *file,
put_page(pkt_dev->page);
pkt_dev->page = NULL;
}
- }
- else
+ } else {
sprintf(pg_result, "ERROR: node not possible");
+ }
return count;
}
if (!strcmp(name, "xmit_mode")) {
@@ -1413,8 +1419,7 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strcpy(pkt_dev->dst_min, buf);
+ strscpy_pad(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
@@ -1434,8 +1439,7 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strcpy(pkt_dev->dst_max, buf);
+ strscpy_pad(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
@@ -1544,8 +1548,7 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strcpy(pkt_dev->src_min, buf);
+ strscpy_pad(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
@@ -1565,8 +1568,7 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strcpy(pkt_dev->src_max, buf);
+ strscpy_pad(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
@@ -1909,8 +1911,8 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user * user_buffer,
- size_t count, loff_t * offset)
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_thread *t = seq->private;
@@ -1962,6 +1964,7 @@ static ssize_t pktgen_thread_write(struct file *file,
if (!strcmp(name, "add_device")) {
char f[32];
+
memset(f, 0, 32);
max = min(sizeof(f) - 1, count - i);
len = strn_len(&user_buffer[i], max);
@@ -2397,13 +2400,14 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
/* If there was already an IPSEC SA, we keep it as is, else
* we go look for it ...
-*/
+ */
#define DUMMY_MARK 0
static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
{
#ifdef CONFIG_XFRM
struct xfrm_state *x = pkt_dev->flows[flow].x;
struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+
if (!x) {
if (pkt_dev->spi) {
@@ -2436,6 +2440,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
+
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
t = get_random_u32_inclusive(pkt_dev->queue_map_min,
pkt_dev->queue_map_max);
@@ -2517,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_MPLS_RND) {
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
@@ -2561,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
imx = ntohl(pkt_dev->saddr_max);
if (imn < imx) {
__u32 t;
+
if (pkt_dev->flags & F_IPSRC_RND)
t = get_random_u32_inclusive(imn, imx - 1);
else {
@@ -2581,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
__be32 s;
+
if (pkt_dev->flags & F_IPDST_RND) {
do {
@@ -2628,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
+
if (pkt_dev->flags & F_TXSIZE_RND) {
t = get_random_u32_inclusive(pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size - 1);
@@ -2694,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
if (!x)
return 0;
/* XXX: we dont support tunnel mode for now until
- * we resolve the dst issue */
+ * we resolve the dst issue
+ */
if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
return 0;
@@ -2729,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
if (pkt_dev->cflows) {
/* let go of the SAs if we have them */
int i;
+
for (i = 0; i < pkt_dev->cflows; i++) {
struct xfrm_state *x = pkt_dev->flows[i].x;
+
if (x) {
xfrm_state_put(x);
pkt_dev->flows[i].x = NULL;
@@ -2745,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
if (pkt_dev->flags & F_IPSEC) {
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
+
if (x) {
struct ethhdr *eth;
struct iphdr *iph;
@@ -2788,6 +2801,7 @@ err:
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
@@ -2900,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb->dev = dev;
}
} else {
- skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
/* the caller pre-fetches from skb->data and reserves for the mac hdr */
@@ -2981,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *)&eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
@@ -3210,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t)
set_pkt_overhead(pkt_dev);
- strcpy(pkt_dev->result, "Starting");
+ strscpy(pkt_dev->result, "Starting");
pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
- strcpy(pkt_dev->result, "Error starting");
+ strscpy(pkt_dev->result, "Error starting");
}
rcu_read_unlock();
if (started)
@@ -3473,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
ktime_t idle_start = ktime_get();
+
schedule();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
@@ -3788,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t,
* userspace on another CPU than the kthread. The if_lock()
* is used here to sync with concurrent instances of
* _rem_dev_from_if_list() invoked via kthread, which is also
- * updating the if_list */
+ * updating the if_list
+ */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3826,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
if (!pkt_dev)
return -ENOMEM;
- strcpy(pkt_dev->odevname, ifname);
+ strscpy(pkt_dev->odevname, ifname);
pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
sizeof(struct flow_state)),
node);
@@ -3983,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
- * with proc_create_data() */
+ * with proc_create_data()
+ */
proc_remove(pkt_dev->entry);
/* And update the thread if_list */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c5a7f41982a5..f9a35bdc58ad 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2390,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
@@ -3080,17 +3080,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
}
if (tb[IFLA_ADDRESS]) {
- struct sockaddr *sa;
- int len;
-
- len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
- sizeof(*sa));
- sa = kmalloc(len, GFP_KERNEL);
- if (!sa) {
- err = -ENOMEM;
- goto errout;
- }
- sa->sa_family = dev->type;
+ struct sockaddr_storage ss = { };
netdev_unlock_ops(dev);
@@ -3098,10 +3088,9 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
down_write(&dev_addr_sem);
netdev_lock_ops(dev);
- memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
- dev->addr_len);
- err = netif_set_mac_address(dev, sa, extack);
- kfree(sa);
+ ss.ss_family = dev->type;
+ memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len);
+ err = netif_set_mac_address(dev, &ss, extack);
if (err) {
up_write(&dev_addr_sem);
goto errout;
@@ -3580,7 +3569,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
u32 portid, const struct nlmsghdr *nlh)
{
- unsigned int old_flags;
+ unsigned int old_flags, changed;
int err;
old_flags = dev->flags;
@@ -3591,12 +3580,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
return err;
}
- if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh);
- } else {
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- __dev_notify_flags(dev, old_flags, ~0U, portid, nlh);
+ changed = old_flags ^ dev->flags;
+ if (dev->rtnl_link_initializing) {
+ dev->rtnl_link_initializing = false;
+ changed = ~0U;
}
+
+ __dev_notify_flags(dev, old_flags, changed, portid, nlh);
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
@@ -3654,7 +3644,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
- dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+ dev->rtnl_link_initializing = true;
if (tb[IFLA_MTU]) {
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
@@ -4083,7 +4073,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
struct ifinfomsg *ifm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for get link");
return -EINVAL;
}
@@ -4092,7 +4083,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
@@ -4883,12 +4873,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_flags || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
@@ -5051,12 +5041,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
@@ -5323,12 +5313,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
@@ -6220,7 +6210,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
{
struct if_stats_msg *ifsm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) {
+ ifsm = nlmsg_payload(nlh, sizeof(*ifsm));
+ if (!ifsm) {
NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
return -EINVAL;
}
@@ -6228,8 +6219,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
if (!strict_check)
return 0;
- ifsm = nlmsg_data(nlh);
-
/* only requests using strict checks can pass data to influence
* the dump. The legacy exception is filter_mask.
*/
@@ -6457,12 +6446,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
{
struct br_port_msg *bpm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
+ bpm = nlmsg_payload(nlh, sizeof(*bpm));
+ if (!bpm) {
NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
return -EINVAL;
}
- bpm = nlmsg_data(nlh);
if (bpm->ifindex) {
NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
return -EINVAL;
diff --git a/net/core/scm.c b/net/core/scm.c
index 733c0cbd393d..0225bd94170f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -404,3 +404,125 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);
+
+#ifdef CONFIG_SECURITY_NETWORK
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct lsm_context ctx;
+ int err;
+
+ if (sk->sk_scm_security) {
+ err = security_secid_to_secctx(scm->secid, &ctx);
+
+ if (err >= 0) {
+ put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
+ ctx.context);
+
+ security_release_secctx(&ctx);
+ }
+ }
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return sk->sk_scm_security;
+}
+#else
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return false;
+}
+#endif
+
+static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct file *pidfd_file = NULL;
+ int len, pidfd;
+
+ /* put_cmsg() doesn't return an error if CMSG is truncated,
+ * that's why we need to opencode these checks here.
+ */
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
+ len = sizeof(struct compat_cmsghdr) + sizeof(int);
+ else
+ len = sizeof(struct cmsghdr) + sizeof(int);
+
+ if (msg->msg_controllen < len) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+
+ if (!scm->pid)
+ return;
+
+ pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
+
+ if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+ if (pidfd_file) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+ }
+
+ return;
+ }
+
+ if (pidfd_file)
+ fd_install(pidfd, pidfd_file);
+}
+
+static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!msg->msg_control) {
+ if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
+ scm->fp || scm_has_secdata(sk))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ scm_destroy(scm);
+ return false;
+ }
+
+ if (sk->sk_scm_credentials) {
+ struct user_namespace *current_ns = current_user_ns();
+ struct ucred ucreds = {
+ .pid = scm->creds.pid,
+ .uid = from_kuid_munged(current_ns, scm->creds.uid),
+ .gid = from_kgid_munged(current_ns, scm->creds.gid),
+ };
+
+ put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+ }
+
+ scm_passec(sk, msg, scm);
+
+ if (scm->fp)
+ scm_detach_fds(msg, scm);
+
+ return true;
+}
+
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ scm_destroy_cred(scm);
+}
+EXPORT_SYMBOL(scm_recv);
+
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ if (sock->sk->sk_scm_pidfd)
+ scm_pidfd_recv(msg, scm);
+
+ scm_destroy_cred(scm);
+}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 568779d5a0ef..9a3965680451 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-
-#if IS_ENABLED(CONFIG_IP_DCCP)
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
-{
- u64 seq;
- net_secret_init();
- seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccp_sequence_number);
-
-#if IS_ENABLED(CONFIG_IPV6)
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- __be16 sport;
- __be16 dport;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- .sport = sport,
- .dport = dport
- };
- u64 seq;
- net_secret_init();
- seq = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-#endif
-#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6cbf77bc61fc..85fc82f72d26 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -64,6 +64,7 @@
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -89,6 +90,7 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"
@@ -893,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_netmem(netmem_ref netmem)
-{
- return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
-}
-
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -995,14 +992,7 @@ bool napi_pp_put_page(netmem_ref netmem)
{
netmem = netmem_compound_head(netmem);
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely(!is_pp_netmem(netmem)))
+ if (unlikely(!netmem_is_pp(netmem)))
return false;
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
@@ -1042,7 +1032,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
for (i = 0; i < shinfo->nr_frags; i++) {
head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
- if (likely(is_pp_netmem(head_netmem)))
+ if (likely(netmem_is_pp(head_netmem)))
page_pool_ref_netmem(head_netmem);
else
page_ref_inc(netmem_to_page(head_netmem));
@@ -1666,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1681,7 +1672,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
@@ -1704,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1734,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1749,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1853,7 +1845,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
int err, orig_len = skb->len;
@@ -1872,7 +1865,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return -EEXIST;
}
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -3239,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg)
+ int len, sendmsg_func sendmsg, int flags)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -3257,7 +3251,7 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_flags = MSG_DONTWAIT | flags;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3294,7 +3288,8 @@ do_frag_list:
while (slen) {
struct bio_vec bvec;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
};
bvec_set_page(&bvec, skb_frag_page(frag), slen,
@@ -3340,14 +3335,21 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
/**
@@ -3443,8 +3445,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -3455,8 +3456,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
- skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3486,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = INDIRECT_CALL_1(ops->update,
- csum_partial_ext,
- vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = INDIRECT_CALL_1(ops->combine,
- csum_block_add_ext, csum,
- csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -3513,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
- csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3528,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
@@ -3632,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
+
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
@@ -3691,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_checksum_complete);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
-
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
-
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -7317,3 +7346,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
diff --git a/net/core/sock.c b/net/core/sock.c
index 1d9466a1f54e..341979874459 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1222,15 +1222,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
return 0;
}
return -EPERM;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- return 0;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- return 0;
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- return 0;
case SO_TYPE:
case SO_PROTOCOL:
case SO_DOMAIN:
@@ -1278,6 +1269,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
return 0;
}
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
if (val < -1 || val > 1)
return -EINVAL;
if ((u8)val == SOCK_TXREHASH_DEFAULT)
@@ -1559,6 +1552,33 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
+ case SO_PASSCRED:
+ if (sk_may_scm_recv(sk))
+ sk->sk_scm_credentials = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSSEC:
+ if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
+ sk->sk_scm_security = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSPIDFD:
+ if (sk_is_unix(sk))
+ sk->sk_scm_pidfd = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (sk_is_unix(sk))
+ sk->sk_scm_rights = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
@@ -1855,11 +1875,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSCRED:
- v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ if (!sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_credentials;
break;
case SO_PASSPIDFD:
- v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_pidfd;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_rights;
break;
case SO_PEERCRED:
@@ -1964,7 +1997,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSSEC:
- v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_security;
break;
case SO_PEERSEC:
@@ -2112,6 +2148,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+
/* Paired with WRITE_ONCE() in sk_setsockopt() */
v.val = READ_ONCE(sk->sk_txrehash);
break;
@@ -2504,17 +2543,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
if (!is_charged)
RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
+
+ goto free;
}
+
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ if (bpf_sk_storage_clone(sk, newsk))
+ goto free;
/* Clear sk_user_data if parent had the pointer tagged
* as not suitable for copying when cloning.
@@ -2544,18 +2580,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
net_enable_timestamp();
out:
return newsk;
-}
-EXPORT_SYMBOL_GPL(sk_clone_lock);
-
-void sk_free_unlock_clone(struct sock *sk)
-{
+free:
/* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- sk->sk_destruct = NULL;
- bh_unlock_sock(sk);
- sk_free(sk);
+ * destructor and make plain sk_free()
+ */
+ newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
}
-EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
@@ -3032,6 +3067,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
return -EPERM;
sockc->priority = *(u32 *)CMSG_DATA(cmsg);
break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
@@ -4014,7 +4054,7 @@ static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
- if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
pr_err("PROTO_INUSE_NR exhausted\n");
return -ENOSPC;
}
@@ -4025,7 +4065,7 @@ static int assign_proto_idx(struct proto *prot)
static void release_proto_idx(struct proto *prot)
{
- if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ if (prot->inuse_idx != PROTO_INUSE_NR)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a08eed9b9142..b23594c767f2 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
- case DCCPDIAG_GETSOCK:
-
if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c7769ee0d9c5..5dbb2c6f371d 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -201,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
if (orig_sock_table) {
static_branch_dec(&rps_needed);
static_branch_dec(&rfs_needed);
- kvfree_rcu_mightsleep(orig_sock_table);
+ kvfree_rcu(orig_sock_table, rcu);
}
}
}
@@ -239,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- kfree_rcu_mightsleep(cur);
+ kfree_rcu(cur, rcu);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -248,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
ret = -ENOMEM;
goto write_unlock;
}
- cur->num_buckets = netdev_flow_limit_table_len;
+ cur->log_buckets = ilog2(netdev_flow_limit_table_len);
rcu_assign_pointer(sd->flow_limit, cur);
}
}
diff --git a/net/core/utils.c b/net/core/utils.c
index 27f4cffaae05..e47feeaa5a49 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
-bool inet_addr_is_any(struct sockaddr *addr)
+bool inet_addr_is_any(struct sockaddr_storage *addr)
{
- if (addr->sa_family == AF_INET6) {
+ if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
const struct sockaddr_in6 in6_any =
{ .sin6_addr = IN6ADDR_ANY_INIT };
@@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr)
if (!memcmp(in6->sin6_addr.s6_addr,
in6_any.sin6_addr.s6_addr, 16))
return true;
- } else if (addr->sa_family == AF_INET) {
+ } else if (addr->ss_family == AF_INET) {
struct sockaddr_in *in = (struct sockaddr_in *)addr;
if (in->sin_addr.s_addr == htonl(INADDR_ANY))
return true;
} else {
- pr_warn("unexpected address family %u\n", addr->sa_family);
+ pr_warn("unexpected address family %u\n", addr->ss_family);
}
return false;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index f86eedad586a..491334b9b8be 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -17,6 +17,7 @@
#include <net/page_pool/helpers.h>
#include <net/hotdata.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
@@ -437,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
- /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
- * as mem->type knows this a page_pool page
+ /* No need to check netmem_is_pp() as mem->type knows this a
+ * page_pool page
*/
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
napi_direct);
@@ -697,23 +698,23 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
nr_frags = xinfo->nr_frags;
for (u32 i = 0; i < nr_frags; i++) {
- u32 len = skb_frag_size(&xinfo->frags[i]);
+ const skb_frag_t *frag = &xinfo->frags[i];
+ u32 len = skb_frag_size(frag);
u32 offset, truesize = len;
- netmem_ref netmem;
+ struct page *page;
- netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize);
- if (unlikely(!netmem)) {
+ page = page_pool_dev_alloc(pp, &offset, &truesize);
+ if (unlikely(!page)) {
sinfo->nr_frags = i;
return false;
}
- memcpy(__netmem_address(netmem),
- __netmem_address(xinfo->frags[i].netmem),
+ memcpy(page_address(page) + offset, skb_frag_address(frag),
LARGEST_ALIGN(len));
- __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len);
+ __skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
tsize += truesize;
- pfmemalloc |= netmem_is_pfmemalloc(netmem);
+ pfmemalloc |= page_is_pfmemalloc(page);
}
xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
@@ -737,25 +738,27 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
*/
struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
{
- struct page_pool *pp = this_cpu_read(system_page_pool);
const struct xdp_rxq_info *rxq = xdp->rxq;
u32 len = xdp->data_end - xdp->data_meta;
u32 truesize = xdp->frame_sz;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ struct page_pool *pp;
int metalen;
void *data;
if (!IS_ENABLED(CONFIG_PAGE_POOL))
return NULL;
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
data = page_pool_dev_alloc_va(pp, &truesize);
if (unlikely(!data))
- return NULL;
+ goto out;
skb = napi_build_skb(data, truesize);
if (unlikely(!skb)) {
page_pool_free_va(pp, data, true);
- return NULL;
+ goto out;
}
skb_mark_for_recycle(skb);
@@ -774,13 +777,16 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
if (unlikely(xdp_buff_has_frags(xdp)) &&
unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
napi_consume_skb(skb, true);
- return NULL;
+ skb = NULL;
+ goto out;
}
xsk_buff_free(xdp);
skb->protocol = eth_type_trans(skb, rxq->dev);
+out:
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
return skb;
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
@@ -991,34 +997,60 @@ static int __init xdp_metadata_init(void)
}
late_initcall(xdp_metadata_init);
-void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
{
val &= NETDEV_XDP_ACT_MASK;
if (dev->xdp_features == val)
return;
+ netdev_assert_locked_or_invisible(dev);
dev->xdp_features = val;
if (dev->reg_state == NETREG_REGISTERED)
call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+ netdev_lock(dev);
+ xdp_set_features_flag_locked(dev, val);
+ netdev_unlock(dev);
+}
EXPORT_SYMBOL_GPL(xdp_set_features_flag);
-void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg)
{
xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
if (support_sg)
val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ netdev_lock(dev);
+ xdp_features_set_redirect_target_locked(dev, support_sg);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
-void xdp_features_clear_redirect_target(struct net_device *dev)
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
{
xdp_features_t val = dev->xdp_features;
val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ netdev_lock(dev);
+ xdp_features_clear_redirect_target_locked(dev);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
deleted file mode 100644
index 0c7d2f66ba27..000000000000
--- a/net/dccp/Kconfig
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menuconfig IP_DCCP
- tristate "The DCCP Protocol"
- depends on INET
- help
- Datagram Congestion Control Protocol (RFC 4340)
-
- From https://www.ietf.org/rfc/rfc4340.txt:
-
- The Datagram Congestion Control Protocol (DCCP) is a transport
- protocol that implements bidirectional, unicast connections of
- congestion-controlled, unreliable datagrams. It should be suitable
- for use by applications such as streaming media, Internet telephony,
- and on-line games.
-
- To compile this protocol support as a module, choose M here: the
- module will be called dccp.
-
- If in doubt, say N.
-
-if IP_DCCP
-
-config INET_DCCP_DIAG
- depends on INET_DIAG
- def_tristate y if (IP_DCCP = y && INET_DIAG = y)
- def_tristate m
-
-source "net/dccp/ccids/Kconfig"
-
-menu "DCCP Kernel Hacking"
- depends on DEBUG_KERNEL=y
-
-config IP_DCCP_DEBUG
- bool "DCCP debug messages"
- help
- Only use this if you're hacking DCCP.
-
- When compiling DCCP as a module, this debugging output can be toggled
- by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
-
- Just say N.
-
-
-endmenu
-
-endif # IP_DDCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
deleted file mode 100644
index 5b4ff37bc806..000000000000
--- a/net/dccp/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
- qpolicy.o
-#
-# CCID algorithms to be used by dccp.ko
-#
-# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
-dccp-y += ccids/ccid2.o ackvec.o
-dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
-dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
- ccids/lib/tfrc_equation.o \
- ccids/lib/packet_history.o \
- ccids/lib/loss_interval.o
-
-dccp_ipv4-y := ipv4.o
-
-# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
-obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
-dccp_ipv6-y := ipv6.o
-
-obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-
-dccp-$(CONFIG_SYSCTL) += sysctl.o
-
-dccp_diag-y := diag.o
-
-# build with local directory for trace.h
-CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
deleted file mode 100644
index 1cba001bb4c8..000000000000
--- a/net/dccp/ackvec.c
+++ /dev/null
@@ -1,403 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/ackvec.c
- *
- * An implementation of Ack Vectors for the DCCP protocol
- * Copyright (c) 2007 University of Aberdeen, Scotland, UK
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- */
-#include "dccp.h"
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-static struct kmem_cache *dccp_ackvec_slab;
-static struct kmem_cache *dccp_ackvec_record_slab;
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
- struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
-
- if (av != NULL) {
- av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
- INIT_LIST_HEAD(&av->av_records);
- }
- return av;
-}
-
-static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
-{
- struct dccp_ackvec_record *cur, *next;
-
- list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
- kmem_cache_free(dccp_ackvec_record_slab, cur);
- INIT_LIST_HEAD(&av->av_records);
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
- if (likely(av != NULL)) {
- dccp_ackvec_purge_records(av);
- kmem_cache_free(dccp_ackvec_slab, av);
- }
-}
-
-/**
- * dccp_ackvec_update_records - Record information about sent Ack Vectors
- * @av: Ack Vector records to update
- * @seqno: Sequence number of the packet carrying the Ack Vector just sent
- * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
- */
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
-{
- struct dccp_ackvec_record *avr;
-
- avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
- if (avr == NULL)
- return -ENOBUFS;
-
- avr->avr_ack_seqno = seqno;
- avr->avr_ack_ptr = av->av_buf_head;
- avr->avr_ack_ackno = av->av_buf_ackno;
- avr->avr_ack_nonce = nonce_sum;
- avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
- /*
- * When the buffer overflows, we keep no more than one record. This is
- * the simplest way of disambiguating sender-Acks dating from before the
- * overflow from sender-Acks which refer to after the overflow; a simple
- * solution is preferable here since we are handling an exception.
- */
- if (av->av_overflow)
- dccp_ackvec_purge_records(av);
- /*
- * Since GSS is incremented for each packet, the list is automatically
- * arranged in descending order of @ack_seqno.
- */
- list_add(&avr->avr_node, &av->av_records);
-
- dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
- (unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno,
- avr->avr_ack_runlen);
- return 0;
-}
-
-static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
- /*
- * Exploit that records are inserted in descending order of sequence
- * number, start with the oldest record first. If @ackno is `before'
- * the earliest ack_ackno, the packet is too old to be considered.
- */
- list_for_each_entry_reverse(avr, av_list, avr_node) {
- if (avr->avr_ack_seqno == ackno)
- return avr;
- if (before48(ackno, avr->avr_ack_seqno))
- break;
- }
- return NULL;
-}
-
-/*
- * Buffer index and length computation using modulo-buffersize arithmetic.
- * Note that, as pointers move from right to left, head is `before' tail.
- */
-static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
-{
- return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
-}
-
-static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
-{
- return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
-}
-
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
-{
- if (unlikely(av->av_overflow))
- return DCCPAV_MAX_ACKVEC_LEN;
- return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
-}
-
-/**
- * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
- * @av: non-empty buffer to update
- * @distance: negative or zero distance of @seqno from buf_ackno downward
- * @seqno: the (old) sequence number whose record is to be updated
- * @state: state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
- u64 seqno, enum dccp_ackvec_states state)
-{
- u16 ptr = av->av_buf_head;
-
- BUG_ON(distance > 0);
- if (unlikely(dccp_ackvec_is_empty(av)))
- return;
-
- do {
- u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
-
- if (distance + runlen >= 0) {
- /*
- * Only update the state if packet has not been received
- * yet. This is OK as per the second table in RFC 4340,
- * 11.4.1; i.e. here we are using the following table:
- * RECEIVED
- * 0 1 3
- * S +---+---+---+
- * T 0 | 0 | 0 | 0 |
- * O +---+---+---+
- * R 1 | 1 | 1 | 1 |
- * E +---+---+---+
- * D 3 | 0 | 1 | 3 |
- * +---+---+---+
- * The "Not Received" state was set by reserve_seats().
- */
- if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
- av->av_buf[ptr] = state;
- else
- dccp_pr_debug("Not changing %llu state to %u\n",
- (unsigned long long)seqno, state);
- break;
- }
-
- distance += runlen + 1;
- ptr = __ackvec_idx_add(ptr, 1);
-
- } while (ptr != av->av_buf_tail);
-}
-
-/* Mark @num entries after buf_head as "Not yet received". */
-static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
-{
- u16 start = __ackvec_idx_add(av->av_buf_head, 1),
- len = DCCPAV_MAX_ACKVEC_LEN - start;
-
- /* check for buffer wrap-around */
- if (num > len) {
- memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
- start = 0;
- num -= len;
- }
- if (num)
- memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
-}
-
-/**
- * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
- * @av: container of buffer to update (can be empty or non-empty)
- * @num_packets: number of packets to register (must be >= 1)
- * @seqno: sequence number of the first packet in @num_packets
- * @state: state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
- u64 seqno, enum dccp_ackvec_states state)
-{
- u32 num_cells = num_packets;
-
- if (num_packets > DCCPAV_BURST_THRESH) {
- u32 lost_packets = num_packets - 1;
-
- DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
- /*
- * We received 1 packet and have a loss of size "num_packets-1"
- * which we squeeze into num_cells-1 rather than reserving an
- * entire byte for each lost packet.
- * The reason is that the vector grows in O(burst_length); when
- * it grows too large there will no room left for the payload.
- * This is a trade-off: if a few packets out of the burst show
- * up later, their state will not be changed; it is simply too
- * costly to reshuffle/reallocate/copy the buffer each time.
- * Should such problems persist, we will need to switch to a
- * different underlying data structure.
- */
- for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
- u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN);
-
- av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
- av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
-
- lost_packets -= len;
- }
- }
-
- if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
- DCCP_CRIT("Ack Vector buffer overflow: dropping old entries");
- av->av_overflow = true;
- }
-
- av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
- if (av->av_overflow)
- av->av_buf_tail = av->av_buf_head;
-
- av->av_buf[av->av_buf_head] = state;
- av->av_buf_ackno = seqno;
-
- if (num_packets > 1)
- dccp_ackvec_reserve_seats(av, num_packets - 1);
-}
-
-/**
- * dccp_ackvec_input - Register incoming packet in the buffer
- * @av: Ack Vector to register packet to
- * @skb: Packet to register
- */
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
-{
- u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- enum dccp_ackvec_states state = DCCPAV_RECEIVED;
-
- if (dccp_ackvec_is_empty(av)) {
- dccp_ackvec_add_new(av, 1, seqno, state);
- av->av_tail_ackno = seqno;
-
- } else {
- s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
- u8 *current_head = av->av_buf + av->av_buf_head;
-
- if (num_packets == 1 &&
- dccp_ackvec_state(current_head) == state &&
- dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
-
- *current_head += 1;
- av->av_buf_ackno = seqno;
-
- } else if (num_packets > 0) {
- dccp_ackvec_add_new(av, num_packets, seqno, state);
- } else {
- dccp_ackvec_update_old(av, num_packets, seqno, state);
- }
- }
-}
-
-/**
- * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
- * @av: Ack Vector record to clean
- * @ackno: last Ack Vector which has been acknowledged
- *
- * This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on section A.3 of RFC 4340, here
- * are additional precautions to prevent corrupted buffer state. In particular,
- * we use tail_ackno to identify outdated records; it always marks the earliest
- * packet of group (2) in 11.4.2.
- */
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
-{
- struct dccp_ackvec_record *avr, *next;
- u8 runlen_now, eff_runlen;
- s64 delta;
-
- avr = dccp_ackvec_lookup(&av->av_records, ackno);
- if (avr == NULL)
- return;
- /*
- * Deal with outdated acknowledgments: this arises when e.g. there are
- * several old records and the acks from the peer come in slowly. In
- * that case we may still have records that pre-date tail_ackno.
- */
- delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
- if (delta < 0)
- goto free_records;
- /*
- * Deal with overlapping Ack Vectors: don't subtract more than the
- * number of packets between tail_ackno and ack_ackno.
- */
- eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
-
- runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
- /*
- * The run length of Ack Vector cells does not decrease over time. If
- * the run length is the same as at the time the Ack Vector was sent, we
- * free the ack_ptr cell. That cell can however not be freed if the run
- * length has increased: in this case we need to move the tail pointer
- * backwards (towards higher indices), to its next-oldest neighbour.
- */
- if (runlen_now > eff_runlen) {
-
- av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
- av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
-
- /* This move may not have cleared the overflow flag. */
- if (av->av_overflow)
- av->av_overflow = (av->av_buf_head == av->av_buf_tail);
- } else {
- av->av_buf_tail = avr->avr_ack_ptr;
- /*
- * We have made sure that avr points to a valid cell within the
- * buffer. This cell is either older than head, or equals head
- * (empty buffer): in both cases we no longer have any overflow.
- */
- av->av_overflow = 0;
- }
-
- /*
- * The peer has acknowledged up to and including ack_ackno. Hence the
- * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
- */
- av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
-
-free_records:
- list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
- list_del(&avr->avr_node);
- kmem_cache_free(dccp_ackvec_record_slab, avr);
- }
-}
-
-/*
- * Routines to keep track of Ack Vectors received in an skb
- */
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
-{
- struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
-
- if (new == NULL)
- return -ENOBUFS;
- new->vec = vec;
- new->len = len;
- new->nonce = nonce;
-
- list_add_tail(&new->node, head);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
-
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
-{
- struct dccp_ackvec_parsed *cur, *next;
-
- list_for_each_entry_safe(cur, next, parsed_chunks, node)
- kfree(cur);
- INIT_LIST_HEAD(parsed_chunks);
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
-
-int __init dccp_ackvec_init(void)
-{
- dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN);
- if (dccp_ackvec_slab == NULL)
- goto out_err;
-
- dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN);
- if (dccp_ackvec_record_slab == NULL)
- goto out_destroy_slab;
-
- return 0;
-
-out_destroy_slab:
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
-out_err:
- DCCP_CRIT("Unable to create Ack Vector slab cache");
- return -ENOBUFS;
-}
-
-void dccp_ackvec_exit(void)
-{
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
- kmem_cache_destroy(dccp_ackvec_record_slab);
- dccp_ackvec_record_slab = NULL;
-}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
deleted file mode 100644
index d2c4220fb377..000000000000
--- a/net/dccp/ackvec.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _ACKVEC_H
-#define _ACKVEC_H
-/*
- * net/dccp/ackvec.h
- *
- * An implementation of Ack Vectors for the DCCP protocol
- * Copyright (c) 2007 University of Aberdeen, Scotland, UK
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- */
-
-#include <linux/dccp.h>
-#include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/types.h>
-
-/*
- * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
- * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
- * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
- * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
- * The maximum value is bounded by the u16 types for indices and functions.
- */
-#define DCCPAV_NUM_ACKVECS 2
-#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
-
-/* Estimated minimum average Ack Vector length - used for updating MPS */
-#define DCCPAV_MIN_OPTLEN 16
-
-/* Threshold for coping with large bursts of losses */
-#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
-
-enum dccp_ackvec_states {
- DCCPAV_RECEIVED = 0x00,
- DCCPAV_ECN_MARKED = 0x40,
- DCCPAV_RESERVED = 0x80,
- DCCPAV_NOT_RECEIVED = 0xC0
-};
-#define DCCPAV_MAX_RUNLEN 0x3F
-
-static inline u8 dccp_ackvec_runlen(const u8 *cell)
-{
- return *cell & DCCPAV_MAX_RUNLEN;
-}
-
-static inline u8 dccp_ackvec_state(const u8 *cell)
-{
- return *cell & ~DCCPAV_MAX_RUNLEN;
-}
-
-/**
- * struct dccp_ackvec - Ack Vector main data structure
- *
- * This implements a fixed-size circular buffer within an array and is largely
- * based on Appendix A of RFC 4340.
- *
- * @av_buf: circular buffer storage area
- * @av_buf_head: head index; begin of live portion in @av_buf
- * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
- * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
- * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
- * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
- * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
- * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
- * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
- */
-struct dccp_ackvec {
- u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
- u16 av_buf_head;
- u16 av_buf_tail;
- u64 av_buf_ackno:48;
- u64 av_tail_ackno:48;
- bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
- u8 av_overflow:1;
- struct list_head av_records;
-};
-
-/**
- * struct dccp_ackvec_record - Records information about sent Ack Vectors
- *
- * These list entries define the additional information which the HC-Receiver
- * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
- *
- * @avr_node: the list node in @av_records
- * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
- * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
- * @avr_ack_ptr: pointer into @av_buf where this record starts
- * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
- * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
- *
- * The list as a whole is sorted in descending order by @avr_ack_seqno.
- */
-struct dccp_ackvec_record {
- struct list_head avr_node;
- u64 avr_ack_seqno:48;
- u64 avr_ack_ackno:48;
- u16 avr_ack_ptr;
- u8 avr_ack_runlen;
- u8 avr_ack_nonce:1;
-};
-
-int dccp_ackvec_init(void);
-void dccp_ackvec_exit(void);
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-void dccp_ackvec_free(struct dccp_ackvec *av);
-
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-
-static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
-{
- return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
-}
-
-/**
- * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
- * @vec: start of vector (offset into skb)
- * @len: length of @vec
- * @nonce: whether @vec had an ECN nonce of 0 or 1
- * @node: FIFO - arranged in descending order of ack_ackno
- *
- * This structure is used by CCIDs to access Ack Vectors in a received skb.
- */
-struct dccp_ackvec_parsed {
- u8 *vec,
- len,
- nonce:1;
- struct list_head node;
-};
-
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
-#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
deleted file mode 100644
index 6beac5d348e2..000000000000
--- a/net/dccp/ccid.c
+++ /dev/null
@@ -1,219 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/ccid.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- */
-
-#include <linux/slab.h>
-
-#include "ccid.h"
-#include "ccids/lib/tfrc.h"
-
-static struct ccid_operations *ccids[] = {
- &ccid2_ops,
-#ifdef CONFIG_IP_DCCP_CCID3
- &ccid3_ops,
-#endif
-};
-
-static struct ccid_operations *ccid_by_number(const u8 id)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++)
- if (ccids[i]->ccid_id == id)
- return ccids[i];
- return NULL;
-}
-
-/* check that up to @array_len members in @ccid_array are supported */
-bool ccid_support_check(u8 const *ccid_array, u8 array_len)
-{
- while (array_len > 0)
- if (ccid_by_number(ccid_array[--array_len]) == NULL)
- return false;
- return true;
-}
-
-/**
- * ccid_get_builtin_ccids - Populate a list of built-in CCIDs
- * @ccid_array: pointer to copy into
- * @array_len: value to return length into
- *
- * This function allocates memory - caller must see that it is freed after use.
- */
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
-{
- *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
- if (*ccid_array == NULL)
- return -ENOBUFS;
-
- for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
- (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
- return 0;
-}
-
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
-{
- u8 *ccid_array, array_len;
- int err = 0;
-
- if (ccid_get_builtin_ccids(&ccid_array, &array_len))
- return -ENOBUFS;
-
- if (put_user(array_len, optlen))
- err = -EFAULT;
- else if (len > 0 && copy_to_user(optval, ccid_array,
- len > array_len ? array_len : len))
- err = -EFAULT;
-
- kfree(ccid_array);
- return err;
-}
-
-static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
-{
- struct kmem_cache *slab;
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
- va_end(args);
-
- slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL);
- return slab;
-}
-
-static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
-{
- kmem_cache_destroy(slab);
-}
-
-static int __init ccid_activate(struct ccid_operations *ccid_ops)
-{
- int err = -ENOBUFS;
-
- ccid_ops->ccid_hc_rx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
- ccid_ops->ccid_hc_rx_slab_name,
- "ccid%u_hc_rx_sock",
- ccid_ops->ccid_id);
- if (ccid_ops->ccid_hc_rx_slab == NULL)
- goto out;
-
- ccid_ops->ccid_hc_tx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
- ccid_ops->ccid_hc_tx_slab_name,
- "ccid%u_hc_tx_sock",
- ccid_ops->ccid_id);
- if (ccid_ops->ccid_hc_tx_slab == NULL)
- goto out_free_rx_slab;
-
- pr_info("DCCP: Activated CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
- err = 0;
-out:
- return err;
-out_free_rx_slab:
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
- goto out;
-}
-
-static void ccid_deactivate(struct ccid_operations *ccid_ops)
-{
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
- ccid_ops->ccid_hc_tx_slab = NULL;
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
-
- pr_info("DCCP: Deactivated CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
-}
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
-{
- struct ccid_operations *ccid_ops = ccid_by_number(id);
- struct ccid *ccid = NULL;
-
- if (ccid_ops == NULL)
- goto out;
-
- ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, gfp_any());
- if (ccid == NULL)
- goto out;
- ccid->ccid_ops = ccid_ops;
- if (rx) {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
- if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
- ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
- goto out_free_ccid;
- } else {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
- if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
- ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
- goto out_free_ccid;
- }
-out:
- return ccid;
-out_free_ccid:
- kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, ccid);
- ccid = NULL;
- goto out;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
-{
- if (ccid != NULL) {
- if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
- ccid->ccid_ops->ccid_hc_rx_exit(sk);
- kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
- }
-}
-
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
-{
- if (ccid != NULL) {
- if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
- ccid->ccid_ops->ccid_hc_tx_exit(sk);
- kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
- }
-}
-
-int __init ccid_initialize_builtins(void)
-{
- int i, err = tfrc_lib_init();
-
- if (err)
- return err;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++) {
- err = ccid_activate(ccids[i]);
- if (err)
- goto unwind_registrations;
- }
- return 0;
-
-unwind_registrations:
- while(--i >= 0)
- ccid_deactivate(ccids[i]);
- tfrc_lib_exit();
- return err;
-}
-
-void ccid_cleanup_builtins(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++)
- ccid_deactivate(ccids[i]);
- tfrc_lib_exit();
-}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
deleted file mode 100644
index 105f3734dadb..000000000000
--- a/net/dccp/ccid.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _CCID_H
-#define _CCID_H
-/*
- * net/dccp/ccid.h
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- */
-
-#include <net/sock.h>
-#include <linux/compiler.h>
-#include <linux/dccp.h>
-#include <linux/list.h>
-#include <linux/module.h>
-
-/* maximum value for a CCID (RFC 4340, 19.5) */
-#define CCID_MAX 255
-#define CCID_SLAB_NAME_LENGTH 32
-
-struct tcp_info;
-
-/**
- * struct ccid_operations - Interface to Congestion-Control Infrastructure
- *
- * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
- * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
- * @ccid_name: alphabetical identifier string for @ccid_id
- * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
- * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
- *
- * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
- * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
- * @ccid_hc_rx_packet_recv: implements the HC-receiver side
- * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
- * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
- * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
- * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
- * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
- * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
- * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
- */
-struct ccid_operations {
- unsigned char ccid_id;
- __u32 ccid_ccmps;
- const char *ccid_name;
- struct kmem_cache *ccid_hc_rx_slab,
- *ccid_hc_tx_slab;
- char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
- char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
- __u32 ccid_hc_rx_obj_size,
- ccid_hc_tx_obj_size;
- /* Interface Routines */
- int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
- int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
- void (*ccid_hc_rx_exit)(struct sock *sk);
- void (*ccid_hc_tx_exit)(struct sock *sk);
- void (*ccid_hc_rx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
- u8 opt, u8 *val, u8 len);
- int (*ccid_hc_rx_insert_options)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
- u8 opt, u8 *val, u8 len);
- int (*ccid_hc_tx_send_packet)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- unsigned int len);
- void (*ccid_hc_rx_get_info)(struct sock *sk,
- struct tcp_info *info);
- void (*ccid_hc_tx_get_info)(struct sock *sk,
- struct tcp_info *info);
- int (*ccid_hc_rx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
- int (*ccid_hc_tx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
-};
-
-extern struct ccid_operations ccid2_ops;
-#ifdef CONFIG_IP_DCCP_CCID3
-extern struct ccid_operations ccid3_ops;
-#endif
-
-int ccid_initialize_builtins(void);
-void ccid_cleanup_builtins(void);
-
-struct ccid {
- struct ccid_operations *ccid_ops;
- char ccid_priv[];
-};
-
-static inline void *ccid_priv(const struct ccid *ccid)
-{
- return (void *)ccid->ccid_priv;
-}
-
-bool ccid_support_check(u8 const *ccid_array, u8 array_len);
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
- char __user *, int __user *);
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
-
-static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
-{
- struct ccid *ccid = dp->dccps_hc_rx_ccid;
-
- if (ccid == NULL || ccid->ccid_ops == NULL)
- return -1;
- return ccid->ccid_ops->ccid_id;
-}
-
-static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
-{
- struct ccid *ccid = dp->dccps_hc_tx_ccid;
-
- if (ccid == NULL || ccid->ccid_ops == NULL)
- return -1;
- return ccid->ccid_ops->ccid_id;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
-
-/*
- * Congestion control of queued data packets via CCID decision.
- *
- * The TX CCID performs its congestion-control by indicating whether and when a
- * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
- * The following modes are supported via the symbolic constants below:
- * - timer-based pacing (CCID returns a delay value in milliseconds);
- * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
- */
-
-enum ccid_dequeueing_decision {
- CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
- CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
- CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
- CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
- CCID_PACKET_ERR = 0xF0000, /* error condition */
-};
-
-static inline int ccid_packet_dequeue_eval(const int return_code)
-{
- if (return_code < 0)
- return CCID_PACKET_ERR;
- if (return_code == 0)
- return CCID_PACKET_SEND_AT_ONCE;
- if (return_code <= CCID_PACKET_DELAY_MAX)
- return CCID_PACKET_DELAY;
- return return_code;
-}
-
-static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
- return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- unsigned int len)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
-}
-
-static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
-}
-
-static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
-}
-
-/**
- * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
- * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
- * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
- * @val: value of @opt
- * @len: length of @val in bytes
- */
-static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- u8 pkt, u8 opt, u8 *val, u8 len)
-{
- if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options)
- return 0;
- return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
-}
-
-/**
- * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
- * Arguments are analogous to ccid_hc_tx_parse_options()
- */
-static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- u8 pkt, u8 opt, u8 *val, u8 len)
-{
- if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options)
- return 0;
- return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
-}
-
-static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
- return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
- return 0;
-}
-
-static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
-}
-
-static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
-}
-
-static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-
-static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
deleted file mode 100644
index e3d388c33d25..000000000000
--- a/net/dccp/ccids/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menu "DCCP CCIDs Configuration"
-
-config IP_DCCP_CCID2_DEBUG
- bool "CCID-2 debugging messages"
- help
- Enable CCID-2 specific debugging messages.
-
- The debugging output can additionally be toggled by setting the
- ccid2_debug parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3
- bool "CCID-3 (TCP-Friendly)"
- default IP_DCCP = y || IP_DCCP = m
- help
- CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
- rate-controlled congestion control mechanism. TFRC is designed to
- be reasonably fair when competing for bandwidth with TCP-like flows,
- where a flow is "reasonably fair" if its sending rate is generally
- within a factor of two of the sending rate of a TCP flow under the
- same conditions. However, TFRC has a much lower variation of
- throughput over time compared with TCP, which makes CCID-3 more
- suitable than CCID-2 for applications such streaming media where a
- relatively smooth sending rate is of importance.
-
- CCID-3 is further described in RFC 4342,
- https://www.ietf.org/rfc/rfc4342.txt
-
- The TFRC congestion control algorithms were initially described in
- RFC 5348.
-
- This text was extracted from RFC 4340 (sec. 10.2),
- https://www.ietf.org/rfc/rfc4340.txt
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3_DEBUG
- bool "CCID-3 debugging messages"
- depends on IP_DCCP_CCID3
- help
- Enable CCID-3 specific debugging messages.
-
- The debugging output can additionally be toggled by setting the
- ccid3_debug parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_TFRC_LIB
- def_bool y if IP_DCCP_CCID3
-
-config IP_DCCP_TFRC_DEBUG
- def_bool y if IP_DCCP_CCID3_DEBUG
-endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
deleted file mode 100644
index d6b30700af67..000000000000
--- a/net/dccp/ccids/ccid2.c
+++ /dev/null
@@ -1,794 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * Changes to meet Linux coding standards, and DCCP infrastructure fixes.
- *
- * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-/*
- * This implementation should follow RFC 4341
- */
-#include <linux/slab.h>
-#include "../feat.h"
-#include "ccid2.h"
-
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static bool ccid2_debug;
-#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-#else
-#define ccid2_pr_debug(format, a...)
-#endif
-
-static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
-{
- struct ccid2_seq *seqp;
- int i;
-
- /* check if we have space to preserve the pointer to the buffer */
- if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
- sizeof(struct ccid2_seq *)))
- return -ENOMEM;
-
- /* allocate buffer and initialize linked list */
- seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq),
- gfp_any());
- if (seqp == NULL)
- return -ENOMEM;
-
- for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
- seqp[i].ccid2s_next = &seqp[i + 1];
- seqp[i + 1].ccid2s_prev = &seqp[i];
- }
- seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
- seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-
- /* This is the first allocation. Initiate the head and tail. */
- if (hc->tx_seqbufc == 0)
- hc->tx_seqh = hc->tx_seqt = seqp;
- else {
- /* link the existing list with the one we just created */
- hc->tx_seqh->ccid2s_next = seqp;
- seqp->ccid2s_prev = hc->tx_seqh;
-
- hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
- seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
- }
-
- /* store the original pointer to the buffer so we can free it */
- hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
- hc->tx_seqbufc++;
-
- return 0;
-}
-
-static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
- return CCID_PACKET_WILL_DEQUEUE_LATER;
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
-{
- u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
-
- /*
- * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
- * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
- * acceptable since this causes starvation/deadlock whenever cwnd < 2.
- * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
- */
- if (val == 0 || val > max_ratio) {
- DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
- val = max_ratio;
- }
- dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
- min_t(u32, val, DCCPF_ACK_RATIO_MAX));
-}
-
-static void ccid2_check_l_ack_ratio(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- /*
- * After a loss, idle period, application limited period, or RTO we
- * need to check that the ack ratio is still less than the congestion
- * window. Otherwise, we will send an entire congestion window of
- * packets and got no response because we haven't sent ack ratio
- * packets yet.
- * If the ack ratio does need to be reduced, we reduce it to half of
- * the congestion window (or 1 if that's zero) instead of to the
- * congestion window. This prevents problems if one ack is lost.
- */
- if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
- ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
-}
-
-static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
-{
- dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
- clamp_val(val, DCCPF_SEQ_WMIN,
- DCCPF_SEQ_WMAX));
-}
-
-static void dccp_tasklet_schedule(struct sock *sk)
-{
- struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet;
-
- if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- sock_hold(sk);
- __tasklet_schedule(t);
- }
-}
-
-static void ccid2_hc_tx_rto_expire(struct timer_list *t)
-{
- struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
- struct sock *sk = hc->sk;
- const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
- goto out;
- }
-
- ccid2_pr_debug("RTO_EXPIRE\n");
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- /* back-off timer */
- hc->tx_rto <<= 1;
- if (hc->tx_rto > DCCP_RTO_MAX)
- hc->tx_rto = DCCP_RTO_MAX;
-
- /* adjust pipe, cwnd etc */
- hc->tx_ssthresh = hc->tx_cwnd / 2;
- if (hc->tx_ssthresh < 2)
- hc->tx_ssthresh = 2;
- hc->tx_cwnd = 1;
- hc->tx_pipe = 0;
-
- /* clear state about stuff we sent */
- hc->tx_seqt = hc->tx_seqh;
- hc->tx_packets_acked = 0;
-
- /* clear ack ratio state. */
- hc->tx_rpseq = 0;
- hc->tx_rpdupack = -1;
- ccid2_change_l_ack_ratio(sk, 1);
-
- /* if we were blocked before, we may now send cwnd=1 packet */
- if (sender_was_blocked)
- dccp_tasklet_schedule(sk);
- /* restart backed-off timer */
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/*
- * Congestion window validation (RFC 2861).
- */
-static bool ccid2_do_cwv = true;
-module_param(ccid2_do_cwv, bool, 0644);
-MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
-
-/**
- * ccid2_update_used_window - Track how much of cwnd is actually used
- * @hc: socket to update window
- * @new_wnd: new window values to add into the filter
- *
- * This is done in addition to CWV. The sender needs to have an idea of how many
- * packets may be in flight, to set the local Sequence Window value accordingly
- * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
- * maximum-used window. We use an EWMA low-pass filter to filter out noise.
- */
-static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
-{
- hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
-}
-
-/* This borrows the code of tcp_cwnd_application_limited() */
-static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- /* don't reduce cwnd below the initial window (IW) */
- u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
- win_used = max(hc->tx_cwnd_used, init_win);
-
- if (win_used < hc->tx_cwnd) {
- hc->tx_ssthresh = max(hc->tx_ssthresh,
- (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
- hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
- }
- hc->tx_cwnd_used = 0;
- hc->tx_cwnd_stamp = now;
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-/* This borrows the code of tcp_cwnd_restart() */
-static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- u32 cwnd = hc->tx_cwnd, restart_cwnd,
- iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
- s32 delta = now - hc->tx_lsndtime;
-
- hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
-
- /* don't reduce cwnd below the initial window (IW) */
- restart_cwnd = min(cwnd, iwnd);
-
- while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd)
- cwnd >>= 1;
- hc->tx_cwnd = max(cwnd, restart_cwnd);
- hc->tx_cwnd_stamp = now;
- hc->tx_cwnd_used = 0;
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- const u32 now = ccid2_jiffies32;
- struct ccid2_seq *next;
-
- /* slow-start after idle periods (RFC 2581, RFC 2861) */
- if (ccid2_do_cwv && !hc->tx_pipe &&
- (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
- ccid2_cwnd_restart(sk, now);
-
- hc->tx_lsndtime = now;
- hc->tx_pipe += 1;
-
- /* see whether cwnd was fully used (RFC 2861), update expected window */
- if (ccid2_cwnd_network_limited(hc)) {
- ccid2_update_used_window(hc, hc->tx_cwnd);
- hc->tx_cwnd_used = 0;
- hc->tx_cwnd_stamp = now;
- } else {
- if (hc->tx_pipe > hc->tx_cwnd_used)
- hc->tx_cwnd_used = hc->tx_pipe;
-
- ccid2_update_used_window(hc, hc->tx_cwnd_used);
-
- if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
- ccid2_cwnd_application_limited(sk, now);
- }
-
- hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
- hc->tx_seqh->ccid2s_acked = 0;
- hc->tx_seqh->ccid2s_sent = now;
-
- next = hc->tx_seqh->ccid2s_next;
- /* check if we need to alloc more space */
- if (next == hc->tx_seqt) {
- if (ccid2_hc_tx_alloc_seq(hc)) {
- DCCP_CRIT("packet history - out of memory!");
- /* FIXME: find a more graceful way to bail out */
- return;
- }
- next = hc->tx_seqh->ccid2s_next;
- BUG_ON(next == hc->tx_seqt);
- }
- hc->tx_seqh = next;
-
- ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
-
- /*
- * FIXME: The code below is broken and the variables have been removed
- * from the socket struct. The `ackloss' variable was always set to 0,
- * and with arsent there are several problems:
- * (i) it doesn't just count the number of Acks, but all sent packets;
- * (ii) it is expressed in # of packets, not # of windows, so the
- * comparison below uses the wrong formula: Appendix A of RFC 4341
- * comes up with the number K = cwnd / (R^2 - R) of consecutive windows
- * of data with no lost or marked Ack packets. If arsent were the # of
- * consecutive Acks received without loss, then Ack Ratio needs to be
- * decreased by 1 when
- * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
- * where cwnd / R is the number of Acks received per window of data
- * (cf. RFC 4341, App. A). The problems are that
- * - arsent counts other packets as well;
- * - the comparison uses a formula different from RFC 4341;
- * - computing a cubic/quadratic equation each time is too complicated.
- * Hence a different algorithm is needed.
- */
-#if 0
- /* Ack Ratio. Need to maintain a concept of how many windows we sent */
- hc->tx_arsent++;
- /* We had an ack loss in this window... */
- if (hc->tx_ackloss) {
- if (hc->tx_arsent >= hc->tx_cwnd) {
- hc->tx_arsent = 0;
- hc->tx_ackloss = 0;
- }
- } else {
- /* No acks lost up to now... */
- /* decrease ack ratio if enough packets were sent */
- if (dp->dccps_l_ack_ratio > 1) {
- /* XXX don't calculate denominator each time */
- int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
- dp->dccps_l_ack_ratio;
-
- denom = hc->tx_cwnd * hc->tx_cwnd / denom;
-
- if (hc->tx_arsent >= denom) {
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
- hc->tx_arsent = 0;
- }
- } else {
- /* we can't increase ack ratio further [1] */
- hc->tx_arsent = 0; /* or maybe set it to cwnd*/
- }
- }
-#endif
-
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
- do {
- struct ccid2_seq *seqp = hc->tx_seqt;
-
- while (seqp != hc->tx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
- (unsigned long long)seqp->ccid2s_seq,
- seqp->ccid2s_acked, seqp->ccid2s_sent);
- seqp = seqp->ccid2s_next;
- }
- } while (0);
- ccid2_pr_debug("=========\n");
-#endif
-}
-
-/**
- * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
- * @sk: socket to perform estimator on
- * @mrtt: measured RTT
- *
- * This code is almost identical with TCP's tcp_rtt_estimator(), since
- * - it has a higher sampling frequency (recommended by RFC 1323),
- * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
- * - it is simple (cf. more complex proposals such as Eifel timer or research
- * which suggests that the gain should be set according to window size),
- * - in tests it was found to work well with CCID2 [gerrit].
- */
-static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- long m = mrtt ? : 1;
-
- if (hc->tx_srtt == 0) {
- /* First measurement m */
- hc->tx_srtt = m << 3;
- hc->tx_mdev = m << 1;
-
- hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
- hc->tx_rttvar = hc->tx_mdev_max;
-
- hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
- } else {
- /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
- m -= (hc->tx_srtt >> 3);
- hc->tx_srtt += m;
-
- /* Similarly, update scaled mdev with regard to |m| */
- if (m < 0) {
- m = -m;
- m -= (hc->tx_mdev >> 2);
- /*
- * This neutralises RTO increase when RTT < SRTT - mdev
- * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
- * in Linux TCP", USENIX 2002, pp. 49-62).
- */
- if (m > 0)
- m >>= 3;
- } else {
- m -= (hc->tx_mdev >> 2);
- }
- hc->tx_mdev += m;
-
- if (hc->tx_mdev > hc->tx_mdev_max) {
- hc->tx_mdev_max = hc->tx_mdev;
- if (hc->tx_mdev_max > hc->tx_rttvar)
- hc->tx_rttvar = hc->tx_mdev_max;
- }
-
- /*
- * Decay RTTVAR at most once per flight, exploiting that
- * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
- * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
- * GAR is a useful bound for FlightSize = pipe.
- * AWL is probably too low here, as it over-estimates pipe.
- */
- if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
- if (hc->tx_mdev_max < hc->tx_rttvar)
- hc->tx_rttvar -= (hc->tx_rttvar -
- hc->tx_mdev_max) >> 2;
- hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
- hc->tx_mdev_max = tcp_rto_min(sk);
- }
- }
-
- /*
- * Set RTO from SRTT and RTTVAR
- * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
- * This agrees with RFC 4341, 5:
- * "Because DCCP does not retransmit data, DCCP does not require
- * TCP's recommended minimum timeout of one second".
- */
- hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
-
- if (hc->tx_rto > DCCP_RTO_MAX)
- hc->tx_rto = DCCP_RTO_MAX;
-}
-
-static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
- unsigned int *maxincr)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
-
- if (hc->tx_cwnd < dp->dccps_l_seq_win &&
- r_seq_used < dp->dccps_r_seq_win) {
- if (hc->tx_cwnd < hc->tx_ssthresh) {
- if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
- hc->tx_cwnd += 1;
- *maxincr -= 1;
- hc->tx_packets_acked = 0;
- }
- } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
- hc->tx_cwnd += 1;
- hc->tx_packets_acked = 0;
- }
- }
-
- /*
- * Adjust the local sequence window and the ack ratio to allow about
- * 5 times the number of packets in the network (RFC 4340 7.5.2)
- */
- if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
- else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
-
- if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
- ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
- else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
- ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
-
- /*
- * FIXME: RTT is sampled several times per acknowledgment (for each
- * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
- * This causes the RTT to be over-estimated, since the older entries
- * in the Ack Vector have earlier sending times.
- * The cleanest solution is to not use the ccid2s_sent field at all
- * and instead use DCCP timestamps: requires changes in other places.
- */
- ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent);
-}
-
-static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
- ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
- return;
- }
-
- hc->tx_last_cong = ccid2_jiffies32;
-
- hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
- hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
- u8 option, u8 *optval, u8 optlen)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- switch (option) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
- option - DCCPO_ACK_VECTOR_0);
- }
- return 0;
-}
-
-static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
- struct dccp_ackvec_parsed *avp;
- u64 ackno, seqno;
- struct ccid2_seq *seqp;
- int done = 0;
- unsigned int maxincr = 0;
-
- /* check reverse path congestion */
- seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-
- /* XXX this whole "algorithm" is broken. Need to fix it to keep track
- * of the seqnos of the dupacks so that rpseq and rpdupack are correct
- * -sorbo.
- */
- /* need to bootstrap */
- if (hc->tx_rpdupack == -1) {
- hc->tx_rpdupack = 0;
- hc->tx_rpseq = seqno;
- } else {
- /* check if packet is consecutive */
- if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
- hc->tx_rpseq = seqno;
- /* it's a later packet */
- else if (after48(seqno, hc->tx_rpseq)) {
- hc->tx_rpdupack++;
-
- /* check if we got enough dupacks */
- if (hc->tx_rpdupack >= NUMDUPACK) {
- hc->tx_rpdupack = -1; /* XXX lame */
- hc->tx_rpseq = 0;
-#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
- /*
- * FIXME: Ack Congestion Control is broken; in
- * the current state instabilities occurred with
- * Ack Ratios greater than 1; causing hang-ups
- * and long RTO timeouts. This needs to be fixed
- * before opening up dynamic changes. -- gerrit
- */
- ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
-#endif
- }
- }
- }
-
- /* check forward path congestion */
- if (dccp_packet_without_ack(skb))
- return;
-
- /* still didn't send out new data packets */
- if (hc->tx_seqh == hc->tx_seqt)
- goto done;
-
- ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- if (after48(ackno, hc->tx_high_ack))
- hc->tx_high_ack = ackno;
-
- seqp = hc->tx_seqt;
- while (before48(seqp->ccid2s_seq, ackno)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hc->tx_seqh) {
- seqp = hc->tx_seqh->ccid2s_prev;
- break;
- }
- }
-
- /*
- * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
- * packets per acknowledgement. Rounding up avoids that cwnd is not
- * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
- */
- if (hc->tx_cwnd < hc->tx_ssthresh)
- maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
-
- /* go through all ack vectors */
- list_for_each_entry(avp, &hc->tx_av_chunks, node) {
- /* go through this ack vector */
- for (; avp->len--; avp->vec++) {
- u64 ackno_end_rl = SUB48(ackno,
- dccp_ackvec_runlen(avp->vec));
-
- ccid2_pr_debug("ackvec %llu |%u,%u|\n",
- (unsigned long long)ackno,
- dccp_ackvec_state(avp->vec) >> 6,
- dccp_ackvec_runlen(avp->vec));
- /* if the seqno we are analyzing is larger than the
- * current ackno, then move towards the tail of our
- * seqnos.
- */
- while (after48(seqp->ccid2s_seq, ackno)) {
- if (seqp == hc->tx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_prev;
- }
- if (done)
- break;
-
- /* check all seqnos in the range of the vector
- * run length
- */
- while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = dccp_ackvec_state(avp->vec);
-
- /* new packet received or marked */
- if (state != DCCPAV_NOT_RECEIVED &&
- !seqp->ccid2s_acked) {
- if (state == DCCPAV_ECN_MARKED)
- ccid2_congestion_event(sk,
- seqp);
- else
- ccid2_new_ack(sk, seqp,
- &maxincr);
-
- seqp->ccid2s_acked = 1;
- ccid2_pr_debug("Got ack for %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- hc->tx_pipe--;
- }
- if (seqp == hc->tx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_prev;
- }
- if (done)
- break;
-
- ackno = SUB48(ackno_end_rl, 1);
- }
- if (done)
- break;
- }
-
- /* The state about what is acked should be correct now
- * Check for NUMDUPACK
- */
- seqp = hc->tx_seqt;
- while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hc->tx_seqh) {
- seqp = hc->tx_seqh->ccid2s_prev;
- break;
- }
- }
- done = 0;
- while (1) {
- if (seqp->ccid2s_acked) {
- done++;
- if (done == NUMDUPACK)
- break;
- }
- if (seqp == hc->tx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- /* If there are at least 3 acknowledgements, anything unacknowledged
- * below the last sequence number is considered lost
- */
- if (done == NUMDUPACK) {
- struct ccid2_seq *last_acked = seqp;
-
- /* check for lost packets */
- while (1) {
- if (!seqp->ccid2s_acked) {
- ccid2_pr_debug("Packet lost: %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- /* XXX need to traverse from tail -> head in
- * order to detect multiple congestion events in
- * one ack vector.
- */
- ccid2_congestion_event(sk, seqp);
- hc->tx_pipe--;
- }
- if (seqp == hc->tx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- hc->tx_seqt = last_acked;
- }
-
- /* trim acked packets in tail */
- while (hc->tx_seqt != hc->tx_seqh) {
- if (!hc->tx_seqt->ccid2s_acked)
- break;
-
- hc->tx_seqt = hc->tx_seqt->ccid2s_next;
- }
-
- /* restart RTO timer if not all outstanding data has been acked */
- if (hc->tx_pipe == 0)
- sk_stop_timer(sk, &hc->tx_rtotimer);
- else
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-done:
- /* check if incoming Acks allow pending packets to be sent */
- if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
- dccp_tasklet_schedule(sk);
- dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
-}
-
-static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
- struct dccp_sock *dp = dccp_sk(sk);
- u32 max_ratio;
-
- /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
- hc->tx_ssthresh = ~0U;
-
- /* Use larger initial windows (RFC 4341, section 5). */
- hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
- hc->tx_expected_wnd = hc->tx_cwnd;
-
- /* Make sure that Ack Ratio is enabled and within bounds. */
- max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
- if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
- dp->dccps_l_ack_ratio = max_ratio;
-
- /* XXX init ~ to window size... */
- if (ccid2_hc_tx_alloc_seq(hc))
- return -ENOMEM;
-
- hc->tx_rto = DCCP_TIMEOUT_INIT;
- hc->tx_rpdupack = -1;
- hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32;
- hc->tx_cwnd_used = 0;
- hc->sk = sk;
- timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0);
- INIT_LIST_HEAD(&hc->tx_av_chunks);
- return 0;
-}
-
-static void ccid2_hc_tx_exit(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- int i;
-
- sk_stop_timer(sk, &hc->tx_rtotimer);
-
- for (i = 0; i < hc->tx_seqbufc; i++)
- kfree(hc->tx_seqbuf[i]);
- hc->tx_seqbufc = 0;
- dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
-}
-
-static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
-
- if (!dccp_data_packet(skb))
- return;
-
- if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
- dccp_send_ack(sk);
- hc->rx_num_data_pkts = 0;
- }
-}
-
-struct ccid_operations ccid2_ops = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "TCP-like",
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, bool, 0644);
-MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
-#endif
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
deleted file mode 100644
index 330c7b4ec001..000000000000
--- a/net/dccp/ccids/ccid2.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- */
-#ifndef _DCCP_CCID2_H_
-#define _DCCP_CCID2_H_
-
-#include <linux/timer.h>
-#include <linux/types.h>
-#include "../ccid.h"
-#include "../dccp.h"
-
-/*
- * CCID-2 timestamping faces the same issues as TCP timestamping.
- * Hence we reuse/share as much of the code as possible.
- */
-#define ccid2_jiffies32 ((u32)jiffies)
-
-/* NUMDUPACK parameter from RFC 4341, p. 6 */
-#define NUMDUPACK 3
-
-struct ccid2_seq {
- u64 ccid2s_seq;
- u32 ccid2s_sent;
- int ccid2s_acked;
- struct ccid2_seq *ccid2s_prev;
- struct ccid2_seq *ccid2s_next;
-};
-
-#define CCID2_SEQBUF_LEN 1024
-#define CCID2_SEQBUF_MAX 128
-
-/*
- * Multiple of congestion window to keep the sequence window at
- * (RFC 4340 7.5.2)
- */
-#define CCID2_WIN_CHANGE_FACTOR 5
-
-/**
- * struct ccid2_hc_tx_sock - CCID2 TX half connection
- * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @tx_srtt: smoothed RTT estimate, scaled by 2^3
- * @tx_mdev: smoothed RTT variation, scaled by 2^2
- * @tx_mdev_max: maximum of @mdev during one flight
- * @tx_rttvar: moving average/maximum of @mdev_max
- * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
- * @tx_rtt_seq: to decay RTTVAR at most once per flight
- * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
- * @tx_expected_wnd: moving average of @tx_cwnd_used
- * @tx_cwnd_stamp: to track idle periods in CWV
- * @tx_lsndtime: last time (in jiffies) a data packet was sent
- * @tx_rpseq: last consecutive seqno
- * @tx_rpdupack: dupacks since rpseq
- * @tx_av_chunks: list of Ack Vectors received on current skb
- */
-struct ccid2_hc_tx_sock {
- u32 tx_cwnd;
- u32 tx_ssthresh;
- u32 tx_pipe;
- u32 tx_packets_acked;
- struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
- int tx_seqbufc;
- struct ccid2_seq *tx_seqh;
- struct ccid2_seq *tx_seqt;
-
- /* RTT measurement: variables/principles are the same as in TCP */
- u32 tx_srtt,
- tx_mdev,
- tx_mdev_max,
- tx_rttvar,
- tx_rto;
- u64 tx_rtt_seq:48;
- struct timer_list tx_rtotimer;
- struct sock *sk;
-
- /* Congestion Window validation (optional, RFC 2861) */
- u32 tx_cwnd_used,
- tx_expected_wnd,
- tx_cwnd_stamp,
- tx_lsndtime;
-
- u64 tx_rpseq;
- int tx_rpdupack;
- u32 tx_last_cong;
- u64 tx_high_ack;
- struct list_head tx_av_chunks;
-};
-
-static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
-{
- return hc->tx_pipe >= hc->tx_cwnd;
-}
-
-/*
- * Convert RFC 3390 larger initial window into an equivalent number of packets.
- * This is based on the numbers specified in RFC 5681, 3.1.
- */
-static inline u32 rfc3390_bytes_to_packets(const u32 smss)
-{
- return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
-}
-
-/**
- * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
- * @rx_num_data_pkts: number of data packets received since last feedback
- */
-struct ccid2_hc_rx_sock {
- u32 rx_num_data_pkts;
-};
-
-static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
-}
-
-static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
-}
-#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
deleted file mode 100644
index f349d16dd8f6..000000000000
--- a/net/dccp/ccids/ccid3.c
+++ /dev/null
@@ -1,866 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see https://www.wand.net.nz/
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-#include "../dccp.h"
-#include "ccid3.h"
-
-#include <linux/unaligned.h>
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static bool ccid3_debug;
-#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
-#else
-#define ccid3_pr_debug(format, a...)
-#endif
-
-/*
- * Transmitter Half-Connection Routines
- */
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
-{
- static const char *const ccid3_state_names[] = {
- [TFRC_SSTATE_NO_SENT] = "NO_SENT",
- [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
- [TFRC_SSTATE_FBACK] = "FBACK",
- };
-
- return ccid3_state_names[state];
-}
-#endif
-
-static void ccid3_hc_tx_set_state(struct sock *sk,
- enum ccid3_hc_tx_states state)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- enum ccid3_hc_tx_states oldstate = hc->tx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
- ccid3_tx_state_name(state));
- WARN_ON(state == oldstate);
- hc->tx_state = state;
-}
-
-/*
- * Compute the initial sending rate X_init in the manner of RFC 3390:
- *
- * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
- *
- * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
- * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
- * For consistency with other parts of the code, X_init is scaled by 2^6.
- */
-static inline u64 rfc3390_initial_rate(struct sock *sk)
-{
- const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
-
- return scaled_div(w_init << 6, hc->tx_rtt);
-}
-
-/**
- * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
- * @hc: socket to have the send interval updated
- *
- * This respects the granularity of X_inst (64 * bytes/second).
- */
-static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
-{
- hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
-
- DCCP_BUG_ON(hc->tx_t_ipi == 0);
- ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
- hc->tx_s, (unsigned int)(hc->tx_x >> 6));
-}
-
-static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
-{
- u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
-
- return delta / hc->tx_rtt;
-}
-
-/**
- * ccid3_hc_tx_update_x - Update allowed sending rate X
- * @sk: socket to be updated
- * @stamp: most recent time if available - can be left NULL.
- *
- * This function tracks draft rfc3448bis, check there for latest details.
- *
- * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
- * fine-grained resolution of sending rates. This requires scaling by 2^6
- * throughout the code. Only X_calc is unscaled (in bytes/second).
- *
- */
-static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- __u64 min_rate = 2 * hc->tx_x_recv;
- const __u64 old_x = hc->tx_x;
- ktime_t now = stamp ? *stamp : ktime_get_real();
-
- /*
- * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
- * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
- * a sender is idle if it has not sent anything over a 2-RTT-period.
- * For consistency with X and X_recv, min_rate is also scaled by 2^6.
- */
- if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
- min_rate = rfc3390_initial_rate(sk);
- min_rate = max(min_rate, 2 * hc->tx_x_recv);
- }
-
- if (hc->tx_p > 0) {
-
- hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
- hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
-
- } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
-
- hc->tx_x = min(2 * hc->tx_x, min_rate);
- hc->tx_x = max(hc->tx_x,
- scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
- hc->tx_t_ld = now;
- }
-
- if (hc->tx_x != old_x) {
- ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
- "X_recv=%u\n", (unsigned int)(old_x >> 6),
- (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
- (unsigned int)(hc->tx_x_recv >> 6));
-
- ccid3_update_send_interval(hc);
- }
-}
-
-/**
- * ccid3_hc_tx_update_s - Track the mean packet size `s'
- * @hc: socket to be updated
- * @len: DCCP packet payload size in bytes
- *
- * cf. RFC 4342, 5.3 and RFC 3448, 4.1
- */
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
-{
- const u16 old_s = hc->tx_s;
-
- hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
-
- if (hc->tx_s != old_s)
- ccid3_update_send_interval(hc);
-}
-
-/*
- * Update Window Counter using the algorithm from [RFC 4342, 8.1].
- * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
- */
-static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
- ktime_t now)
-{
- u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
- quarter_rtts = (4 * delta) / hc->tx_rtt;
-
- if (quarter_rtts > 0) {
- hc->tx_t_last_win_count = now;
- hc->tx_last_win_count += min(quarter_rtts, 5U);
- hc->tx_last_win_count &= 0xF; /* mod 16 */
- }
-}
-
-static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t)
-{
- struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer);
- struct sock *sk = hc->sk;
- unsigned long t_nfb = USEC_PER_SEC / 5;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- /* XXX: set some sensible MIB */
- goto restart_timer;
- }
-
- ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
- ccid3_tx_state_name(hc->tx_state));
-
- /* Ignore and do not restart after leaving the established state */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
- goto out;
-
- /* Reset feedback state to "no feedback received" */
- if (hc->tx_state == TFRC_SSTATE_FBACK)
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
- /*
- * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
- * RTO is 0 if and only if no feedback has been received yet.
- */
- if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
-
- /* halve send rate directly */
- hc->tx_x = max(hc->tx_x / 2,
- (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
- ccid3_update_send_interval(hc);
- } else {
- /*
- * Modify the cached value of X_recv
- *
- * If (X_calc > 2 * X_recv)
- * X_recv = max(X_recv / 2, s / (2 * t_mbi));
- * Else
- * X_recv = X_calc / 4;
- *
- * Note that X_recv is scaled by 2^6 while X_calc is not
- */
- if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
- hc->tx_x_recv =
- max(hc->tx_x_recv / 2,
- (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
- else {
- hc->tx_x_recv = hc->tx_x_calc;
- hc->tx_x_recv <<= 4;
- }
- ccid3_hc_tx_update_x(sk, NULL);
- }
- ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
- (unsigned long long)hc->tx_x);
-
- /*
- * Set new timeout for the nofeedback timer.
- * See comments in packet_recv() regarding the value of t_RTO.
- */
- if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
- t_nfb = TFRC_INITIAL_TIMEOUT;
- else
- t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
-
-restart_timer:
- sk_reset_timer(sk, &hc->tx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/**
- * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
- * @sk: socket to send packet from
- * @skb: next packet candidate to send on @sk
- *
- * This function uses the convention of ccid_packet_dequeue_eval() and
- * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
- */
-static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- ktime_t now = ktime_get_real();
- s64 delay;
-
- /*
- * This function is called only for Data and DataAck packets. Sending
- * zero-sized Data(Ack)s is theoretically possible, but for congestion
- * control this case is pathological - ignore it.
- */
- if (unlikely(skb->len == 0))
- return -EBADMSG;
-
- if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
- sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
- usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
- hc->tx_last_win_count = 0;
- hc->tx_t_last_win_count = now;
-
- /* Set t_0 for initial packet */
- hc->tx_t_nom = now;
-
- hc->tx_s = skb->len;
-
- /*
- * Use initial RTT sample when available: recommended by erratum
- * to RFC 4342. This implements the initialisation procedure of
- * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
- */
- if (dp->dccps_syn_rtt) {
- ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
- hc->tx_rtt = dp->dccps_syn_rtt;
- hc->tx_x = rfc3390_initial_rate(sk);
- hc->tx_t_ld = now;
- } else {
- /*
- * Sender does not have RTT sample:
- * - set fallback RTT (RFC 4340, 3.4) since a RTT value
- * is needed in several parts (e.g. window counter);
- * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
- */
- hc->tx_rtt = DCCP_FALLBACK_RTT;
- hc->tx_x = hc->tx_s;
- hc->tx_x <<= 6;
- }
- ccid3_update_send_interval(hc);
-
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
- } else {
- delay = ktime_us_delta(hc->tx_t_nom, now);
- ccid3_pr_debug("delay=%ld\n", (long)delay);
- /*
- * Scheduling of packet transmissions (RFC 5348, 8.3)
- *
- * if (t_now > t_nom - delta)
- * // send the packet now
- * else
- * // send the packet in (t_nom - t_now) milliseconds.
- */
- if (delay >= TFRC_T_DELTA)
- return (u32)delay / USEC_PER_MSEC;
-
- ccid3_hc_tx_update_win_count(hc, now);
- }
-
- /* prepare to send now (add options etc.) */
- dp->dccps_hc_tx_insert_options = 1;
- DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
-
- /* set the nominal send time for the next following packet */
- hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-
- ccid3_hc_tx_update_s(hc, len);
-
- if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
- DCCP_CRIT("packet history - out of memory!");
-}
-
-static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct tfrc_tx_hist_entry *acked;
- ktime_t now;
- unsigned long t_nfb;
- u32 r_sample;
-
- /* we are only interested in ACKs */
- if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
- DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
- return;
- /*
- * Locate the acknowledged packet in the TX history.
- *
- * Returning "entry not found" here can for instance happen when
- * - the host has not sent out anything (e.g. a passive server),
- * - the Ack is outdated (packet with higher Ack number was received),
- * - it is a bogus Ack (for a packet not sent on this connection).
- */
- acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
- if (acked == NULL)
- return;
- /* For the sake of RTT sampling, ignore/remove all older entries */
- tfrc_tx_hist_purge(&acked->next);
-
- /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
- now = ktime_get_real();
- r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
- hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
-
- /*
- * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
- */
- if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
-
- if (hc->tx_t_rto == 0) {
- /*
- * Initial feedback packet: Larger Initial Windows (4.2)
- */
- hc->tx_x = rfc3390_initial_rate(sk);
- hc->tx_t_ld = now;
-
- ccid3_update_send_interval(hc);
-
- goto done_computing_x;
- } else if (hc->tx_p == 0) {
- /*
- * First feedback after nofeedback timer expiry (4.3)
- */
- goto done_computing_x;
- }
- }
-
- /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
- if (hc->tx_p > 0)
- hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
- ccid3_hc_tx_update_x(sk, &now);
-
-done_computing_x:
- ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
- "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
- dccp_role(sk), sk, hc->tx_rtt, r_sample,
- hc->tx_s, hc->tx_p, hc->tx_x_calc,
- (unsigned int)(hc->tx_x_recv >> 6),
- (unsigned int)(hc->tx_x >> 6));
-
- /* unschedule no feedback timer */
- sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
- /*
- * As we have calculated new ipi, delta, t_nom it is possible
- * that we now can send a packet, so wake up dccp_wait_for_ccid
- */
- sk->sk_write_space(sk);
-
- /*
- * Update timeout interval for the nofeedback timer. In order to control
- * rate halving on networks with very low RTTs (<= 1 ms), use per-route
- * tunable RTAX_RTO_MIN value as the lower bound.
- */
- hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
- USEC_PER_SEC/HZ * tcp_rto_min(sk));
- /*
- * Schedule no feedback timer to expire in
- * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
- */
- t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
-
- ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
- "expire in %lu jiffies (%luus)\n",
- dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
-
- sk_reset_timer(sk, &hc->tx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-}
-
-static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
- u8 option, u8 *optval, u8 optlen)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- __be32 opt_val;
-
- switch (option) {
- case TFRC_OPT_RECEIVE_RATE:
- case TFRC_OPT_LOSS_EVENT_RATE:
- /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
- if (packet_type == DCCP_PKT_DATA)
- break;
- if (unlikely(optlen != 4)) {
- DCCP_WARN("%s(%p), invalid len %d for %u\n",
- dccp_role(sk), sk, optlen, option);
- return -EINVAL;
- }
- opt_val = ntohl(get_unaligned((__be32 *)optval));
-
- if (option == TFRC_OPT_RECEIVE_RATE) {
- /* Receive Rate is kept in units of 64 bytes/second */
- hc->tx_x_recv = opt_val;
- hc->tx_x_recv <<= 6;
-
- ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
- dccp_role(sk), sk, opt_val);
- } else {
- /* Update the fixpoint Loss Event Rate fraction */
- hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
-
- ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk, opt_val);
- }
- }
- return 0;
-}
-
-static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
-
- hc->tx_state = TFRC_SSTATE_NO_SENT;
- hc->tx_hist = NULL;
- hc->sk = sk;
- timer_setup(&hc->tx_no_feedback_timer,
- ccid3_hc_tx_no_feedback_timer, 0);
- return 0;
-}
-
-static void ccid3_hc_tx_exit(struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-
- sk_stop_timer(sk, &hc->tx_no_feedback_timer);
- tfrc_tx_hist_purge(&hc->tx_hist);
-}
-
-static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
-{
- info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
- info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
-}
-
-static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct tfrc_tx_info tfrc;
- const void *val;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(tfrc))
- return -EINVAL;
- memset(&tfrc, 0, sizeof(tfrc));
- tfrc.tfrctx_x = hc->tx_x;
- tfrc.tfrctx_x_recv = hc->tx_x_recv;
- tfrc.tfrctx_x_calc = hc->tx_x_calc;
- tfrc.tfrctx_rtt = hc->tx_rtt;
- tfrc.tfrctx_p = hc->tx_p;
- tfrc.tfrctx_rto = hc->tx_t_rto;
- tfrc.tfrctx_ipi = hc->tx_t_ipi;
- len = sizeof(tfrc);
- val = &tfrc;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-/*
- * Receiver Half-Connection Routines
- */
-
-/* CCID3 feedback types */
-enum ccid3_fback_type {
- CCID3_FBACK_NONE = 0,
- CCID3_FBACK_INITIAL,
- CCID3_FBACK_PERIODIC,
- CCID3_FBACK_PARAM_CHANGE
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
-{
- static const char *const ccid3_rx_state_names[] = {
- [TFRC_RSTATE_NO_DATA] = "NO_DATA",
- [TFRC_RSTATE_DATA] = "DATA",
- };
-
- return ccid3_rx_state_names[state];
-}
-#endif
-
-static void ccid3_hc_rx_set_state(struct sock *sk,
- enum ccid3_hc_rx_states state)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- enum ccid3_hc_rx_states oldstate = hc->rx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
- ccid3_rx_state_name(state));
- WARN_ON(state == oldstate);
- hc->rx_state = state;
-}
-
-static void ccid3_hc_rx_send_feedback(struct sock *sk,
- const struct sk_buff *skb,
- enum ccid3_fback_type fbtype)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- ktime_t now = ktime_get();
- s64 delta = 0;
-
- switch (fbtype) {
- case CCID3_FBACK_INITIAL:
- hc->rx_x_recv = 0;
- hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
- break;
- case CCID3_FBACK_PARAM_CHANGE:
- /*
- * When parameters change (new loss or p > p_prev), we do not
- * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
- * need to reuse the previous value of X_recv. However, when
- * X_recv was 0 (due to early loss), this would kill X down to
- * s/t_mbi (i.e. one packet in 64 seconds).
- * To avoid such drastic reduction, we approximate X_recv as
- * the number of bytes since last feedback.
- * This is a safe fallback, since X is bounded above by X_calc.
- */
- if (hc->rx_x_recv > 0)
- break;
- fallthrough;
- case CCID3_FBACK_PERIODIC:
- delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
- if (delta <= 0)
- delta = 1;
- hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
- break;
- default:
- return;
- }
-
- ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta,
- hc->rx_x_recv, hc->rx_pinv);
-
- hc->rx_tstamp_last_feedback = now;
- hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
- hc->rx_bytes_recv = 0;
-
- dp->dccps_hc_rx_insert_options = 1;
- dccp_send_ack(sk);
-}
-
-static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- __be32 x_recv, pinv;
-
- if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
- return 0;
-
- if (dccp_packet_without_ack(skb))
- return 0;
-
- x_recv = htonl(hc->rx_x_recv);
- pinv = htonl(hc->rx_pinv);
-
- if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
- &pinv, sizeof(pinv)) ||
- dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
- &x_recv, sizeof(x_recv)))
- return -1;
-
- return 0;
-}
-
-/**
- * ccid3_first_li - Implements [RFC 5348, 6.3.1]
- * @sk: socket to calculate loss interval for
- *
- * Determine the length of the first loss interval via inverse lookup.
- * Assume that X_recv can be computed by the throughput equation
- * s
- * X_recv = --------
- * R * fval
- * Find some p such that f(p) = fval; return 1/p (scaled).
- */
-static u32 ccid3_first_li(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- u32 x_recv, p;
- s64 delta;
- u64 fval;
-
- if (hc->rx_rtt == 0) {
- DCCP_WARN("No RTT estimate available, using fallback RTT\n");
- hc->rx_rtt = DCCP_FALLBACK_RTT;
- }
-
- delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback);
- if (delta <= 0)
- delta = 1;
- x_recv = scaled_div32(hc->rx_bytes_recv, delta);
- if (x_recv == 0) { /* would also trigger divide-by-zero */
- DCCP_WARN("X_recv==0\n");
- if (hc->rx_x_recv == 0) {
- DCCP_BUG("stored value of X_recv is zero");
- return ~0U;
- }
- x_recv = hc->rx_x_recv;
- }
-
- fval = scaled_div(hc->rx_s, hc->rx_rtt);
- fval = scaled_div32(fval, x_recv);
- p = tfrc_calc_x_reverse_lookup(fval);
-
- ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
- "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
-
- return p == 0 ? ~0U : scaled_div(1, p);
-}
-
-static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
- const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
- const bool is_data_packet = dccp_data_packet(skb);
-
- if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
- if (is_data_packet) {
- const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
- do_feedback = CCID3_FBACK_INITIAL;
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
- hc->rx_s = payload;
- /*
- * Not necessary to update rx_bytes_recv here,
- * since X_recv = 0 for the first feedback packet (cf.
- * RFC 3448, 6.3) -- gerrit
- */
- }
- goto update_records;
- }
-
- if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
- return; /* done receiving */
-
- if (is_data_packet) {
- const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
- /*
- * Update moving-average of s and the sum of received payload bytes
- */
- hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
- hc->rx_bytes_recv += payload;
- }
-
- /*
- * Perform loss detection and handle pending losses
- */
- if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
- skb, ndp, ccid3_first_li, sk)) {
- do_feedback = CCID3_FBACK_PARAM_CHANGE;
- goto done_receiving;
- }
-
- if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
- return; /* done receiving */
-
- /*
- * Handle data packets: RTT sampling and monitoring p
- */
- if (unlikely(!is_data_packet))
- goto update_records;
-
- if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
- const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
- /*
- * Empty loss history: no loss so far, hence p stays 0.
- * Sample RTT values, since an RTT estimate is required for the
- * computation of p when the first loss occurs; RFC 3448, 6.3.1.
- */
- if (sample != 0)
- hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
-
- } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
- /*
- * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
- * has decreased (resp. p has increased), send feedback now.
- */
- do_feedback = CCID3_FBACK_PARAM_CHANGE;
- }
-
- /*
- * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
- */
- if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
- do_feedback = CCID3_FBACK_PERIODIC;
-
-update_records:
- tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
-
-done_receiving:
- if (do_feedback)
- ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
-}
-
-static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
-
- hc->rx_state = TFRC_RSTATE_NO_DATA;
- tfrc_lh_init(&hc->rx_li_hist);
- return tfrc_rx_hist_alloc(&hc->rx_hist);
-}
-
-static void ccid3_hc_rx_exit(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
-
- tfrc_rx_hist_purge(&hc->rx_hist);
- tfrc_lh_cleanup(&hc->rx_li_hist);
-}
-
-static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
-{
- info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
- info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
-}
-
-static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- struct tfrc_rx_info rx_info;
- const void *val;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_RX_INFO:
- if (len < sizeof(rx_info))
- return -EINVAL;
- rx_info.tfrcrx_x_recv = hc->rx_x_recv;
- rx_info.tfrcrx_rtt = hc->rx_rtt;
- rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
- len = sizeof(rx_info);
- val = &rx_info;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-struct ccid_operations ccid3_ops = {
- .ccid_id = DCCPC_CCID3,
- .ccid_name = "TCP-Friendly Rate Control",
- .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
- .ccid_hc_tx_init = ccid3_hc_tx_init,
- .ccid_hc_tx_exit = ccid3_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
- .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
- .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
- .ccid_hc_rx_init = ccid3_hc_rx_init,
- .ccid_hc_rx_exit = ccid3_hc_rx_exit,
- .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
- .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
- .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
- .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
- .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
- .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, bool, 0644);
-MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
-#endif
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
deleted file mode 100644
index 02e0fc9f6334..000000000000
--- a/net/dccp/ccids/ccid3.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see https://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-#ifndef _DCCP_CCID3_H_
-#define _DCCP_CCID3_H_
-
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/tfrc.h>
-#include "lib/tfrc.h"
-#include "../ccid.h"
-
-/* Two seconds as per RFC 5348, 4.2 */
-#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-
-/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
-#define TFRC_T_MBI 64
-
-/*
- * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
- * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
- * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
- * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
- * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
- */
-#if (HZ >= 500)
-# define TFRC_T_DELTA USEC_PER_MSEC
-#else
-# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
-#endif
-
-enum ccid3_options {
- TFRC_OPT_LOSS_EVENT_RATE = 192,
- TFRC_OPT_LOSS_INTERVALS = 193,
- TFRC_OPT_RECEIVE_RATE = 194,
-};
-
-/* TFRC sender states */
-enum ccid3_hc_tx_states {
- TFRC_SSTATE_NO_SENT = 1,
- TFRC_SSTATE_NO_FBACK,
- TFRC_SSTATE_FBACK,
-};
-
-/**
- * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
- * @tx_x: Current sending rate in 64 * bytes per second
- * @tx_x_recv: Receive rate in 64 * bytes per second
- * @tx_x_calc: Calculated rate in bytes per second
- * @tx_rtt: Estimate of current round trip time in usecs
- * @tx_p: Current loss event rate (0-1) scaled by 1000000
- * @tx_s: Packet size in bytes
- * @tx_t_rto: Nofeedback Timer setting in usecs
- * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @tx_state: Sender state, one of %ccid3_hc_tx_states
- * @tx_last_win_count: Last window counter sent
- * @tx_t_last_win_count: Timestamp of earliest packet
- * with last_win_count value sent
- * @tx_no_feedback_timer: Handle to no feedback timer
- * @tx_t_ld: Time last doubled during slow start
- * @tx_t_nom: Nominal send time of next packet
- * @tx_hist: Packet history
- */
-struct ccid3_hc_tx_sock {
- u64 tx_x;
- u64 tx_x_recv;
- u32 tx_x_calc;
- u32 tx_rtt;
- u32 tx_p;
- u32 tx_t_rto;
- u32 tx_t_ipi;
- u16 tx_s;
- enum ccid3_hc_tx_states tx_state:8;
- u8 tx_last_win_count;
- ktime_t tx_t_last_win_count;
- struct timer_list tx_no_feedback_timer;
- struct sock *sk;
- ktime_t tx_t_ld;
- ktime_t tx_t_nom;
- struct tfrc_tx_hist_entry *tx_hist;
-};
-
-static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
- BUG_ON(hctx == NULL);
- return hctx;
-}
-
-/* TFRC receiver states */
-enum ccid3_hc_rx_states {
- TFRC_RSTATE_NO_DATA = 1,
- TFRC_RSTATE_DATA,
-};
-
-/**
- * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
- * @rx_state: Receiver state, one of %ccid3_hc_rx_states
- * @rx_bytes_recv: Total sum of DCCP payload bytes
- * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
- * @rx_rtt: Receiver estimate of RTT
- * @rx_tstamp_last_feedback: Time at which last feedback was sent
- * @rx_hist: Packet history (loss detection + RTT sampling)
- * @rx_li_hist: Loss Interval database
- * @rx_s: Received packet size in bytes
- * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
- */
-struct ccid3_hc_rx_sock {
- u8 rx_last_counter:4;
- enum ccid3_hc_rx_states rx_state:8;
- u32 rx_bytes_recv;
- u32 rx_x_recv;
- u32 rx_rtt;
- ktime_t rx_tstamp_last_feedback;
- struct tfrc_rx_hist rx_hist;
- struct tfrc_loss_hist rx_li_hist;
- u16 rx_s;
-#define rx_pinv rx_li_hist.i_mean
-};
-
-static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
- BUG_ON(hcrx == NULL);
- return hcrx;
-}
-
-#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
deleted file mode 100644
index da95319842bb..000000000000
--- a/net/dccp/ccids/lib/loss_interval.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-#include <net/sock.h>
-#include "tfrc.h"
-
-static struct kmem_cache *tfrc_lh_slab __read_mostly;
-/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
-static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
-
-/* implements LIFO semantics on the array */
-static inline u8 LIH_INDEX(const u8 ctr)
-{
- return LIH_SIZE - 1 - (ctr % LIH_SIZE);
-}
-
-/* the `counter' index always points at the next entry to be populated */
-static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
-{
- return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL;
-}
-
-/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
-static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
-{
- BUG_ON(i >= lh->counter);
- return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
-}
-
-/*
- * On-demand allocation and de-allocation of entries
- */
-static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
-{
- if (lh->ring[LIH_INDEX(lh->counter)] == NULL)
- lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab,
- GFP_ATOMIC);
- return lh->ring[LIH_INDEX(lh->counter)];
-}
-
-void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
-{
- if (!tfrc_lh_is_initialised(lh))
- return;
-
- for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
- if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
- kmem_cache_free(tfrc_lh_slab,
- lh->ring[LIH_INDEX(lh->counter)]);
- lh->ring[LIH_INDEX(lh->counter)] = NULL;
- }
-}
-
-static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
-{
- u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
- int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
-
- if (k <= 0)
- return;
-
- for (i = 0; i <= k; i++) {
- i_i = tfrc_lh_get_interval(lh, i);
-
- if (i < k) {
- i_tot0 += i_i * tfrc_lh_weights[i];
- w_tot += tfrc_lh_weights[i];
- }
- if (i > 0)
- i_tot1 += i_i * tfrc_lh_weights[i-1];
- }
-
- lh->i_mean = max(i_tot0, i_tot1) / w_tot;
-}
-
-/**
- * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
- * @lh: histogram to update
- * @skb: received socket triggering loss interval update
- *
- * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
- */
-u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
-{
- struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
- u32 old_i_mean = lh->i_mean;
- s64 len;
-
- if (cur == NULL) /* not initialised */
- return 0;
-
- len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
-
- if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
- return 0;
-
- if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
- /*
- * Implements RFC 4342, 10.2:
- * If a packet S (skb) exists whose seqno comes `after' the one
- * starting the current loss interval (cur) and if the modulo-16
- * distance from C(cur) to C(S) is greater than 4, consider all
- * subsequent packets as belonging to a new loss interval. This
- * test is necessary since CCVal may wrap between intervals.
- */
- cur->li_is_closed = 1;
-
- if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
- return 0;
-
- cur->li_length = len;
- tfrc_lh_calc_i_mean(lh);
-
- return lh->i_mean < old_i_mean;
-}
-
-/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
-static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
- struct tfrc_rx_hist_entry *new_loss)
-{
- return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 &&
- (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
-}
-
-/**
- * tfrc_lh_interval_add - Insert new record into the Loss Interval database
- * @lh: Loss Interval database
- * @rh: Receive history containing a fresh loss event
- * @calc_first_li: Caller-dependent routine to compute length of first interval
- * @sk: Used by @calc_first_li in caller-specific way (subtyping)
- *
- * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
- */
-int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
- u32 (*calc_first_li)(struct sock *), struct sock *sk)
-{
- struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
-
- if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
- return 0;
-
- new = tfrc_lh_demand_next(lh);
- if (unlikely(new == NULL)) {
- DCCP_CRIT("Cannot allocate/add loss record.");
- return 0;
- }
-
- new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
- new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
- new->li_is_closed = 0;
-
- if (++lh->counter == 1)
- lh->i_mean = new->li_length = (*calc_first_li)(sk);
- else {
- cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
- new->li_length = dccp_delta_seqno(new->li_seqno,
- tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
- if (lh->counter > (2*LIH_SIZE))
- lh->counter -= LIH_SIZE;
-
- tfrc_lh_calc_i_mean(lh);
- }
- return 1;
-}
-
-int __init tfrc_li_init(void)
-{
- tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
- sizeof(struct tfrc_loss_interval), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_lh_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_li_exit(void)
-{
- if (tfrc_lh_slab != NULL) {
- kmem_cache_destroy(tfrc_lh_slab);
- tfrc_lh_slab = NULL;
- }
-}
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
deleted file mode 100644
index c3d95f85e43b..000000000000
--- a/net/dccp/ccids/lib/loss_interval.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _DCCP_LI_HIST_
-#define _DCCP_LI_HIST_
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-/*
- * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
- * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
- */
-#define NINTERVAL 8
-#define LIH_SIZE (NINTERVAL + 1)
-
-/**
- * tfrc_loss_interval - Loss history record for TFRC-based protocols
- * @li_seqno: Highest received seqno before the start of loss
- * @li_ccval: The CCVal belonging to @li_seqno
- * @li_is_closed: Whether @li_seqno is older than 1 RTT
- * @li_length: Loss interval sequence length
- */
-struct tfrc_loss_interval {
- u64 li_seqno:48,
- li_ccval:4,
- li_is_closed:1;
- u32 li_length;
-};
-
-/**
- * tfrc_loss_hist - Loss record database
- * @ring: Circular queue managed in LIFO manner
- * @counter: Current count of entries (can be more than %LIH_SIZE)
- * @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
- */
-struct tfrc_loss_hist {
- struct tfrc_loss_interval *ring[LIH_SIZE];
- u8 counter;
- u32 i_mean;
-};
-
-static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
-{
- memset(lh, 0, sizeof(struct tfrc_loss_hist));
-}
-
-static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
-{
- return lh->counter > 0;
-}
-
-static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
-{
- return min(lh->counter, (u8)LIH_SIZE);
-}
-
-struct tfrc_rx_hist;
-
-int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
- u32 (*first_li)(struct sock *), struct sock *);
-u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
-void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
-
-#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
deleted file mode 100644
index 0cdda3c66fb5..000000000000
--- a/net/dccp/ccids/lib/packet_history.c
+++ /dev/null
@@ -1,439 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see https://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/string.h>
-#include <linux/slab.h>
-#include "packet_history.h"
-#include "../../dccp.h"
-
-/*
- * Transmitter History Routines
- */
-static struct kmem_cache *tfrc_tx_hist_slab;
-
-int __init tfrc_tx_packet_history_init(void)
-{
- tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
- sizeof(struct tfrc_tx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_tx_packet_history_exit(void)
-{
- if (tfrc_tx_hist_slab != NULL) {
- kmem_cache_destroy(tfrc_tx_hist_slab);
- tfrc_tx_hist_slab = NULL;
- }
-}
-
-int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
-{
- struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
-
- if (entry == NULL)
- return -ENOBUFS;
- entry->seqno = seqno;
- entry->stamp = ktime_get_real();
- entry->next = *headp;
- *headp = entry;
- return 0;
-}
-
-void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
-{
- struct tfrc_tx_hist_entry *head = *headp;
-
- while (head != NULL) {
- struct tfrc_tx_hist_entry *next = head->next;
-
- kmem_cache_free(tfrc_tx_hist_slab, head);
- head = next;
- }
-
- *headp = NULL;
-}
-
-/*
- * Receiver History Routines
- */
-static struct kmem_cache *tfrc_rx_hist_slab;
-
-int __init tfrc_rx_packet_history_init(void)
-{
- tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
- sizeof(struct tfrc_rx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_rx_packet_history_exit(void)
-{
- if (tfrc_rx_hist_slab != NULL) {
- kmem_cache_destroy(tfrc_rx_hist_slab);
- tfrc_rx_hist_slab = NULL;
- }
-}
-
-static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
- const struct sk_buff *skb,
- const u64 ndp)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
-
- entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- entry->tfrchrx_ccval = dh->dccph_ccval;
- entry->tfrchrx_type = dh->dccph_type;
- entry->tfrchrx_ndp = ndp;
- entry->tfrchrx_tstamp = ktime_get_real();
-}
-
-void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
- const struct sk_buff *skb,
- const u64 ndp)
-{
- struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
-
- tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
-}
-
-/* has the packet contained in skb been seen before? */
-int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
-{
- const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq;
- int i;
-
- if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0)
- return 1;
-
- for (i = 1; i <= h->loss_count; i++)
- if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq)
- return 1;
-
- return 0;
-}
-
-static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
-{
- const u8 idx_a = tfrc_rx_hist_index(h, a),
- idx_b = tfrc_rx_hist_index(h, b);
-
- swap(h->ring[idx_a], h->ring[idx_b]);
-}
-
-/*
- * Private helper functions for loss detection.
- *
- * In the descriptions, `Si' refers to the sequence number of entry number i,
- * whose NDP count is `Ni' (lower case is used for variables).
- * Note: All __xxx_loss functions expect that a test against duplicates has been
- * performed already: the seqno of the skb must not be less than the seqno
- * of loss_prev; and it must not equal that of any valid history entry.
- */
-static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
- h->loss_count = 1;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
- }
-}
-
-static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
- h->loss_count = 2;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
- return;
- }
-
- /* S0 < S2 < S1 */
-
- if (dccp_loss_free(s0, s2, n2)) {
- u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
-
- if (dccp_loss_free(s2, s1, n1)) {
- /* hole is filled: S0, S2, and S1 are consecutive */
- h->loss_count = 0;
- h->loss_start = tfrc_rx_hist_index(h, 1);
- } else
- /* gap between S2 and S1: just update loss_prev */
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
-
- } else { /* gap between S0 and S2 */
- /*
- * Reorder history to insert S2 between S0 and S1
- */
- tfrc_rx_hist_swap(h, 0, 3);
- h->loss_start = tfrc_rx_hist_index(h, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
- h->loss_count = 2;
- }
-}
-
-/* return 1 if a new loss event has been identified */
-static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
- s3 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
- h->loss_count = 3;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
- return 1;
- }
-
- /* S3 < S2 */
-
- if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
- /*
- * Reorder history to insert S3 between S1 and S2
- */
- tfrc_rx_hist_swap(h, 2, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
- h->loss_count = 3;
- return 1;
- }
-
- /* S0 < S3 < S1 */
-
- if (dccp_loss_free(s0, s3, n3)) {
- u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
-
- if (dccp_loss_free(s3, s1, n1)) {
- /* hole between S0 and S1 filled by S3 */
- u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
-
- if (dccp_loss_free(s1, s2, n2)) {
- /* entire hole filled by S0, S3, S1, S2 */
- h->loss_start = tfrc_rx_hist_index(h, 2);
- h->loss_count = 0;
- } else {
- /* gap remains between S1 and S2 */
- h->loss_start = tfrc_rx_hist_index(h, 1);
- h->loss_count = 1;
- }
-
- } else /* gap exists between S3 and S1, loss_count stays at 2 */
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
-
- return 0;
- }
-
- /*
- * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
- * Reorder history to insert S3 between S0 and S1.
- */
- tfrc_rx_hist_swap(h, 0, 3);
- h->loss_start = tfrc_rx_hist_index(h, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
- h->loss_count = 3;
-
- return 1;
-}
-
-/* recycle RX history records to continue loss detection if necessary */
-static void __three_after_loss(struct tfrc_rx_hist *h)
-{
- /*
- * At this stage we know already that there is a gap between S0 and S1
- * (since S0 was the highest sequence number received before detecting
- * the loss). To recycle the loss record, it is thus only necessary to
- * check for other possible gaps between S1/S2 and between S2/S3.
- */
- u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
- s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
- u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
- n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
-
- if (dccp_loss_free(s1, s2, n2)) {
-
- if (dccp_loss_free(s2, s3, n3)) {
- /* no gap between S2 and S3: entire hole is filled */
- h->loss_start = tfrc_rx_hist_index(h, 3);
- h->loss_count = 0;
- } else {
- /* gap between S2 and S3 */
- h->loss_start = tfrc_rx_hist_index(h, 2);
- h->loss_count = 1;
- }
-
- } else { /* gap between S1 and S2 */
- h->loss_start = tfrc_rx_hist_index(h, 1);
- h->loss_count = 2;
- }
-}
-
-/**
- * tfrc_rx_handle_loss - Loss detection and further processing
- * @h: The non-empty RX history object
- * @lh: Loss Intervals database to update
- * @skb: Currently received packet
- * @ndp: The NDP count belonging to @skb
- * @calc_first_li: Caller-dependent computation of first loss interval in @lh
- * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
- *
- * Chooses action according to pending loss, updates LI database when a new
- * loss was detected, and does required post-processing. Returns 1 when caller
- * should send feedback, 0 otherwise.
- * Since it also takes care of reordering during loss detection and updates the
- * records accordingly, the caller should not perform any more RX history
- * operations when loss_count is greater than 0 after calling this function.
- */
-int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
- struct tfrc_loss_hist *lh,
- struct sk_buff *skb, const u64 ndp,
- u32 (*calc_first_li)(struct sock *), struct sock *sk)
-{
- int is_new_loss = 0;
-
- if (h->loss_count == 0) {
- __do_track_loss(h, skb, ndp);
- } else if (h->loss_count == 1) {
- __one_after_loss(h, skb, ndp);
- } else if (h->loss_count != 2) {
- DCCP_BUG("invalid loss_count %d", h->loss_count);
- } else if (__two_after_loss(h, skb, ndp)) {
- /*
- * Update Loss Interval database and recycle RX records
- */
- is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
- __three_after_loss(h);
- }
- return is_new_loss;
-}
-
-int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
-{
- int i;
-
- for (i = 0; i <= TFRC_NDUPACK; i++) {
- h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
- if (h->ring[i] == NULL)
- goto out_free;
- }
-
- h->loss_count = h->loss_start = 0;
- return 0;
-
-out_free:
- while (i-- != 0) {
- kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
- h->ring[i] = NULL;
- }
- return -ENOBUFS;
-}
-
-void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
-{
- int i;
-
- for (i = 0; i <= TFRC_NDUPACK; ++i)
- if (h->ring[i] != NULL) {
- kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
- h->ring[i] = NULL;
- }
-}
-
-/**
- * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
- * @h: The non-empty RX history object
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
-{
- return h->ring[0];
-}
-
-/**
- * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
- * @h: The non-empty RX history object
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
-{
- return h->ring[h->rtt_sample_prev];
-}
-
-/**
- * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
- * @h: receive histogram
- * @skb: packet containing timestamp.
- *
- * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
- * to compute a sample with given data - calling function should check this.
- */
-u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
-{
- u32 sample = 0,
- delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
-
- if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
- if (h->rtt_sample_prev == 2) { /* previous candidate stored */
- sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
- if (sample)
- sample = 4 / sample *
- ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
- else /*
- * FIXME: This condition is in principle not
- * possible but occurs when CCID is used for
- * two-way data traffic. I have tried to trace
- * it, but the cause does not seem to be here.
- */
- DCCP_BUG("please report to dccp@vger.kernel.org"
- " => prev = %u, last = %u",
- tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
- } else if (delta_v < 1) {
- h->rtt_sample_prev = 1;
- goto keep_ref_for_next_time;
- }
-
- } else if (delta_v == 4) /* optimal match */
- sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
- else { /* suboptimal match */
- h->rtt_sample_prev = 2;
- goto keep_ref_for_next_time;
- }
-
- if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
- DCCP_WARN("RTT sample %u too large, using max\n", sample);
- sample = DCCP_SANE_RTT_MAX;
- }
-
- h->rtt_sample_prev = 0; /* use current entry as next reference */
-keep_ref_for_next_time:
-
- return sample;
-}
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
deleted file mode 100644
index 159cc9326eab..000000000000
--- a/net/dccp/ccids/lib/packet_history.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Packet RX/TX history data structures and routines for TFRC-based protocols.
- *
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see https://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#ifndef _DCCP_PKT_HIST_
-#define _DCCP_PKT_HIST_
-
-#include <linux/list.h>
-#include <linux/slab.h>
-#include "tfrc.h"
-
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
- struct tfrc_tx_hist_entry *next;
- u64 seqno;
- ktime_t stamp;
-};
-
-static inline struct tfrc_tx_hist_entry *
- tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
- while (head != NULL && head->seqno != seqno)
- head = head->next;
- return head;
-}
-
-int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
-void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-
-/* Subtraction a-b modulo-16, respects circular wrap-around */
-#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
-
-/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
-#define TFRC_NDUPACK 3
-
-/**
- * tfrc_rx_hist_entry - Store information about a single received packet
- * @tfrchrx_seqno: DCCP packet sequence number
- * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
- * @tfrchrx_ndp: the NDP count (if any) of the packet
- * @tfrchrx_tstamp: actual receive time of packet
- */
-struct tfrc_rx_hist_entry {
- u64 tfrchrx_seqno:48,
- tfrchrx_ccval:4,
- tfrchrx_type:4;
- u64 tfrchrx_ndp:48;
- ktime_t tfrchrx_tstamp;
-};
-
-/**
- * tfrc_rx_hist - RX history structure for TFRC-based protocols
- * @ring: Packet history for RTT sampling and loss detection
- * @loss_count: Number of entries in circular history
- * @loss_start: Movable index (for loss detection)
- * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
- */
-struct tfrc_rx_hist {
- struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
- u8 loss_count:2,
- loss_start:2;
-#define rtt_sample_prev loss_start
-};
-
-/**
- * tfrc_rx_hist_index - index to reach n-th entry after loss_start
- */
-static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
-{
- return (h->loss_start + n) & TFRC_NDUPACK;
-}
-
-/**
- * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
-{
- return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
-}
-
-/**
- * tfrc_rx_hist_entry - return the n-th history entry after loss_start
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
-{
- return h->ring[tfrc_rx_hist_index(h, n)];
-}
-
-/**
- * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
-{
- return h->ring[h->loss_start];
-}
-
-/* indicate whether previously a packet was detected missing */
-static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
-{
- return h->loss_count > 0;
-}
-
-void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
- const u64 ndp);
-
-int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
-
-struct tfrc_loss_hist;
-int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
- struct sk_buff *skb, const u64 ndp,
- u32 (*first_li)(struct sock *sk), struct sock *sk);
-u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
-int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
-void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
-
-#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
deleted file mode 100644
index d7f265e1f50c..000000000000
--- a/net/dccp/ccids/lib/tfrc.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * TFRC library initialisation
- *
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
- */
-#include <linux/moduleparam.h>
-#include "tfrc.h"
-
-#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-bool tfrc_debug;
-module_param(tfrc_debug, bool, 0644);
-MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
-#endif
-
-int __init tfrc_lib_init(void)
-{
- int rc = tfrc_li_init();
-
- if (rc)
- goto out;
-
- rc = tfrc_tx_packet_history_init();
- if (rc)
- goto out_free_loss_intervals;
-
- rc = tfrc_rx_packet_history_init();
- if (rc)
- goto out_free_tx_history;
- return 0;
-
-out_free_tx_history:
- tfrc_tx_packet_history_exit();
-out_free_loss_intervals:
- tfrc_li_exit();
-out:
- return rc;
-}
-
-void tfrc_lib_exit(void)
-{
- tfrc_rx_packet_history_exit();
- tfrc_tx_packet_history_exit();
- tfrc_li_exit();
-}
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
deleted file mode 100644
index 0a63e8750cc5..000000000000
--- a/net/dccp/ccids/lib/tfrc.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _TFRC_H_
-#define _TFRC_H_
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- */
-#include <linux/types.h>
-#include <linux/math64.h>
-#include "../../dccp.h"
-
-/* internal includes that this library exports: */
-#include "loss_interval.h"
-#include "packet_history.h"
-
-#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-extern bool tfrc_debug;
-#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
-#else
-#define tfrc_pr_debug(format, a...)
-#endif
-
-/* integer-arithmetic divisions of type (a * 1000000)/b */
-static inline u64 scaled_div(u64 a, u64 b)
-{
- BUG_ON(b == 0);
- return div64_u64(a * 1000000, b);
-}
-
-static inline u32 scaled_div32(u64 a, u64 b)
-{
- u64 result = scaled_div(a, b);
-
- if (result > UINT_MAX) {
- DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
- (unsigned long long)a, (unsigned long long)b);
- return UINT_MAX;
- }
- return result;
-}
-
-/**
- * tfrc_ewma - Exponentially weighted moving average
- * @weight: Weight to be used as damping factor, in units of 1/10
- */
-static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
-{
- return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
-}
-
-u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
-
-int tfrc_tx_packet_history_init(void);
-void tfrc_tx_packet_history_exit(void);
-int tfrc_rx_packet_history_init(void);
-void tfrc_rx_packet_history_exit(void);
-
-int tfrc_li_init(void);
-void tfrc_li_exit(void);
-
-#ifdef CONFIG_IP_DCCP_TFRC_LIB
-int tfrc_lib_init(void);
-void tfrc_lib_exit(void);
-#else
-#define tfrc_lib_init() (0)
-#define tfrc_lib_exit()
-#endif
-#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
deleted file mode 100644
index 92a8c6bea316..000000000000
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ /dev/null
@@ -1,702 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- */
-
-#include <linux/module.h>
-#include "../../dccp.h"
-#include "tfrc.h"
-
-#define TFRC_CALC_X_ARRSIZE 500
-#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
-#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
-
-/*
- TFRC TCP Reno Throughput Equation Lookup Table for f(p)
-
- The following two-column lookup table implements a part of the TCP throughput
- equation from [RFC 3448, sec. 3.1]:
-
- s
- X_calc = --------------------------------------------------------------
- R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
-
- Where:
- X is the transmit rate in bytes/second
- s is the packet size in bytes
- R is the round trip time in seconds
- p is the loss event rate, between 0 and 1.0, of the number of loss
- events as a fraction of the number of packets transmitted
- t_RTO is the TCP retransmission timeout value in seconds
- b is the number of packets acknowledged by a single TCP ACK
-
- We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
-
- s
- X_calc = -------------------------------------------------------
- R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
-
- which we can break down into:
-
- s
- X_calc = ---------
- R * f(p)
-
- where f(p) is given for 0 < p <= 1 by:
-
- f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
-
- Since this is kernel code, floating-point arithmetic is avoided in favour of
- integer arithmetic. This means that nearly all fractional parameters are
- scaled by 1000000:
- * the parameters p and R
- * the return result f(p)
- The lookup table therefore actually tabulates the following function g(q):
-
- g(q) = 1000000 * f(q/1000000)
-
- Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
- granularity for the practically more relevant case of small values of p (up to
- 5%), the second column is used; the first one ranges up to 100%. This split
- corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
- determines the smallest resolution possible with this lookup table:
-
- TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
-
- The entire table is generated by:
- for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
- lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
- lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
- }
-
- With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
- lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
- lookup[M][0] = g(1000000) = 1000000 * f(100%)
- lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
- lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
-
- In summary, the two columns represent f(p) for the following ranges:
- * The first column is for 0.002 <= p <= 1.0
- * The second column is for 0.0001 <= p <= 0.05
- Where the columns overlap, the second (finer-grained) is given preference,
- i.e. the first column is used only for p >= 0.05.
- */
-static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
- { 37172, 8172 },
- { 53499, 11567 },
- { 66664, 14180 },
- { 78298, 16388 },
- { 89021, 18339 },
- { 99147, 20108 },
- { 108858, 21738 },
- { 118273, 23260 },
- { 127474, 24693 },
- { 136520, 26052 },
- { 145456, 27348 },
- { 154316, 28589 },
- { 163130, 29783 },
- { 171919, 30935 },
- { 180704, 32049 },
- { 189502, 33130 },
- { 198328, 34180 },
- { 207194, 35202 },
- { 216114, 36198 },
- { 225097, 37172 },
- { 234153, 38123 },
- { 243294, 39055 },
- { 252527, 39968 },
- { 261861, 40864 },
- { 271305, 41743 },
- { 280866, 42607 },
- { 290553, 43457 },
- { 300372, 44293 },
- { 310333, 45117 },
- { 320441, 45929 },
- { 330705, 46729 },
- { 341131, 47518 },
- { 351728, 48297 },
- { 362501, 49066 },
- { 373460, 49826 },
- { 384609, 50577 },
- { 395958, 51320 },
- { 407513, 52054 },
- { 419281, 52780 },
- { 431270, 53499 },
- { 443487, 54211 },
- { 455940, 54916 },
- { 468635, 55614 },
- { 481581, 56306 },
- { 494785, 56991 },
- { 508254, 57671 },
- { 521996, 58345 },
- { 536019, 59014 },
- { 550331, 59677 },
- { 564939, 60335 },
- { 579851, 60988 },
- { 595075, 61636 },
- { 610619, 62279 },
- { 626491, 62918 },
- { 642700, 63553 },
- { 659253, 64183 },
- { 676158, 64809 },
- { 693424, 65431 },
- { 711060, 66050 },
- { 729073, 66664 },
- { 747472, 67275 },
- { 766266, 67882 },
- { 785464, 68486 },
- { 805073, 69087 },
- { 825103, 69684 },
- { 845562, 70278 },
- { 866460, 70868 },
- { 887805, 71456 },
- { 909606, 72041 },
- { 931873, 72623 },
- { 954614, 73202 },
- { 977839, 73778 },
- { 1001557, 74352 },
- { 1025777, 74923 },
- { 1050508, 75492 },
- { 1075761, 76058 },
- { 1101544, 76621 },
- { 1127867, 77183 },
- { 1154739, 77741 },
- { 1182172, 78298 },
- { 1210173, 78852 },
- { 1238753, 79405 },
- { 1267922, 79955 },
- { 1297689, 80503 },
- { 1328066, 81049 },
- { 1359060, 81593 },
- { 1390684, 82135 },
- { 1422947, 82675 },
- { 1455859, 83213 },
- { 1489430, 83750 },
- { 1523671, 84284 },
- { 1558593, 84817 },
- { 1594205, 85348 },
- { 1630518, 85878 },
- { 1667543, 86406 },
- { 1705290, 86932 },
- { 1743770, 87457 },
- { 1782994, 87980 },
- { 1822973, 88501 },
- { 1863717, 89021 },
- { 1905237, 89540 },
- { 1947545, 90057 },
- { 1990650, 90573 },
- { 2034566, 91087 },
- { 2079301, 91600 },
- { 2124869, 92111 },
- { 2171279, 92622 },
- { 2218543, 93131 },
- { 2266673, 93639 },
- { 2315680, 94145 },
- { 2365575, 94650 },
- { 2416371, 95154 },
- { 2468077, 95657 },
- { 2520707, 96159 },
- { 2574271, 96660 },
- { 2628782, 97159 },
- { 2684250, 97658 },
- { 2740689, 98155 },
- { 2798110, 98651 },
- { 2856524, 99147 },
- { 2915944, 99641 },
- { 2976382, 100134 },
- { 3037850, 100626 },
- { 3100360, 101117 },
- { 3163924, 101608 },
- { 3228554, 102097 },
- { 3294263, 102586 },
- { 3361063, 103073 },
- { 3428966, 103560 },
- { 3497984, 104045 },
- { 3568131, 104530 },
- { 3639419, 105014 },
- { 3711860, 105498 },
- { 3785467, 105980 },
- { 3860253, 106462 },
- { 3936229, 106942 },
- { 4013410, 107422 },
- { 4091808, 107902 },
- { 4171435, 108380 },
- { 4252306, 108858 },
- { 4334431, 109335 },
- { 4417825, 109811 },
- { 4502501, 110287 },
- { 4588472, 110762 },
- { 4675750, 111236 },
- { 4764349, 111709 },
- { 4854283, 112182 },
- { 4945564, 112654 },
- { 5038206, 113126 },
- { 5132223, 113597 },
- { 5227627, 114067 },
- { 5324432, 114537 },
- { 5422652, 115006 },
- { 5522299, 115474 },
- { 5623389, 115942 },
- { 5725934, 116409 },
- { 5829948, 116876 },
- { 5935446, 117342 },
- { 6042439, 117808 },
- { 6150943, 118273 },
- { 6260972, 118738 },
- { 6372538, 119202 },
- { 6485657, 119665 },
- { 6600342, 120128 },
- { 6716607, 120591 },
- { 6834467, 121053 },
- { 6953935, 121514 },
- { 7075025, 121976 },
- { 7197752, 122436 },
- { 7322131, 122896 },
- { 7448175, 123356 },
- { 7575898, 123815 },
- { 7705316, 124274 },
- { 7836442, 124733 },
- { 7969291, 125191 },
- { 8103877, 125648 },
- { 8240216, 126105 },
- { 8378321, 126562 },
- { 8518208, 127018 },
- { 8659890, 127474 },
- { 8803384, 127930 },
- { 8948702, 128385 },
- { 9095861, 128840 },
- { 9244875, 129294 },
- { 9395760, 129748 },
- { 9548529, 130202 },
- { 9703198, 130655 },
- { 9859782, 131108 },
- { 10018296, 131561 },
- { 10178755, 132014 },
- { 10341174, 132466 },
- { 10505569, 132917 },
- { 10671954, 133369 },
- { 10840345, 133820 },
- { 11010757, 134271 },
- { 11183206, 134721 },
- { 11357706, 135171 },
- { 11534274, 135621 },
- { 11712924, 136071 },
- { 11893673, 136520 },
- { 12076536, 136969 },
- { 12261527, 137418 },
- { 12448664, 137867 },
- { 12637961, 138315 },
- { 12829435, 138763 },
- { 13023101, 139211 },
- { 13218974, 139658 },
- { 13417071, 140106 },
- { 13617407, 140553 },
- { 13819999, 140999 },
- { 14024862, 141446 },
- { 14232012, 141892 },
- { 14441465, 142339 },
- { 14653238, 142785 },
- { 14867346, 143230 },
- { 15083805, 143676 },
- { 15302632, 144121 },
- { 15523842, 144566 },
- { 15747453, 145011 },
- { 15973479, 145456 },
- { 16201939, 145900 },
- { 16432847, 146345 },
- { 16666221, 146789 },
- { 16902076, 147233 },
- { 17140429, 147677 },
- { 17381297, 148121 },
- { 17624696, 148564 },
- { 17870643, 149007 },
- { 18119154, 149451 },
- { 18370247, 149894 },
- { 18623936, 150336 },
- { 18880241, 150779 },
- { 19139176, 151222 },
- { 19400759, 151664 },
- { 19665007, 152107 },
- { 19931936, 152549 },
- { 20201564, 152991 },
- { 20473907, 153433 },
- { 20748982, 153875 },
- { 21026807, 154316 },
- { 21307399, 154758 },
- { 21590773, 155199 },
- { 21876949, 155641 },
- { 22165941, 156082 },
- { 22457769, 156523 },
- { 22752449, 156964 },
- { 23049999, 157405 },
- { 23350435, 157846 },
- { 23653774, 158287 },
- { 23960036, 158727 },
- { 24269236, 159168 },
- { 24581392, 159608 },
- { 24896521, 160049 },
- { 25214642, 160489 },
- { 25535772, 160929 },
- { 25859927, 161370 },
- { 26187127, 161810 },
- { 26517388, 162250 },
- { 26850728, 162690 },
- { 27187165, 163130 },
- { 27526716, 163569 },
- { 27869400, 164009 },
- { 28215234, 164449 },
- { 28564236, 164889 },
- { 28916423, 165328 },
- { 29271815, 165768 },
- { 29630428, 166208 },
- { 29992281, 166647 },
- { 30357392, 167087 },
- { 30725779, 167526 },
- { 31097459, 167965 },
- { 31472452, 168405 },
- { 31850774, 168844 },
- { 32232445, 169283 },
- { 32617482, 169723 },
- { 33005904, 170162 },
- { 33397730, 170601 },
- { 33792976, 171041 },
- { 34191663, 171480 },
- { 34593807, 171919 },
- { 34999428, 172358 },
- { 35408544, 172797 },
- { 35821174, 173237 },
- { 36237335, 173676 },
- { 36657047, 174115 },
- { 37080329, 174554 },
- { 37507197, 174993 },
- { 37937673, 175433 },
- { 38371773, 175872 },
- { 38809517, 176311 },
- { 39250924, 176750 },
- { 39696012, 177190 },
- { 40144800, 177629 },
- { 40597308, 178068 },
- { 41053553, 178507 },
- { 41513554, 178947 },
- { 41977332, 179386 },
- { 42444904, 179825 },
- { 42916290, 180265 },
- { 43391509, 180704 },
- { 43870579, 181144 },
- { 44353520, 181583 },
- { 44840352, 182023 },
- { 45331092, 182462 },
- { 45825761, 182902 },
- { 46324378, 183342 },
- { 46826961, 183781 },
- { 47333531, 184221 },
- { 47844106, 184661 },
- { 48358706, 185101 },
- { 48877350, 185541 },
- { 49400058, 185981 },
- { 49926849, 186421 },
- { 50457743, 186861 },
- { 50992759, 187301 },
- { 51531916, 187741 },
- { 52075235, 188181 },
- { 52622735, 188622 },
- { 53174435, 189062 },
- { 53730355, 189502 },
- { 54290515, 189943 },
- { 54854935, 190383 },
- { 55423634, 190824 },
- { 55996633, 191265 },
- { 56573950, 191706 },
- { 57155606, 192146 },
- { 57741621, 192587 },
- { 58332014, 193028 },
- { 58926806, 193470 },
- { 59526017, 193911 },
- { 60129666, 194352 },
- { 60737774, 194793 },
- { 61350361, 195235 },
- { 61967446, 195677 },
- { 62589050, 196118 },
- { 63215194, 196560 },
- { 63845897, 197002 },
- { 64481179, 197444 },
- { 65121061, 197886 },
- { 65765563, 198328 },
- { 66414705, 198770 },
- { 67068508, 199213 },
- { 67726992, 199655 },
- { 68390177, 200098 },
- { 69058085, 200540 },
- { 69730735, 200983 },
- { 70408147, 201426 },
- { 71090343, 201869 },
- { 71777343, 202312 },
- { 72469168, 202755 },
- { 73165837, 203199 },
- { 73867373, 203642 },
- { 74573795, 204086 },
- { 75285124, 204529 },
- { 76001380, 204973 },
- { 76722586, 205417 },
- { 77448761, 205861 },
- { 78179926, 206306 },
- { 78916102, 206750 },
- { 79657310, 207194 },
- { 80403571, 207639 },
- { 81154906, 208084 },
- { 81911335, 208529 },
- { 82672880, 208974 },
- { 83439562, 209419 },
- { 84211402, 209864 },
- { 84988421, 210309 },
- { 85770640, 210755 },
- { 86558080, 211201 },
- { 87350762, 211647 },
- { 88148708, 212093 },
- { 88951938, 212539 },
- { 89760475, 212985 },
- { 90574339, 213432 },
- { 91393551, 213878 },
- { 92218133, 214325 },
- { 93048107, 214772 },
- { 93883493, 215219 },
- { 94724314, 215666 },
- { 95570590, 216114 },
- { 96422343, 216561 },
- { 97279594, 217009 },
- { 98142366, 217457 },
- { 99010679, 217905 },
- { 99884556, 218353 },
- { 100764018, 218801 },
- { 101649086, 219250 },
- { 102539782, 219698 },
- { 103436128, 220147 },
- { 104338146, 220596 },
- { 105245857, 221046 },
- { 106159284, 221495 },
- { 107078448, 221945 },
- { 108003370, 222394 },
- { 108934074, 222844 },
- { 109870580, 223294 },
- { 110812910, 223745 },
- { 111761087, 224195 },
- { 112715133, 224646 },
- { 113675069, 225097 },
- { 114640918, 225548 },
- { 115612702, 225999 },
- { 116590442, 226450 },
- { 117574162, 226902 },
- { 118563882, 227353 },
- { 119559626, 227805 },
- { 120561415, 228258 },
- { 121569272, 228710 },
- { 122583219, 229162 },
- { 123603278, 229615 },
- { 124629471, 230068 },
- { 125661822, 230521 },
- { 126700352, 230974 },
- { 127745083, 231428 },
- { 128796039, 231882 },
- { 129853241, 232336 },
- { 130916713, 232790 },
- { 131986475, 233244 },
- { 133062553, 233699 },
- { 134144966, 234153 },
- { 135233739, 234608 },
- { 136328894, 235064 },
- { 137430453, 235519 },
- { 138538440, 235975 },
- { 139652876, 236430 },
- { 140773786, 236886 },
- { 141901190, 237343 },
- { 143035113, 237799 },
- { 144175576, 238256 },
- { 145322604, 238713 },
- { 146476218, 239170 },
- { 147636442, 239627 },
- { 148803298, 240085 },
- { 149976809, 240542 },
- { 151156999, 241000 },
- { 152343890, 241459 },
- { 153537506, 241917 },
- { 154737869, 242376 },
- { 155945002, 242835 },
- { 157158929, 243294 },
- { 158379673, 243753 },
- { 159607257, 244213 },
- { 160841704, 244673 },
- { 162083037, 245133 },
- { 163331279, 245593 },
- { 164586455, 246054 },
- { 165848586, 246514 },
- { 167117696, 246975 },
- { 168393810, 247437 },
- { 169676949, 247898 },
- { 170967138, 248360 },
- { 172264399, 248822 },
- { 173568757, 249284 },
- { 174880235, 249747 },
- { 176198856, 250209 },
- { 177524643, 250672 },
- { 178857621, 251136 },
- { 180197813, 251599 },
- { 181545242, 252063 },
- { 182899933, 252527 },
- { 184261908, 252991 },
- { 185631191, 253456 },
- { 187007807, 253920 },
- { 188391778, 254385 },
- { 189783129, 254851 },
- { 191181884, 255316 },
- { 192588065, 255782 },
- { 194001698, 256248 },
- { 195422805, 256714 },
- { 196851411, 257181 },
- { 198287540, 257648 },
- { 199731215, 258115 },
- { 201182461, 258582 },
- { 202641302, 259050 },
- { 204107760, 259518 },
- { 205581862, 259986 },
- { 207063630, 260454 },
- { 208553088, 260923 },
- { 210050262, 261392 },
- { 211555174, 261861 },
- { 213067849, 262331 },
- { 214588312, 262800 },
- { 216116586, 263270 },
- { 217652696, 263741 },
- { 219196666, 264211 },
- { 220748520, 264682 },
- { 222308282, 265153 },
- { 223875978, 265625 },
- { 225451630, 266097 },
- { 227035265, 266569 },
- { 228626905, 267041 },
- { 230226576, 267514 },
- { 231834302, 267986 },
- { 233450107, 268460 },
- { 235074016, 268933 },
- { 236706054, 269407 },
- { 238346244, 269881 },
- { 239994613, 270355 },
- { 241651183, 270830 },
- { 243315981, 271305 }
-};
-
-/* return largest index i such that fval <= lookup[i][small] */
-static inline u32 tfrc_binsearch(u32 fval, u8 small)
-{
- u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
-
- while (low < high) {
- try = (low + high) / 2;
- if (fval <= tfrc_calc_x_lookup[try][small])
- high = try;
- else
- low = try + 1;
- }
- return high;
-}
-
-/**
- * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
- * @s: packet size in bytes
- * @R: RTT scaled by 1000000 (i.e., microseconds)
- * @p: loss ratio estimate scaled by 1000000
- *
- * Returns X_calc in bytes per second (not scaled).
- */
-u32 tfrc_calc_x(u16 s, u32 R, u32 p)
-{
- u16 index;
- u32 f;
- u64 result;
-
- /* check against invalid parameters and divide-by-zero */
- BUG_ON(p > 1000000); /* p must not exceed 100% */
- BUG_ON(p == 0); /* f(0) = 0, divide by zero */
- if (R == 0) { /* possible divide by zero */
- DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
- return ~0U;
- }
-
- if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
- if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
- DCCP_WARN("Value of p (%d) below resolution. "
- "Substituting %d\n", p, TFRC_SMALLEST_P);
- index = 0;
- } else /* 0.0001 <= p <= 0.05 */
- index = p/TFRC_SMALLEST_P - 1;
-
- f = tfrc_calc_x_lookup[index][1];
-
- } else { /* 0.05 < p <= 1.00 */
- index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
-
- f = tfrc_calc_x_lookup[index][0];
- }
-
- /*
- * Compute X = s/(R*f(p)) in bytes per second.
- * Since f(p) and R are both scaled by 1000000, we need to multiply by
- * 1000000^2. To avoid overflow, the result is computed in two stages.
- * This works under almost all reasonable operational conditions, for a
- * wide range of parameters. Yet, should some strange combination of
- * parameters result in overflow, the use of scaled_div32 will catch
- * this and return UINT_MAX - which is a logically adequate consequence.
- */
- result = scaled_div(s, R);
- return scaled_div32(result, f);
-}
-
-/**
- * tfrc_calc_x_reverse_lookup - try to find p given f(p)
- * @fvalue: function value to match, scaled by 1000000
- *
- * Returns closest match for p, also scaled by 1000000
- */
-u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
-{
- int index;
-
- if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
- return 0;
-
- /* Error cases. */
- if (fvalue < tfrc_calc_x_lookup[0][1]) {
- DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
- return TFRC_SMALLEST_P;
- }
- if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
- DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
- return 1000000;
- }
-
- if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
- index = tfrc_binsearch(fvalue, 1);
- return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
- }
-
- /* else ... it must be in the coarse-grained column */
- index = tfrc_binsearch(fvalue, 0);
- return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
-}
-
-/**
- * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
- * @loss_event_rate: loss event rate to invert
- * When @loss_event_rate is large, there is a chance that p is truncated to 0.
- * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
- */
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
-{
- if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
- return 0;
- if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
- return 1000000;
- return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
-}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
deleted file mode 100644
index 1f748ed1279d..000000000000
--- a/net/dccp/dccp.h
+++ /dev/null
@@ -1,483 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _DCCP_H
-#define _DCCP_H
-/*
- * net/dccp/dccp.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- */
-
-#include <linux/dccp.h>
-#include <linux/ktime.h>
-#include <net/snmp.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include "ackvec.h"
-
-/*
- * DCCP - specific warning and debugging macros.
- */
-#define DCCP_WARN(fmt, ...) \
- net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__)
-#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
- __FILE__, __LINE__, __func__)
-#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
-#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
- DCCP_BUG("\"%s\" holds (exception!)", \
- __stringify(cond)); \
- } while (0)
-
-#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
- printk(fmt, ##args); \
- } while(0)
-#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
- "%s: " fmt, __func__, ##a)
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern bool dccp_debug;
-#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
-#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
-#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
-#else
-#define dccp_pr_debug(format, a...) do {} while (0)
-#define dccp_pr_debug_cat(format, a...) do {} while (0)
-#define dccp_debug(format, a...) do {} while (0)
-#endif
-
-extern struct inet_hashinfo dccp_hashinfo;
-
-DECLARE_PER_CPU(unsigned int, dccp_orphan_count);
-
-void dccp_time_wait(struct sock *sk, int state, int timeo);
-
-/*
- * Set safe upper bounds for header and option length. Since Data Offset is 8
- * bits (RFC 4340, sec. 5.1), the total header length can never be more than
- * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
- * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
- * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
- * Hence a safe upper bound for the maximum option length is 1020-28 = 992
- */
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
-#define DCCP_MAX_PACKET_HDR 28
-#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
-#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
-
-/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
-#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
-
-#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
- * state, about 60 seconds */
-
-/* RFC 1122, 4.2.3.1 initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
-
-/*
- * The maximum back-off value for retransmissions. This is needed for
- * - retransmitting client-Requests (sec. 8.1.1),
- * - retransmitting Close/CloseReq when closing (sec. 8.3),
- * - feature-negotiation retransmission (sec. 6.6.3),
- * - Acks in client-PARTOPEN state (sec. 8.1.5).
- */
-#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
-
-/*
- * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
- */
-#define DCCP_SANE_RTT_MIN 100
-#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
-#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
-
-/* sysctl variables for DCCP */
-extern int sysctl_dccp_request_retries;
-extern int sysctl_dccp_retries1;
-extern int sysctl_dccp_retries2;
-extern int sysctl_dccp_tx_qlen;
-extern int sysctl_dccp_sync_ratelimit;
-
-/*
- * 48-bit sequence number arithmetic (signed and unsigned)
- */
-#define INT48_MIN 0x800000000000LL /* 2^47 */
-#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */
-#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */
-#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
-#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x)))
-#define ADD48(a, b) (((a) + (b)) & UINT48_MAX)
-#define SUB48(a, b) ADD48((a), COMPLEMENT48(b))
-
-static inline void dccp_inc_seqno(u64 *seqno)
-{
- *seqno = ADD48(*seqno, 1);
-}
-
-/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */
-static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
-{
- u64 delta = SUB48(seqno2, seqno1);
-
- return TO_SIGNED48(delta);
-}
-
-/* is seq1 < seq2 ? */
-static inline int before48(const u64 seq1, const u64 seq2)
-{
- return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
-}
-
-/* is seq1 > seq2 ? */
-#define after48(seq1, seq2) before48(seq2, seq1)
-
-/* is seq2 <= seq1 <= seq3 ? */
-static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
-{
- return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
-}
-
-/**
- * dccp_loss_count - Approximate the number of lost data packets in a burst loss
- * @s1: last known sequence number before the loss ('hole')
- * @s2: first sequence number seen after the 'hole'
- * @ndp: NDP count on packet with sequence number @s2
- */
-static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
-{
- s64 delta = dccp_delta_seqno(s1, s2);
-
- WARN_ON(delta < 0);
- delta -= ndp + 1;
-
- return delta > 0 ? delta : 0;
-}
-
-/**
- * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
- */
-static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
-{
- return dccp_loss_count(s1, s2, ndp) == 0;
-}
-
-enum {
- DCCP_MIB_NUM = 0,
- DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
- DCCP_MIB_ESTABRESETS, /* EstabResets */
- DCCP_MIB_CURRESTAB, /* CurrEstab */
- DCCP_MIB_OUTSEGS, /* OutSegs */
- DCCP_MIB_OUTRSTS,
- DCCP_MIB_ABORTONTIMEOUT,
- DCCP_MIB_TIMEOUTS,
- DCCP_MIB_ABORTFAILED,
- DCCP_MIB_PASSIVEOPENS,
- DCCP_MIB_ATTEMPTFAILS,
- DCCP_MIB_OUTDATAGRAMS,
- DCCP_MIB_INERRS,
- DCCP_MIB_OPTMANDATORYERROR,
- DCCP_MIB_INVALIDOPT,
- __DCCP_MIB_MAX
-};
-
-#define DCCP_MIB_MAX __DCCP_MIB_MAX
-struct dccp_mib {
- unsigned long mibs[DCCP_MIB_MAX];
-};
-
-DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
-#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
-#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field)
-#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-
-/*
- * Checksumming routines
- */
-static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
-{
- const struct dccp_hdr* dh = dccp_hdr(skb);
-
- if (dh->dccph_cscov == 0)
- return skb->len;
- return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
-}
-
-static inline void dccp_csum_outgoing(struct sk_buff *skb)
-{
- unsigned int cov = dccp_csum_coverage(skb);
-
- if (cov >= skb->len)
- dccp_hdr(skb)->dccph_cscov = 0;
-
- skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
-}
-
-void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
-
-int dccp_retransmit_skb(struct sock *sk);
-
-void dccp_send_ack(struct sock *sk);
-void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *rsk);
-
-void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
-
-/*
- * TX Packet Dequeueing Interface
- */
-void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
-bool dccp_qpolicy_full(struct sock *sk);
-void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
-struct sk_buff *dccp_qpolicy_top(struct sock *sk);
-struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
-bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
-
-/*
- * TX Packet Output and TX Timers
- */
-void dccp_write_xmit(struct sock *sk);
-void dccp_write_space(struct sock *sk);
-void dccp_flush_write_queue(struct sock *sk, long *time_budget);
-
-void dccp_init_xmit_timers(struct sock *sk);
-static inline void dccp_clear_xmit_timers(struct sock *sk)
-{
- inet_csk_clear_xmit_timers(sk);
-}
-
-unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-
-const char *dccp_packet_name(const int type);
-
-void dccp_set_state(struct sock *sk, const int state);
-void dccp_done(struct sock *sk);
-
-int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
- struct sk_buff const *skb);
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-
-struct sock *dccp_create_openreq_child(const struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req);
-struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req);
-
-int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned int len);
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len);
-
-void dccp_destruct_common(struct sock *sk);
-int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
-void dccp_destroy_sock(struct sock *sk);
-
-void dccp_close(struct sock *sk, long timeout);
-struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
- struct request_sock *req);
-
-int dccp_connect(struct sock *sk);
-int dccp_disconnect(struct sock *sk, int flags);
-int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-int dccp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen);
-int dccp_ioctl(struct sock *sk, int cmd, int *karg);
-int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
- int *addr_len);
-void dccp_shutdown(struct sock *sk, int how);
-int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
-void dccp_req_err(struct sock *sk, u64 seq);
-
-struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
-int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
-void dccp_send_close(struct sock *sk, const int active);
-int dccp_invalid_packet(struct sk_buff *skb);
-u32 dccp_sample_rtt(struct sock *sk, long delta);
-
-static inline bool dccp_bad_service_code(const struct sock *sk,
- const __be32 service)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- if (dp->dccps_service == service)
- return false;
- return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
-/**
- * dccp_skb_cb - DCCP per-packet control information
- * @dccpd_type: one of %dccp_pkt_type (or unknown)
- * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
- * @dccpd_reset_code: one of %dccp_reset_codes
- * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
- * @dccpd_opt_len: total length of all options (5.8) in the packet
- * @dccpd_seq: sequence number
- * @dccpd_ack_seq: acknowledgment number subheader field value
- *
- * This is used for transmission as well as for reception.
- */
-struct dccp_skb_cb {
- union {
- struct inet_skb_parm h4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_skb_parm h6;
-#endif
- } header;
- __u8 dccpd_type:4;
- __u8 dccpd_ccval:4;
- __u8 dccpd_reset_code,
- dccpd_reset_data[3];
- __u16 dccpd_opt_len;
- __u64 dccpd_seq;
- __u64 dccpd_ack_seq;
-};
-
-#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
-
-/* RFC 4340, sec. 7.7 */
-static inline int dccp_non_data_packet(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_ACK ||
- type == DCCP_PKT_CLOSE ||
- type == DCCP_PKT_CLOSEREQ ||
- type == DCCP_PKT_RESET ||
- type == DCCP_PKT_SYNC ||
- type == DCCP_PKT_SYNCACK;
-}
-
-/* RFC 4340, sec. 7.7 */
-static inline int dccp_data_packet(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_DATA ||
- type == DCCP_PKT_DATAACK ||
- type == DCCP_PKT_REQUEST ||
- type == DCCP_PKT_RESPONSE;
-}
-
-static inline int dccp_packet_without_ack(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
-}
-
-#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
-
-static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
-{
- struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
- sizeof(*dh));
- dh->dccph_seq2 = 0;
- dh->dccph_seq = htons((gss >> 32) & 0xfffff);
- dhx->dccph_seq_low = htonl(gss & 0xffffffff);
-}
-
-static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
- const u64 gsr)
-{
- dhack->dccph_reserved1 = 0;
- dhack->dccph_ack_nr_high = htons(gsr >> 32);
- dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
-}
-
-static inline void dccp_update_gsr(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (after48(seq, dp->dccps_gsr))
- dp->dccps_gsr = seq;
- /* Sequence validity window depends on remote Sequence Window (7.5.1) */
- dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
- /*
- * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
- * 7.5.1 we perform this check beyond the initial handshake: W/W' are
- * always > 32, so for the first W/W' packets in the lifetime of a
- * connection we always have to adjust SWL.
- * A second reason why we are doing this is that the window depends on
- * the feature-remote value of Sequence Window: nothing stops the peer
- * from updating this value while we are busy adjusting SWL for the
- * first W packets (we would have to count from scratch again then).
- * Therefore it is safer to always make sure that the Sequence Window
- * is not artificially extended by a peer who grows SWL downwards by
- * continually updating the feature-remote Sequence-Window.
- * If sequence numbers wrap it is bad luck. But that will take a while
- * (48 bit), and this measure prevents Sequence-number attacks.
- */
- if (before48(dp->dccps_swl, dp->dccps_isr))
- dp->dccps_swl = dp->dccps_isr;
- dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
-}
-
-static inline void dccp_update_gss(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_gss = seq;
- /* Ack validity window depends on local Sequence Window value (7.5.1) */
- dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
- /* Adjust AWL so that it is not below ISS - see comment above for SWL */
- if (before48(dp->dccps_awl, dp->dccps_iss))
- dp->dccps_awl = dp->dccps_iss;
- dp->dccps_awh = dp->dccps_gss;
-}
-
-static inline int dccp_ackvec_pending(const struct sock *sk)
-{
- return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
- !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
-}
-
-static inline int dccp_ack_pending(const struct sock *sk)
-{
- return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
-}
-
-int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
-int dccp_feat_finalise_settings(struct dccp_sock *dp);
-int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
-int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
- struct sk_buff *skb);
-int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
-void dccp_feat_list_purge(struct list_head *fn_list);
-
-int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
-u32 dccp_timestamp(void);
-void dccp_timestamping_init(void);
-int dccp_insert_option(struct sk_buff *skb, unsigned char option,
- const void *value, unsigned char len);
-
-#ifdef CONFIG_SYSCTL
-int dccp_sysctl_init(void);
-void dccp_sysctl_exit(void);
-#else
-static inline int dccp_sysctl_init(void)
-{
- return 0;
-}
-
-static inline void dccp_sysctl_exit(void)
-{
-}
-#endif
-
-#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
deleted file mode 100644
index f5019d95c3ae..000000000000
--- a/net/dccp/diag.c
+++ /dev/null
@@ -1,85 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/diag.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- */
-
-
-#include <linux/module.h>
-#include <linux/inet_diag.h>
-
-#include "ccid.h"
-#include "dccp.h"
-
-static void dccp_get_info(struct sock *sk, struct tcp_info *info)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- memset(info, 0, sizeof(*info));
-
- info->tcpi_state = sk->sk_state;
- info->tcpi_retransmits = icsk->icsk_retransmits;
- info->tcpi_probes = icsk->icsk_probes_out;
- info->tcpi_backoff = icsk->icsk_backoff;
- info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
-
- if (dp->dccps_hc_rx_ackvec != NULL)
- info->tcpi_options |= TCPI_OPT_SACK;
-
- if (dp->dccps_hc_rx_ccid != NULL)
- ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
-
- if (dp->dccps_hc_tx_ccid != NULL)
- ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
-}
-
-static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
- void *_info)
-{
- r->idiag_rqueue = r->idiag_wqueue = 0;
-
- if (_info != NULL)
- dccp_get_info(sk, _info);
-}
-
-static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r)
-{
- inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r);
-}
-
-static int dccp_diag_dump_one(struct netlink_callback *cb,
- const struct inet_diag_req_v2 *req)
-{
- return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req);
-}
-
-static const struct inet_diag_handler dccp_diag_handler = {
- .owner = THIS_MODULE,
- .dump = dccp_diag_dump,
- .dump_one = dccp_diag_dump_one,
- .idiag_get_info = dccp_diag_get_info,
- .idiag_type = IPPROTO_DCCP,
- .idiag_info_size = sizeof(struct tcp_info),
-};
-
-static int __init dccp_diag_init(void)
-{
- return inet_diag_register(&dccp_diag_handler);
-}
-
-static void __exit dccp_diag_fini(void)
-{
- inet_diag_unregister(&dccp_diag_handler);
-}
-
-module_init(dccp_diag_init);
-module_exit(dccp_diag_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP inet_diag handler");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
deleted file mode 100644
index f7554dcdaaba..000000000000
--- a/net/dccp/feat.c
+++ /dev/null
@@ -1,1581 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/feat.c
- *
- * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- *
- * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
- * Rewrote from scratch, some bits from earlier code by
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * ASSUMPTIONS
- * -----------
- * o Feature negotiation is coordinated with connection setup (as in TCP), wild
- * changes of parameters of an established connection are not supported.
- * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
- * o All currently known SP features have 1-byte quantities. If in the future
- * extensions of RFCs 4340..42 define features with item lengths larger than
- * one byte, a feature-specific extension of the code will be required.
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include "ccid.h"
-#include "feat.h"
-
-/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
-unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
-int sysctl_dccp_rx_ccid __read_mostly = 2,
- sysctl_dccp_tx_ccid __read_mostly = 2;
-
-/*
- * Feature activation handlers.
- *
- * These all use an u64 argument, to provide enough room for NN/SP features. At
- * this stage the negotiated values have been checked to be within their range.
- */
-static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid *new_ccid = ccid_new(ccid, sk, rx);
-
- if (new_ccid == NULL)
- return -ENOMEM;
-
- if (rx) {
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = new_ccid;
- } else {
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_tx_ccid = new_ccid;
- }
- return 0;
-}
-
-static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx) {
- dp->dccps_r_seq_win = seq_win;
- /* propagate changes to update SWL/SWH */
- dccp_update_gsr(sk, dp->dccps_gsr);
- } else {
- dp->dccps_l_seq_win = seq_win;
- /* propagate changes to update AWL */
- dccp_update_gss(sk, dp->dccps_gss);
- }
- return 0;
-}
-
-static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
-{
- if (rx)
- dccp_sk(sk)->dccps_r_ack_ratio = ratio;
- else
- dccp_sk(sk)->dccps_l_ack_ratio = ratio;
- return 0;
-}
-
-static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx) {
- if (enable && dp->dccps_hc_rx_ackvec == NULL) {
- dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
- if (dp->dccps_hc_rx_ackvec == NULL)
- return -ENOMEM;
- } else if (!enable) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- }
- return 0;
-}
-
-static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
-{
- if (!rx)
- dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
- return 0;
-}
-
-/*
- * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
- * `rx' holds when the sending peer informs about his partial coverage via a
- * ChangeR() option. In the other case, we are the sender and the receiver
- * announces its coverage via ChangeL() options. The policy here is to honour
- * such communication by enabling the corresponding partial coverage - but only
- * if it has not been set manually before; the warning here means that all
- * packets will be dropped.
- */
-static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx)
- dp->dccps_pcrlen = cscov;
- else {
- if (dp->dccps_pcslen == 0)
- dp->dccps_pcslen = cscov;
- else if (cscov > dp->dccps_pcslen)
- DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
- dp->dccps_pcslen, (u8)cscov);
- }
- return 0;
-}
-
-static const struct {
- u8 feat_num; /* DCCPF_xxx */
- enum dccp_feat_type rxtx; /* RX or TX */
- enum dccp_feat_type reconciliation; /* SP or NN */
- u8 default_value; /* as in 6.4 */
- int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
-/*
- * Lookup table for location and type of features (from RFC 4340/4342)
- * +--------------------------+----+-----+----+----+---------+-----------+
- * | Feature | Location | Reconc. | Initial | Section |
- * | | RX | TX | SP | NN | Value | Reference |
- * +--------------------------+----+-----+----+----+---------+-----------+
- * | DCCPF_CCID | | X | X | | 2 | 10 |
- * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
- * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
- * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
- * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
- * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
- * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
- * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
- * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
- * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
- * +--------------------------+----+-----+----+----+---------+-----------+
- */
-} dccp_feat_table[] = {
- { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
- { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
- { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
- { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
- { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
- { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
- { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
- { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
- { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
- { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
-};
-#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
-
-/**
- * dccp_feat_index - Hash function to map feature number into array position
- * @feat_num: feature to hash, one of %dccp_feature_numbers
- *
- * Returns consecutive array index or -1 if the feature is not understood.
- */
-static int dccp_feat_index(u8 feat_num)
-{
- /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
- if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
- return feat_num - 1;
-
- /*
- * Other features: add cases for new feature types here after adding
- * them to the above table.
- */
- switch (feat_num) {
- case DCCPF_SEND_LEV_RATE:
- return DCCP_FEAT_SUPPORTED_MAX - 1;
- }
- return -1;
-}
-
-static u8 dccp_feat_type(u8 feat_num)
-{
- int idx = dccp_feat_index(feat_num);
-
- if (idx < 0)
- return FEAT_UNKNOWN;
- return dccp_feat_table[idx].reconciliation;
-}
-
-static int dccp_feat_default_value(u8 feat_num)
-{
- int idx = dccp_feat_index(feat_num);
- /*
- * There are no default values for unknown features, so encountering a
- * negative index here indicates a serious problem somewhere else.
- */
- DCCP_BUG_ON(idx < 0);
-
- return idx < 0 ? 0 : dccp_feat_table[idx].default_value;
-}
-
-/*
- * Debugging and verbose-printing section
- */
-static const char *dccp_feat_fname(const u8 feat)
-{
- static const char *const feature_names[] = {
- [DCCPF_RESERVED] = "Reserved",
- [DCCPF_CCID] = "CCID",
- [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
- [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
- [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
- [DCCPF_ACK_RATIO] = "Ack Ratio",
- [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
- [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
- [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
- [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
- };
- if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
- return feature_names[DCCPF_RESERVED];
-
- if (feat == DCCPF_SEND_LEV_RATE)
- return "Send Loss Event Rate";
- if (feat >= DCCPF_MIN_CCID_SPECIFIC)
- return "CCID-specific";
-
- return feature_names[feat];
-}
-
-static const char *const dccp_feat_sname[] = {
- "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
-};
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-static const char *dccp_feat_oname(const u8 opt)
-{
- switch (opt) {
- case DCCPO_CHANGE_L: return "Change_L";
- case DCCPO_CONFIRM_L: return "Confirm_L";
- case DCCPO_CHANGE_R: return "Change_R";
- case DCCPO_CONFIRM_R: return "Confirm_R";
- }
- return NULL;
-}
-
-static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
-{
- u8 i, type = dccp_feat_type(feat_num);
-
- if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
- dccp_pr_debug_cat("(NULL)");
- else if (type == FEAT_SP)
- for (i = 0; i < val->sp.len; i++)
- dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
- else if (type == FEAT_NN)
- dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
- else
- dccp_pr_debug_cat("unknown type %u", type);
-}
-
-static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
-{
- u8 type = dccp_feat_type(feat_num);
- dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
-
- if (type == FEAT_NN)
- fval.nn = dccp_decode_value_var(list, len);
- dccp_feat_printval(feat_num, &fval);
-}
-
-static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
-{
- dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
- dccp_feat_fname(entry->feat_num));
- dccp_feat_printval(entry->feat_num, &entry->val);
- dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
- entry->needs_confirm ? "(Confirm pending)" : "");
-}
-
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
- dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
- dccp_feat_printvals(feat, val, len); \
- dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
-
-#define dccp_feat_print_fnlist(fn_list) { \
- const struct dccp_feat_entry *___entry; \
- \
- dccp_pr_debug("List Dump:\n"); \
- list_for_each_entry(___entry, fn_list, node) \
- dccp_feat_print_entry(___entry); \
-}
-#else /* ! CONFIG_IP_DCCP_DEBUG */
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
-#define dccp_feat_print_fnlist(fn_list)
-#endif
-
-static int __dccp_feat_activate(struct sock *sk, const int idx,
- const bool is_local, dccp_feat_val const *fval)
-{
- bool rx;
- u64 val;
-
- if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
- return -1;
- if (dccp_feat_table[idx].activation_hdlr == NULL)
- return 0;
-
- if (fval == NULL) {
- val = dccp_feat_table[idx].default_value;
- } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
- if (fval->sp.vec == NULL) {
- /*
- * This can happen when an empty Confirm is sent
- * for an SP (i.e. known) feature. In this case
- * we would be using the default anyway.
- */
- DCCP_CRIT("Feature #%d undefined: using default", idx);
- val = dccp_feat_table[idx].default_value;
- } else {
- val = fval->sp.vec[0];
- }
- } else {
- val = fval->nn;
- }
-
- /* Location is RX if this is a local-RX or remote-TX feature */
- rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
-
- dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
- dccp_feat_fname(dccp_feat_table[idx].feat_num),
- fval ? "" : "default ", (unsigned long long)val);
-
- return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
-}
-
-/**
- * dccp_feat_activate - Activate feature value on socket
- * @sk: fully connected DCCP socket (after handshake is complete)
- * @feat_num: feature to activate, one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @fval: the value (SP or NN) to activate, or NULL to use the default value
- *
- * For general use this function is preferable over __dccp_feat_activate().
- */
-static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
- dccp_feat_val const *fval)
-{
- return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
-}
-
-/* Test for "Req'd" feature (RFC 4340, 6.4) */
-static inline int dccp_feat_must_be_understood(u8 feat_num)
-{
- return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
- feat_num == DCCPF_SEQUENCE_WINDOW;
-}
-
-/* copy constructor, fval must not already contain allocated memory */
-static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
-{
- fval->sp.len = len;
- if (fval->sp.len > 0) {
- fval->sp.vec = kmemdup(val, len, gfp_any());
- if (fval->sp.vec == NULL) {
- fval->sp.len = 0;
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
-{
- if (unlikely(val == NULL))
- return;
- if (dccp_feat_type(feat_num) == FEAT_SP)
- kfree(val->sp.vec);
- memset(val, 0, sizeof(*val));
-}
-
-static struct dccp_feat_entry *
- dccp_feat_clone_entry(struct dccp_feat_entry const *original)
-{
- struct dccp_feat_entry *new;
- u8 type = dccp_feat_type(original->feat_num);
-
- if (type == FEAT_UNKNOWN)
- return NULL;
-
- new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
- if (new == NULL)
- return NULL;
-
- if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
- original->val.sp.vec,
- original->val.sp.len)) {
- kfree(new);
- return NULL;
- }
- return new;
-}
-
-static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
-{
- if (entry != NULL) {
- dccp_feat_val_destructor(entry->feat_num, &entry->val);
- kfree(entry);
- }
-}
-
-/*
- * List management functions
- *
- * Feature negotiation lists rely on and maintain the following invariants:
- * - each feat_num in the list is known, i.e. we know its type and default value
- * - each feat_num/is_local combination is unique (old entries are overwritten)
- * - SP values are always freshly allocated
- * - list is sorted in increasing order of feature number (faster lookup)
- */
-static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
- u8 feat_num, bool is_local)
-{
- struct dccp_feat_entry *entry;
-
- list_for_each_entry(entry, fn_list, node) {
- if (entry->feat_num == feat_num && entry->is_local == is_local)
- return entry;
- else if (entry->feat_num > feat_num)
- break;
- }
- return NULL;
-}
-
-/**
- * dccp_feat_entry_new - Central list update routine (called by all others)
- * @head: list to add to
- * @feat: feature number
- * @local: whether the local (1) or remote feature with number @feat is meant
- *
- * This is the only constructor and serves to ensure the above invariants.
- */
-static struct dccp_feat_entry *
- dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
-{
- struct dccp_feat_entry *entry;
-
- list_for_each_entry(entry, head, node)
- if (entry->feat_num == feat && entry->is_local == local) {
- dccp_feat_val_destructor(entry->feat_num, &entry->val);
- return entry;
- } else if (entry->feat_num > feat) {
- head = &entry->node;
- break;
- }
-
- entry = kmalloc(sizeof(*entry), gfp_any());
- if (entry != NULL) {
- entry->feat_num = feat;
- entry->is_local = local;
- list_add_tail(&entry->node, head);
- }
- return entry;
-}
-
-/**
- * dccp_feat_push_change - Add/overwrite a Change option in the list
- * @fn_list: feature-negotiation list to update
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @mandatory: whether to use Mandatory feature negotiation options
- * @fval: pointer to NN/SP value to be inserted (will be copied)
- */
-static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
- u8 mandatory, dccp_feat_val *fval)
-{
- struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
-
- if (new == NULL)
- return -ENOMEM;
-
- new->feat_num = feat;
- new->is_local = local;
- new->state = FEAT_INITIALISING;
- new->needs_confirm = false;
- new->empty_confirm = false;
- new->val = *fval;
- new->needs_mandatory = mandatory;
-
- return 0;
-}
-
-/**
- * dccp_feat_push_confirm - Add a Confirm entry to the FN list
- * @fn_list: feature-negotiation list to add to
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is being confirmed
- * @fval: pointer to NN/SP value to be inserted or NULL
- *
- * Returns 0 on success, a Reset code for further processing otherwise.
- */
-static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
- dccp_feat_val *fval)
-{
- struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
-
- if (new == NULL)
- return DCCP_RESET_CODE_TOO_BUSY;
-
- new->feat_num = feat;
- new->is_local = local;
- new->state = FEAT_STABLE; /* transition in 6.6.2 */
- new->needs_confirm = true;
- new->empty_confirm = (fval == NULL);
- new->val.nn = 0; /* zeroes the whole structure */
- if (!new->empty_confirm)
- new->val = *fval;
- new->needs_mandatory = false;
-
- return 0;
-}
-
-static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
-{
- return dccp_feat_push_confirm(fn_list, feat, local, NULL);
-}
-
-static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
-{
- list_del(&entry->node);
- dccp_feat_entry_destructor(entry);
-}
-
-void dccp_feat_list_purge(struct list_head *fn_list)
-{
- struct dccp_feat_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, fn_list, node)
- dccp_feat_entry_destructor(entry);
- INIT_LIST_HEAD(fn_list);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
-
-/* generate @to as full clone of @from - @to must not contain any nodes */
-int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
-{
- struct dccp_feat_entry *entry, *new;
-
- INIT_LIST_HEAD(to);
- list_for_each_entry(entry, from, node) {
- new = dccp_feat_clone_entry(entry);
- if (new == NULL)
- goto cloning_failed;
- list_add_tail(&new->node, to);
- }
- return 0;
-
-cloning_failed:
- dccp_feat_list_purge(to);
- return -ENOMEM;
-}
-
-/**
- * dccp_feat_valid_nn_length - Enforce length constraints on NN options
- * @feat_num: feature to return length of, one of %dccp_feature_numbers
- *
- * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
- * incoming options are accepted as long as their values are valid.
- */
-static u8 dccp_feat_valid_nn_length(u8 feat_num)
-{
- if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
- return 2;
- if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
- return 6;
- return 0;
-}
-
-static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
-{
- switch (feat_num) {
- case DCCPF_ACK_RATIO:
- return val <= DCCPF_ACK_RATIO_MAX;
- case DCCPF_SEQUENCE_WINDOW:
- return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
- }
- return 0; /* feature unknown - so we can't tell */
-}
-
-/* check that SP values are within the ranges defined in RFC 4340 */
-static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
-{
- switch (feat_num) {
- case DCCPF_CCID:
- return val == DCCPC_CCID2 || val == DCCPC_CCID3;
- /* Type-check Boolean feature values: */
- case DCCPF_SHORT_SEQNOS:
- case DCCPF_ECN_INCAPABLE:
- case DCCPF_SEND_ACK_VECTOR:
- case DCCPF_SEND_NDP_COUNT:
- case DCCPF_DATA_CHECKSUM:
- case DCCPF_SEND_LEV_RATE:
- return val < 2;
- case DCCPF_MIN_CSUM_COVER:
- return val < 16;
- }
- return 0; /* feature unknown */
-}
-
-static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
-{
- if (sp_list == NULL || sp_len < 1)
- return 0;
- while (sp_len--)
- if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
- return 0;
- return 1;
-}
-
-/**
- * dccp_feat_insert_opts - Generate FN options from current list state
- * @skb: next sk_buff to be sent to the peer
- * @dp: for client during handshake and general negotiation
- * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
- */
-int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
- struct dccp_feat_entry *pos, *next;
- u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
- bool rpt;
-
- /* put entries into @skb in the order they appear in the list */
- list_for_each_entry_safe_reverse(pos, next, fn, node) {
- opt = dccp_feat_genopt(pos);
- type = dccp_feat_type(pos->feat_num);
- rpt = false;
-
- if (pos->empty_confirm) {
- len = 0;
- ptr = NULL;
- } else {
- if (type == FEAT_SP) {
- len = pos->val.sp.len;
- ptr = pos->val.sp.vec;
- rpt = pos->needs_confirm;
- } else if (type == FEAT_NN) {
- len = dccp_feat_valid_nn_length(pos->feat_num);
- ptr = nn_in_nbo;
- dccp_encode_value_var(pos->val.nn, ptr, len);
- } else {
- DCCP_BUG("unknown feature %u", pos->feat_num);
- return -1;
- }
- }
- dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
-
- if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
- return -1;
- if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
- return -1;
-
- if (skb->sk->sk_state == DCCP_OPEN &&
- (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
- /*
- * Confirms don't get retransmitted (6.6.3) once the
- * connection is in state OPEN
- */
- dccp_feat_list_pop(pos);
- } else {
- /*
- * Enter CHANGING after transmitting the Change
- * option (6.6.2).
- */
- if (pos->state == FEAT_INITIALISING)
- pos->state = FEAT_CHANGING;
- }
- }
- return 0;
-}
-
-/**
- * __feat_register_nn - Register new NN value on socket
- * @fn: feature-negotiation list to register with
- * @feat: an NN feature from %dccp_feature_numbers
- * @mandatory: use Mandatory option if 1
- * @nn_val: value to register (restricted to 4 bytes)
- *
- * Note that NN features are local by definition (RFC 4340, 6.3.2).
- */
-static int __feat_register_nn(struct list_head *fn, u8 feat,
- u8 mandatory, u64 nn_val)
-{
- dccp_feat_val fval = { .nn = nn_val };
-
- if (dccp_feat_type(feat) != FEAT_NN ||
- !dccp_feat_is_valid_nn_val(feat, nn_val))
- return -EINVAL;
-
- /* Don't bother with default values, they will be activated anyway. */
- if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
- return 0;
-
- return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
-}
-
-/**
- * __feat_register_sp - Register new SP value/list on socket
- * @fn: feature-negotiation list to register with
- * @feat: an SP feature from %dccp_feature_numbers
- * @is_local: whether the local (1) or the remote (0) @feat is meant
- * @mandatory: use Mandatory option if 1
- * @sp_val: SP value followed by optional preference list
- * @sp_len: length of @sp_val in bytes
- */
-static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
- u8 mandatory, u8 const *sp_val, u8 sp_len)
-{
- dccp_feat_val fval;
-
- if (dccp_feat_type(feat) != FEAT_SP ||
- !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
- return -EINVAL;
-
- /* Avoid negotiating alien CCIDs by only advertising supported ones */
- if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
- return -EOPNOTSUPP;
-
- if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
- return -ENOMEM;
-
- if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) {
- kfree(fval.sp.vec);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/**
- * dccp_feat_register_sp - Register requests to change SP feature values
- * @sk: client or listening socket
- * @feat: one of %dccp_feature_numbers
- * @is_local: whether the local (1) or remote (0) @feat is meant
- * @list: array of preferred values, in descending order of preference
- * @len: length of @list in bytes
- */
-int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
- u8 const *list, u8 len)
-{ /* any changes must be registered before establishing the connection */
- if (sk->sk_state != DCCP_CLOSED)
- return -EISCONN;
- if (dccp_feat_type(feat) != FEAT_SP)
- return -EINVAL;
- return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
- 0, list, len);
-}
-
-/**
- * dccp_feat_nn_get - Query current/pending value of NN feature
- * @sk: DCCP socket of an established connection
- * @feat: NN feature number from %dccp_feature_numbers
- *
- * For a known NN feature, returns value currently being negotiated, or
- * current (confirmed) value if no negotiation is going on.
- */
-u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
-{
- if (dccp_feat_type(feat) == FEAT_NN) {
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_feat_entry *entry;
-
- entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
- if (entry != NULL)
- return entry->val.nn;
-
- switch (feat) {
- case DCCPF_ACK_RATIO:
- return dp->dccps_l_ack_ratio;
- case DCCPF_SEQUENCE_WINDOW:
- return dp->dccps_l_seq_win;
- }
- }
- DCCP_BUG("attempt to look up unsupported feature %u", feat);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
-
-/**
- * dccp_feat_signal_nn_change - Update NN values for an established connection
- * @sk: DCCP socket of an established connection
- * @feat: NN feature number from %dccp_feature_numbers
- * @nn_val: the new value to use
- *
- * This function is used to communicate NN updates out-of-band.
- */
-int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- dccp_feat_val fval = { .nn = nn_val };
- struct dccp_feat_entry *entry;
-
- if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
- return 0;
-
- if (dccp_feat_type(feat) != FEAT_NN ||
- !dccp_feat_is_valid_nn_val(feat, nn_val))
- return -EINVAL;
-
- if (nn_val == dccp_feat_nn_get(sk, feat))
- return 0; /* already set or negotiation under way */
-
- entry = dccp_feat_list_lookup(fn, feat, 1);
- if (entry != NULL) {
- dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
- (unsigned long long)entry->val.nn,
- (unsigned long long)nn_val);
- dccp_feat_list_pop(entry);
- }
-
- inet_csk_schedule_ack(sk);
- return dccp_feat_push_change(fn, feat, 1, 0, &fval);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
-
-/*
- * Tracking features whose value depend on the choice of CCID
- *
- * This is designed with an extension in mind so that a list walk could be done
- * before activating any features. However, the existing framework was found to
- * work satisfactorily up until now, the automatic verification is left open.
- * When adding new CCIDs, add a corresponding dependency table here.
- */
-static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
-{
- static const struct ccid_dependency ccid2_dependencies[2][2] = {
- /*
- * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
- * feature and Send Ack Vector is an RX feature, `is_local'
- * needs to be reversed.
- */
- { /* Dependencies of the receiver-side (remote) CCID2 */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = true,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 }
- },
- { /* Dependencies of the sender-side (local) CCID2 */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 }
- }
- };
- static const struct ccid_dependency ccid3_dependencies[2][5] = {
- { /*
- * Dependencies of the receiver-side CCID3
- */
- { /* locally disable Ack Vectors */
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = true,
- .is_mandatory = false,
- .val = 0
- },
- { /* see below why Send Loss Event Rate is on */
- .dependent_feat = DCCPF_SEND_LEV_RATE,
- .is_local = true,
- .is_mandatory = true,
- .val = 1
- },
- { /* NDP Count is needed as per RFC 4342, 6.1.1 */
- .dependent_feat = DCCPF_SEND_NDP_COUNT,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 },
- },
- { /*
- * CCID3 at the TX side: we request that the HC-receiver
- * will not send Ack Vectors (they will be ignored, so
- * Mandatory is not set); we enable Send Loss Event Rate
- * (Mandatory since the implementation does not support
- * the Loss Intervals option of RFC 4342, 8.6).
- * The last two options are for peer's information only.
- */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = false,
- .is_mandatory = false,
- .val = 0
- },
- {
- .dependent_feat = DCCPF_SEND_LEV_RATE,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { /* this CCID does not support Ack Ratio */
- .dependent_feat = DCCPF_ACK_RATIO,
- .is_local = true,
- .is_mandatory = false,
- .val = 0
- },
- { /* tell receiver we are sending NDP counts */
- .dependent_feat = DCCPF_SEND_NDP_COUNT,
- .is_local = true,
- .is_mandatory = false,
- .val = 1
- },
- { 0, 0, 0, 0 }
- }
- };
- switch (ccid) {
- case DCCPC_CCID2:
- return ccid2_dependencies[is_local];
- case DCCPC_CCID3:
- return ccid3_dependencies[is_local];
- default:
- return NULL;
- }
-}
-
-/**
- * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
- * @fn: feature-negotiation list to update
- * @id: CCID number to track
- * @is_local: whether TX CCID (1) or RX CCID (0) is meant
- *
- * This function needs to be called after registering all other features.
- */
-static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
-{
- const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
- int i, rc = (table == NULL);
-
- for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
- if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
- rc = __feat_register_sp(fn, table[i].dependent_feat,
- table[i].is_local,
- table[i].is_mandatory,
- &table[i].val, 1);
- else
- rc = __feat_register_nn(fn, table[i].dependent_feat,
- table[i].is_mandatory,
- table[i].val);
- return rc;
-}
-
-/**
- * dccp_feat_finalise_settings - Finalise settings before starting negotiation
- * @dp: client or listening socket (settings will be inherited)
- *
- * This is called after all registrations (socket initialisation, sysctls, and
- * sockopt calls), and before sending the first packet containing Change options
- * (ie. client-Request or server-Response), to ensure internal consistency.
- */
-int dccp_feat_finalise_settings(struct dccp_sock *dp)
-{
- struct list_head *fn = &dp->dccps_featneg;
- struct dccp_feat_entry *entry;
- int i = 2, ccids[2] = { -1, -1 };
-
- /*
- * Propagating CCIDs:
- * 1) not useful to propagate CCID settings if this host advertises more
- * than one CCID: the choice of CCID may still change - if this is
- * the client, or if this is the server and the client sends
- * singleton CCID values.
- * 2) since is that propagate_ccid changes the list, we defer changing
- * the sorted list until after the traversal.
- */
- list_for_each_entry(entry, fn, node)
- if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
- ccids[entry->is_local] = entry->val.sp.vec[0];
- while (i--)
- if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
- return -1;
- dccp_feat_print_fnlist(fn);
- return 0;
-}
-
-/**
- * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
- * @dreq: server socket to resolve
- *
- * It is the server which resolves the dependencies once the CCID has been
- * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
- */
-int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
-{
- struct list_head *fn = &dreq->dreq_featneg;
- struct dccp_feat_entry *entry;
- u8 is_local, ccid;
-
- for (is_local = 0; is_local <= 1; is_local++) {
- entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
-
- if (entry != NULL && !entry->empty_confirm)
- ccid = entry->val.sp.vec[0];
- else
- ccid = dccp_feat_default_value(DCCPF_CCID);
-
- if (dccp_feat_propagate_ccid(fn, ccid, is_local))
- return -1;
- }
- return 0;
-}
-
-/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
-static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
-{
- u8 c, s;
-
- for (s = 0; s < slen; s++)
- for (c = 0; c < clen; c++)
- if (servlist[s] == clilist[c])
- return servlist[s];
- return -1;
-}
-
-/**
- * dccp_feat_prefer - Move preferred entry to the start of array
- * @preferred_value: entry to move to start of array
- * @array: array of preferred entries
- * @array_len: size of the array
- *
- * Reorder the @array_len elements in @array so that @preferred_value comes
- * first. Returns >0 to indicate that @preferred_value does occur in @array.
- */
-static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
-{
- u8 i, does_occur = 0;
-
- if (array != NULL) {
- for (i = 0; i < array_len; i++)
- if (array[i] == preferred_value) {
- array[i] = array[0];
- does_occur++;
- }
- if (does_occur)
- array[0] = preferred_value;
- }
- return does_occur;
-}
-
-/**
- * dccp_feat_reconcile - Reconcile SP preference lists
- * @fv: SP list to reconcile into
- * @arr: received SP preference list
- * @len: length of @arr in bytes
- * @is_server: whether this side is the server (and @fv is the server's list)
- * @reorder: whether to reorder the list in @fv after reconciling with @arr
- * When successful, > 0 is returned and the reconciled list is in @fval.
- * A value of 0 means that negotiation failed (no shared entry).
- */
-static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
- bool is_server, bool reorder)
-{
- int rc;
-
- if (!fv->sp.vec || !arr) {
- DCCP_CRIT("NULL feature value or array");
- return 0;
- }
-
- if (is_server)
- rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
- else
- rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
-
- if (!reorder)
- return rc;
- if (rc < 0)
- return 0;
-
- /*
- * Reorder list: used for activating features and in dccp_insert_fn_opt.
- */
- return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
-}
-
-/**
- * dccp_feat_change_recv - Process incoming ChangeL/R options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether the Change was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is the server (1) or the client (0)
- */
-static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
- u8 feat, u8 *val, u8 len, const bool server)
-{
- u8 defval, type = dccp_feat_type(feat);
- const bool local = (opt == DCCPO_CHANGE_R);
- struct dccp_feat_entry *entry;
- dccp_feat_val fval;
-
- if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
- goto unknown_feature_or_value;
-
- dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
- /*
- * Negotiation of NN features: Change R is invalid, so there is no
- * simultaneous negotiation; hence we do not look up in the list.
- */
- if (type == FEAT_NN) {
- if (local || len > sizeof(fval.nn))
- goto unknown_feature_or_value;
-
- /* 6.3.2: "The feature remote MUST accept any valid value..." */
- fval.nn = dccp_decode_value_var(val, len);
- if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
- goto unknown_feature_or_value;
-
- return dccp_feat_push_confirm(fn, feat, local, &fval);
- }
-
- /*
- * Unidirectional/simultaneous negotiation of SP features (6.3.1)
- */
- entry = dccp_feat_list_lookup(fn, feat, local);
- if (entry == NULL) {
- /*
- * No particular preferences have been registered. We deal with
- * this situation by assuming that all valid values are equally
- * acceptable, and apply the following checks:
- * - if the peer's list is a singleton, we accept a valid value;
- * - if we are the server, we first try to see if the peer (the
- * client) advertises the default value. If yes, we use it,
- * otherwise we accept the preferred value;
- * - else if we are the client, we use the first list element.
- */
- if (dccp_feat_clone_sp_val(&fval, val, 1))
- return DCCP_RESET_CODE_TOO_BUSY;
-
- if (len > 1 && server) {
- defval = dccp_feat_default_value(feat);
- if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
- fval.sp.vec[0] = defval;
- } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
- kfree(fval.sp.vec);
- goto unknown_feature_or_value;
- }
-
- /* Treat unsupported CCIDs like invalid values */
- if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
- kfree(fval.sp.vec);
- goto not_valid_or_not_known;
- }
-
- if (dccp_feat_push_confirm(fn, feat, local, &fval)) {
- kfree(fval.sp.vec);
- return DCCP_RESET_CODE_TOO_BUSY;
- }
-
- return 0;
- } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
- return 0;
- }
-
- if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
- entry->empty_confirm = false;
- } else if (is_mandatory) {
- return DCCP_RESET_CODE_MANDATORY_ERROR;
- } else if (entry->state == FEAT_INITIALISING) {
- /*
- * Failed simultaneous negotiation (server only): try to `save'
- * the connection by checking whether entry contains the default
- * value for @feat. If yes, send an empty Confirm to signal that
- * the received Change was not understood - which implies using
- * the default value.
- * If this also fails, we use Reset as the last resort.
- */
- WARN_ON(!server);
- defval = dccp_feat_default_value(feat);
- if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
- return DCCP_RESET_CODE_OPTION_ERROR;
- entry->empty_confirm = true;
- }
- entry->needs_confirm = true;
- entry->needs_mandatory = false;
- entry->state = FEAT_STABLE;
- return 0;
-
-unknown_feature_or_value:
- if (!is_mandatory)
- return dccp_push_empty_confirm(fn, feat, local);
-
-not_valid_or_not_known:
- return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_confirm_recv - Process received Confirm options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is server (1) or client (0)
- */
-static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
- u8 feat, u8 *val, u8 len, const bool server)
-{
- u8 *plist, plen, type = dccp_feat_type(feat);
- const bool local = (opt == DCCPO_CONFIRM_R);
- struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
-
- dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
- if (entry == NULL) { /* nothing queued: ignore or handle error */
- if (is_mandatory && type == FEAT_UNKNOWN)
- return DCCP_RESET_CODE_MANDATORY_ERROR;
-
- if (!local && type == FEAT_NN) /* 6.3.2 */
- goto confirmation_failed;
- return 0;
- }
-
- if (entry->state != FEAT_CHANGING) /* 6.6.2 */
- return 0;
-
- if (len == 0) {
- if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
- goto confirmation_failed;
- /*
- * Empty Confirm during connection setup: this means reverting
- * to the `old' value, which in this case is the default. Since
- * we handle default values automatically when no other values
- * have been set, we revert to the old value by removing this
- * entry from the list.
- */
- dccp_feat_list_pop(entry);
- return 0;
- }
-
- if (type == FEAT_NN) {
- if (len > sizeof(entry->val.nn))
- goto confirmation_failed;
-
- if (entry->val.nn == dccp_decode_value_var(val, len))
- goto confirmation_succeeded;
-
- DCCP_WARN("Bogus Confirm for non-existing value\n");
- goto confirmation_failed;
- }
-
- /*
- * Parsing SP Confirms: the first element of @val is the preferred
- * SP value which the peer confirms, the remainder depends on @len.
- * Note that only the confirmed value need to be a valid SP value.
- */
- if (!dccp_feat_is_valid_sp_val(feat, *val))
- goto confirmation_failed;
-
- if (len == 1) { /* peer didn't supply a preference list */
- plist = val;
- plen = len;
- } else { /* preferred value + preference list */
- plist = val + 1;
- plen = len - 1;
- }
-
- /* Check whether the peer got the reconciliation right (6.6.8) */
- if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
- DCCP_WARN("Confirm selected the wrong value %u\n", *val);
- return DCCP_RESET_CODE_OPTION_ERROR;
- }
- entry->val.sp.vec[0] = *val;
-
-confirmation_succeeded:
- entry->state = FEAT_STABLE;
- return 0;
-
-confirmation_failed:
- DCCP_WARN("Confirmation failed\n");
- return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_handle_nn_established - Fast-path reception of NN options
- * @sk: socket of an established DCCP connection
- * @mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
- * @feat: NN number, one of %dccp_feature_numbers
- * @val: NN value
- * @len: length of @val in bytes
- *
- * This function combines the functionality of change_recv/confirm_recv, with
- * the following differences (reset codes are the same):
- * - cleanup after receiving the Confirm;
- * - values are directly activated after successful parsing;
- * - deliberately restricted to NN features.
- * The restriction to NN features is essential since SP features can have non-
- * predictable outcomes (depending on the remote configuration), and are inter-
- * dependent (CCIDs for instance cause further dependencies).
- */
-static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
- u8 feat, u8 *val, u8 len)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- const bool local = (opt == DCCPO_CONFIRM_R);
- struct dccp_feat_entry *entry;
- u8 type = dccp_feat_type(feat);
- dccp_feat_val fval;
-
- dccp_feat_print_opt(opt, feat, val, len, mandatory);
-
- /* Ignore non-mandatory unknown and non-NN features */
- if (type == FEAT_UNKNOWN) {
- if (local && !mandatory)
- return 0;
- goto fast_path_unknown;
- } else if (type != FEAT_NN) {
- return 0;
- }
-
- /*
- * We don't accept empty Confirms, since in fast-path feature
- * negotiation the values are enabled immediately after sending
- * the Change option.
- * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
- */
- if (len == 0 || len > sizeof(fval.nn))
- goto fast_path_unknown;
-
- if (opt == DCCPO_CHANGE_L) {
- fval.nn = dccp_decode_value_var(val, len);
- if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
- goto fast_path_unknown;
-
- if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
- dccp_feat_activate(sk, feat, local, &fval))
- return DCCP_RESET_CODE_TOO_BUSY;
-
- /* set the `Ack Pending' flag to piggyback a Confirm */
- inet_csk_schedule_ack(sk);
-
- } else if (opt == DCCPO_CONFIRM_R) {
- entry = dccp_feat_list_lookup(fn, feat, local);
- if (entry == NULL || entry->state != FEAT_CHANGING)
- return 0;
-
- fval.nn = dccp_decode_value_var(val, len);
- /*
- * Just ignore a value that doesn't match our current value.
- * If the option changes twice within two RTTs, then at least
- * one CONFIRM will be received for the old value after a
- * new CHANGE was sent.
- */
- if (fval.nn != entry->val.nn)
- return 0;
-
- /* Only activate after receiving the Confirm option (6.6.1). */
- dccp_feat_activate(sk, feat, local, &fval);
-
- /* It has been confirmed - so remove the entry */
- dccp_feat_list_pop(entry);
-
- } else {
- DCCP_WARN("Received illegal option %u\n", opt);
- goto fast_path_failed;
- }
- return 0;
-
-fast_path_unknown:
- if (!mandatory)
- return dccp_push_empty_confirm(fn, feat, local);
-
-fast_path_failed:
- return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_parse_options - Process Feature-Negotiation Options
- * @sk: for general use and used by the client during connection setup
- * @dreq: used by the server during connection setup
- * @mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: value contents of @opt
- * @len: length of @val in bytes
- *
- * Returns 0 on success, a Reset code for ending the connection otherwise.
- */
-int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
- u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
- bool server = false;
-
- switch (sk->sk_state) {
- /*
- * Negotiation during connection setup
- */
- case DCCP_LISTEN:
- server = true;
- fallthrough;
- case DCCP_REQUESTING:
- switch (opt) {
- case DCCPO_CHANGE_L:
- case DCCPO_CHANGE_R:
- return dccp_feat_change_recv(fn, mandatory, opt, feat,
- val, len, server);
- case DCCPO_CONFIRM_R:
- case DCCPO_CONFIRM_L:
- return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
- val, len, server);
- }
- break;
- /*
- * Support for exchanging NN options on an established connection.
- */
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
- val, len);
- }
- return 0; /* ignore FN options in all other states */
-}
-
-/**
- * dccp_feat_init - Seed feature negotiation with host-specific defaults
- * @sk: Socket to initialize.
- *
- * This initialises global defaults, depending on the value of the sysctls.
- * These can later be overridden by registering changes via setsockopt calls.
- * The last link in the chain is finalise_settings, to make sure that between
- * here and the start of actual feature negotiation no inconsistencies enter.
- *
- * All features not appearing below use either defaults or are otherwise
- * later adjusted through dccp_feat_finalise_settings().
- */
-int dccp_feat_init(struct sock *sk)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- u8 on = 1, off = 0;
- int rc;
- struct {
- u8 *val;
- u8 len;
- } tx, rx;
-
- /* Non-negotiable (NN) features */
- rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
- sysctl_dccp_sequence_window);
- if (rc)
- return rc;
-
- /* Server-priority (SP) features */
-
- /* Advertise that short seqnos are not supported (7.6.1) */
- rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
- if (rc)
- return rc;
-
- /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
- rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
- if (rc)
- return rc;
-
- /*
- * We advertise the available list of CCIDs and reorder according to
- * preferences, to avoid failure resulting from negotiating different
- * singleton values (which always leads to failure).
- * These settings can still (later) be overridden via sockopts.
- */
- if (ccid_get_builtin_ccids(&tx.val, &tx.len))
- return -ENOBUFS;
- if (ccid_get_builtin_ccids(&rx.val, &rx.len)) {
- kfree(tx.val);
- return -ENOBUFS;
- }
-
- if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
- !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
- goto free_ccid_lists;
-
- rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
- if (rc)
- goto free_ccid_lists;
-
- rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
-
-free_ccid_lists:
- kfree(tx.val);
- kfree(rx.val);
- return rc;
-}
-
-int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_feat_entry *cur, *next;
- int idx;
- dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
- [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
- };
-
- list_for_each_entry(cur, fn_list, node) {
- /*
- * An empty Confirm means that either an unknown feature type
- * or an invalid value was present. In the first case there is
- * nothing to activate, in the other the default value is used.
- */
- if (cur->empty_confirm)
- continue;
-
- idx = dccp_feat_index(cur->feat_num);
- if (idx < 0) {
- DCCP_BUG("Unknown feature %u", cur->feat_num);
- goto activation_failed;
- }
- if (cur->state != FEAT_STABLE) {
- DCCP_CRIT("Negotiation of %s %s failed in state %s",
- cur->is_local ? "local" : "remote",
- dccp_feat_fname(cur->feat_num),
- dccp_feat_sname[cur->state]);
- goto activation_failed;
- }
- fvals[idx][cur->is_local] = &cur->val;
- }
-
- /*
- * Activate in decreasing order of index, so that the CCIDs are always
- * activated as the last feature. This avoids the case where a CCID
- * relies on the initialisation of one or more features that it depends
- * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
- */
- for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
- if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
- __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
- DCCP_CRIT("Could not activate %d", idx);
- goto activation_failed;
- }
-
- /* Clean up Change options which have been confirmed already */
- list_for_each_entry_safe(cur, next, fn_list, node)
- if (!cur->needs_confirm)
- dccp_feat_list_pop(cur);
-
- dccp_pr_debug("Activation OK\n");
- return 0;
-
-activation_failed:
- /*
- * We clean up everything that may have been allocated, since
- * it is difficult to track at which stage negotiation failed.
- * This is ok, since all allocation functions below are robust
- * against NULL arguments.
- */
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- return -1;
-}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
deleted file mode 100644
index 57d9c026aa3f..000000000000
--- a/net/dccp/feat.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _DCCP_FEAT_H
-#define _DCCP_FEAT_H
-/*
- * net/dccp/feat.h
- *
- * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- */
-#include <linux/types.h>
-#include "dccp.h"
-
-/*
- * Known limit values
- */
-/* Ack Ratio takes 2-byte integer values (11.3) */
-#define DCCPF_ACK_RATIO_MAX 0xFFFF
-/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
-#define DCCPF_SEQ_WMIN 32
-#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
-/* Maximum number of SP values that fit in a single (Confirm) option */
-#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
-
-enum dccp_feat_type {
- FEAT_AT_RX = 1, /* located at RX side of half-connection */
- FEAT_AT_TX = 2, /* located at TX side of half-connection */
- FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
- FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
- FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
-};
-
-enum dccp_feat_state {
- FEAT_DEFAULT = 0, /* using default values from 6.4 */
- FEAT_INITIALISING, /* feature is being initialised */
- FEAT_CHANGING, /* Change sent but not confirmed yet */
- FEAT_UNSTABLE, /* local modification in state CHANGING */
- FEAT_STABLE /* both ends (think they) agree */
-};
-
-/**
- * dccp_feat_val - Container for SP or NN feature values
- * @nn: single NN value
- * @sp.vec: single SP value plus optional preference list
- * @sp.len: length of @sp.vec in bytes
- */
-typedef union {
- u64 nn;
- struct {
- u8 *vec;
- u8 len;
- } sp;
-} dccp_feat_val;
-
-/**
- * struct feat_entry - Data structure to perform feature negotiation
- * @val: feature's current value (SP features may have preference list)
- * @state: feature's current state
- * @feat_num: one of %dccp_feature_numbers
- * @needs_mandatory: whether Mandatory options should be sent
- * @needs_confirm: whether to send a Confirm instead of a Change
- * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
- * @is_local: feature location (1) or feature-remote (0)
- * @node: list pointers, entries arranged in FIFO order
- */
-struct dccp_feat_entry {
- dccp_feat_val val;
- enum dccp_feat_state state:8;
- u8 feat_num;
-
- bool needs_mandatory,
- needs_confirm,
- empty_confirm,
- is_local;
-
- struct list_head node;
-};
-
-static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
-{
- if (entry->needs_confirm)
- return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
- return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
-}
-
-/**
- * struct ccid_dependency - Track changes resulting from choosing a CCID
- * @dependent_feat: one of %dccp_feature_numbers
- * @is_local: local (1) or remote (0) @dependent_feat
- * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
- * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
- */
-struct ccid_dependency {
- u8 dependent_feat;
- bool is_local:1,
- is_mandatory:1;
- u8 val;
-};
-
-/*
- * Sysctls to seed defaults for feature negotiation
- */
-extern unsigned long sysctl_dccp_sequence_window;
-extern int sysctl_dccp_rx_ccid;
-extern int sysctl_dccp_tx_ccid;
-
-int dccp_feat_init(struct sock *sk);
-int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
- u8 const *list, u8 len);
-int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
- u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-int dccp_feat_clone_list(struct list_head const *, struct list_head *);
-
-/*
- * Encoding variable-length options and their maximum length.
- *
- * This affects NN options (SP options are all u8) and other variable-length
- * options (see table 3 in RFC 4340). The limit is currently given the Sequence
- * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
- * options consume less than 6 bytes (timestamps are 4 bytes).
- * When updating this constant (e.g. due to new internet drafts / RFCs), make
- * sure that you also update all code which refers to it.
- */
-#define DCCP_OPTVAL_MAXLEN 6
-
-void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
-u64 dccp_decode_value_var(const u8 *bf, const u8 len);
-u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
-
-int dccp_insert_option_mandatory(struct sk_buff *skb);
-int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
- bool repeat_first);
-#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
deleted file mode 100644
index 2cbb757a894f..000000000000
--- a/net/dccp/input.c
+++ /dev/null
@@ -1,739 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/input.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
-int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
-
-static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
-{
- __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk);
-}
-
-static void dccp_fin(struct sock *sk, struct sk_buff *skb)
-{
- /*
- * On receiving Close/CloseReq, both RD/WR shutdown are performed.
- * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
- * receiving the closing segment, but there is no guarantee that such
- * data will be processed at all.
- */
- sk->sk_shutdown = SHUTDOWN_MASK;
- sock_set_flag(sk, SOCK_DONE);
- dccp_enqueue_skb(sk, skb);
-}
-
-static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
-
- switch (sk->sk_state) {
- /*
- * We ignore Close when received in one of the following states:
- * - CLOSED (may be a late or duplicate packet)
- * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
- * - RESPOND (already handled by dccp_check_req)
- */
- case DCCP_CLOSING:
- /*
- * Simultaneous-close: receiving a Close after sending one. This
- * can happen if both client and server perform active-close and
- * will result in an endless ping-pong of crossing and retrans-
- * mitted Close packets, which only terminates when one of the
- * nodes times out (min. 64 seconds). Quicker convergence can be
- * achieved when one of the nodes acts as tie-breaker.
- * This is ok as both ends are done with data transfer and each
- * end is just waiting for the other to acknowledge termination.
- */
- if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
- break;
- fallthrough;
- case DCCP_REQUESTING:
- case DCCP_ACTIVE_CLOSEREQ:
- dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_done(sk);
- break;
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- /* Give waiting application a chance to read pending data */
- queued = 1;
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
- fallthrough;
- case DCCP_PASSIVE_CLOSE:
- /*
- * Retransmitted Close: we have already enqueued the first one.
- */
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- }
- return queued;
-}
-
-static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
-
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == CloseReq)
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
- return queued;
- }
-
- /* Step 13: process relevant Client states < CLOSEREQ */
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- dccp_send_close(sk, 0);
- dccp_set_state(sk, DCCP_CLOSING);
- break;
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- /* Give waiting application a chance to read pending data */
- queued = 1;
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
- fallthrough;
- case DCCP_PASSIVE_CLOSEREQ:
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- }
- return queued;
-}
-
-static u16 dccp_reset_code_convert(const u8 code)
-{
- static const u16 error_code[] = {
- [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
- [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
- [DCCP_RESET_CODE_ABORTED] = ECONNRESET,
-
- [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED,
- [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
- [DCCP_RESET_CODE_TOO_BUSY] = EUSERS,
- [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
-
- [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG,
- [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR,
- [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC,
- [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ,
- [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP,
- };
-
- return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
-}
-
-static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
-{
- u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
-
- sk->sk_err = err;
-
- /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
- dccp_fin(sk, skb);
-
- if (err && !sock_flag(sk, SOCK_DEAD))
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
-}
-
-static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
-
- if (av == NULL)
- return;
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
- dccp_ackvec_input(av, skb);
-}
-
-static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- /* Don't deliver to RX CCID when node has shut down read end. */
- if (!(sk->sk_shutdown & RCV_SHUTDOWN))
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- /*
- * Until the TX queue has been drained, we can not honour SHUT_WR, since
- * we need received feedback as input to adjust congestion control.
- */
- if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-}
-
-static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- struct dccp_sock *dp = dccp_sk(sk);
- u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
- ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-
- /*
- * Step 5: Prepare sequence numbers for Sync
- * If P.type == Sync or P.type == SyncAck,
- * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
- * / * P is valid, so update sequence number variables
- * accordingly. After this update, P will pass the tests
- * in Step 6. A SyncAck is generated if necessary in
- * Step 15 * /
- * Update S.GSR, S.SWL, S.SWH
- * Otherwise,
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_SYNC ||
- dh->dccph_type == DCCP_PKT_SYNCACK) {
- if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
- dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
- dccp_update_gsr(sk, seqno);
- else
- return -1;
- }
-
- /*
- * Step 6: Check sequence numbers
- * Let LSWL = S.SWL and LAWL = S.AWL
- * If P.type == CloseReq or P.type == Close or P.type == Reset,
- * LSWL := S.GSR + 1, LAWL := S.GAR
- * If LSWL <= P.seqno <= S.SWH
- * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
- * Update S.GSR, S.SWL, S.SWH
- * If P.type != Sync,
- * Update S.GAR
- */
- lswl = dp->dccps_swl;
- lawl = dp->dccps_awl;
-
- if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
- dh->dccph_type == DCCP_PKT_CLOSE ||
- dh->dccph_type == DCCP_PKT_RESET) {
- lswl = ADD48(dp->dccps_gsr, 1);
- lawl = dp->dccps_gar;
- }
-
- if (between48(seqno, lswl, dp->dccps_swh) &&
- (ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
- between48(ackno, lawl, dp->dccps_awh))) {
- dccp_update_gsr(sk, seqno);
-
- if (dh->dccph_type != DCCP_PKT_SYNC &&
- ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
- after48(ackno, dp->dccps_gar))
- dp->dccps_gar = ackno;
- } else {
- unsigned long now = jiffies;
- /*
- * Step 6: Check sequence numbers
- * Otherwise,
- * If P.type == Reset,
- * Send Sync packet acknowledging S.GSR
- * Otherwise,
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- *
- * These Syncs are rate-limited as per RFC 4340, 7.5.4:
- * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second.
- */
- if (time_before(now, (dp->dccps_rate_last +
- sysctl_dccp_sync_ratelimit)))
- return -1;
-
- DCCP_WARN("Step 6 failed for %s packet, "
- "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
- "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
- "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
- (unsigned long long) lswl, (unsigned long long) seqno,
- (unsigned long long) dp->dccps_swh,
- (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
- : "exists",
- (unsigned long long) lawl, (unsigned long long) ackno,
- (unsigned long long) dp->dccps_awh);
-
- dp->dccps_rate_last = now;
-
- if (dh->dccph_type == DCCP_PKT_RESET)
- seqno = dp->dccps_gsr;
- dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
- return -1;
- }
-
- return 0;
-}
-
-static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- switch (dccp_hdr(skb)->dccph_type) {
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_DATA:
- /*
- * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
- * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
- * - sk_receive_queue is full, use Code 2, "Receive Buffer"
- */
- dccp_enqueue_skb(sk, skb);
- return 0;
- case DCCP_PKT_ACK:
- goto discard;
- case DCCP_PKT_RESET:
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- dccp_rcv_reset(sk, skb);
- return 0;
- case DCCP_PKT_CLOSEREQ:
- if (dccp_rcv_closereq(sk, skb))
- return 0;
- goto discard;
- case DCCP_PKT_CLOSE:
- if (dccp_rcv_close(sk, skb))
- return 0;
- goto discard;
- case DCCP_PKT_REQUEST:
- /* Step 7
- * or (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state >= OPEN and P.type == Request
- * and P.seqno >= S.OSR)
- * or (S.state >= OPEN and P.type == Response
- * and P.seqno >= S.OSR)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dp->dccps_role != DCCP_ROLE_LISTEN)
- goto send_sync;
- goto check_seq;
- case DCCP_PKT_RESPONSE:
- if (dp->dccps_role != DCCP_ROLE_CLIENT)
- goto send_sync;
-check_seq:
- if (dccp_delta_seqno(dp->dccps_osr,
- DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
-send_sync:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNC);
- }
- break;
- case DCCP_PKT_SYNC:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNCACK);
- /*
- * From RFC 4340, sec. 5.7
- *
- * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
- * MAY have non-zero-length application data areas, whose
- * contents receivers MUST ignore.
- */
- goto discard;
- }
-
- DCCP_INC_STATS(DCCP_MIB_INERRS);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len)
-{
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- dccp_handle_ackvec_processing(sk, skb);
- dccp_deliver_input_to_ccids(sk, skb);
-
- return __dccp_rcv_established(sk, skb, dh, len);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_established);
-
-static int dccp_rcv_request_sent_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned int len)
-{
- /*
- * Step 4: Prepare sequence numbers in REQUEST
- * If S.state == REQUEST,
- * If (P.type == Response or P.type == Reset)
- * and S.AWL <= P.ackno <= S.AWH,
- * / * Set sequence number variables corresponding to the
- * other endpoint, so P will pass the tests in Step 6 * /
- * Set S.GSR, S.ISR, S.SWL, S.SWH
- * / * Response processing continues in Step 10; Reset
- * processing continues in Step 9 * /
- */
- if (dh->dccph_type == DCCP_PKT_RESPONSE) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- long tstamp = dccp_timestamp();
-
- if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dp->dccps_awl, dp->dccps_awh)) {
- dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu\n",
- (unsigned long long)dp->dccps_awl,
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long)dp->dccps_awh);
- goto out_invalid_packet;
- }
-
- /*
- * If option processing (Step 8) failed, return 1 here so that
- * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
- * the option type and is set in dccp_parse_options().
- */
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
- if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
- dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
- dp->dccps_options_received.dccpor_timestamp_echo));
-
- /* Stop the REQUEST timer */
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- WARN_ON(sk->sk_send_head == NULL);
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
-
- /*
- * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
- * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
- * is done as part of activating the feature values below, since
- * these settings depend on the local/remote Sequence Window
- * features, which were undefined or not confirmed until now.
- */
- dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
-
- dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-
- /*
- * Step 10: Process REQUEST state (second part)
- * If S.state == REQUEST,
- * / * If we get here, P is a valid Response from the
- * server (see Step 4), and we should move to
- * PARTOPEN state. PARTOPEN means send an Ack,
- * don't send Data packets, retransmit Acks
- * periodically, and always include any Init Cookie
- * from the Response * /
- * S.state := PARTOPEN
- * Set PARTOPEN timer
- * Continue with S.state == PARTOPEN
- * / * Step 12 will send the Ack completing the
- * three-way handshake * /
- */
- dccp_set_state(sk, DCCP_PARTOPEN);
-
- /*
- * If feature negotiation was successful, activate features now;
- * an activation failure means that this host could not activate
- * one ore more features (e.g. insufficient memory), which would
- * leave at least one feature in an undefined state.
- */
- if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
- goto unable_to_proceed;
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
-
- if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) ||
- icsk->icsk_accept_queue.rskq_defer_accept) {
- /* Save one ACK. Data will be ready after
- * several ticks, if write_pending is set.
- *
- * It may be deleted, but with this feature tcpdumps
- * look so _wonderfully_ clever, that I was not able
- * to stand against the temptation 8) --ANK
- */
- /*
- * OK, in DCCP we can as well do a similar trick, its
- * even in the draft, but there is no need for us to
- * schedule an ack here, as dccp_sendmsg does this for
- * us, also stated in the draft. -acme
- */
- __kfree_skb(skb);
- return 0;
- }
- dccp_send_ack(sk);
- return -1;
- }
-
-out_invalid_packet:
- /* dccp_v4_do_rcv will send a reset */
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
- return 1;
-
-unable_to_proceed:
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
- /*
- * We mark this socket as no longer usable, so that the loop in
- * dccp_sendmsg() terminates and the application gets notified.
- */
- dccp_set_state(sk, DCCP_CLOSED);
- sk->sk_err = ECOMM;
- return 1;
-}
-
-static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
- int queued = 0;
-
- switch (dh->dccph_type) {
- case DCCP_PKT_RESET:
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- break;
- case DCCP_PKT_DATA:
- if (sk->sk_state == DCCP_RESPOND)
- break;
- fallthrough;
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_ACK:
- /*
- * FIXME: we should be resetting the PARTOPEN (DELACK) timer
- * here but only if we haven't used the DELACK timer for
- * something else, like sending a delayed ack for a TIMESTAMP
- * echo, etc, for now were not clearing it, sending an extra
- * ACK when there is nothing else to do in DELACK is not a big
- * deal after all.
- */
-
- /* Stop the PARTOPEN timer */
- if (sk->sk_state == DCCP_PARTOPEN)
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-
- /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
- if (likely(sample)) {
- long delta = dccp_timestamp() - sample;
-
- dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
- }
-
- dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_set_state(sk, DCCP_OPEN);
-
- if (dh->dccph_type == DCCP_PKT_DATAACK ||
- dh->dccph_type == DCCP_PKT_DATA) {
- __dccp_rcv_established(sk, skb, dh, len);
- queued = 1; /* packet was queued
- (by __dccp_rcv_established) */
- }
- break;
- }
-
- return queued;
-}
-
-int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int old_state = sk->sk_state;
- bool acceptable;
- int queued = 0;
-
- /*
- * Step 3: Process LISTEN state
- *
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
- * Cookies Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_LISTEN) {
- if (dh->dccph_type == DCCP_PKT_REQUEST) {
- /* It is possible that we process SYN packets from backlog,
- * so we need to make sure to disable BH and RCU right there.
- */
- rcu_read_lock();
- local_bh_disable();
- acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
- local_bh_enable();
- rcu_read_unlock();
- if (!acceptable)
- return 1;
- consume_skb(skb);
- return 0;
- }
- if (dh->dccph_type == DCCP_PKT_RESET)
- goto discard;
-
- /* Caller (dccp_v4_do_rcv) will send Reset */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
- } else if (sk->sk_state == DCCP_CLOSED) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
- }
-
- /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
- if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_RESPONSE) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- }
-
- /* Step 8: Process options */
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_RESET) {
- dccp_rcv_reset(sk, skb);
- return 0;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
- if (dccp_rcv_closereq(sk, skb))
- return 0;
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
- if (dccp_rcv_close(sk, skb))
- return 0;
- goto discard;
- }
-
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
- if (queued >= 0)
- return queued;
-
- __kfree_skb(skb);
- return 0;
-
- case DCCP_PARTOPEN:
- /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
- dccp_handle_ackvec_processing(sk, skb);
- dccp_deliver_input_to_ccids(sk, skb);
- fallthrough;
- case DCCP_RESPOND:
- queued = dccp_rcv_respond_partopen_state_process(sk, skb,
- dh, len);
- break;
- }
-
- if (dh->dccph_type == DCCP_PKT_ACK ||
- dh->dccph_type == DCCP_PKT_DATAACK) {
- switch (old_state) {
- case DCCP_PARTOPEN:
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- break;
- }
- } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
- goto discard;
- }
-
- if (!queued) {
-discard:
- __kfree_skb(skb);
- }
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
-
-/**
- * dccp_sample_rtt - Validate and finalise computation of RTT sample
- * @sk: socket structure
- * @delta: number of microseconds between packet and acknowledgment
- *
- * The routine is kept generic to work in different contexts. It should be
- * called immediately when the ACK used for the RTT sample arrives.
- */
-u32 dccp_sample_rtt(struct sock *sk, long delta)
-{
- /* dccpor_elapsed_time is either zeroed out or set and > 0 */
- delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
-
- if (unlikely(delta <= 0)) {
- DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
- return DCCP_SANE_RTT_MIN;
- }
- if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
- DCCP_WARN("RTT sample %ld too large, using max\n", delta);
- return DCCP_SANE_RTT_MAX;
- }
-
- return delta;
-}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
deleted file mode 100644
index 2045ddac0fe9..000000000000
--- a/net/dccp/ipv4.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/ipv4.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/icmp.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/random.h>
-
-#include <net/icmp.h>
-#include <net/inet_common.h>
-#include <net/inet_dscp.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/protocol.h>
-#include <net/sock.h>
-#include <net/timewait_sock.h>
-#include <net/tcp_states.h>
-#include <net/xfrm.h>
-#include <net/secure_seq.h>
-#include <net/netns/generic.h>
-#include <net/rstreason.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct dccp_v4_pernet {
- struct sock *v4_ctl_sk;
-};
-
-static unsigned int dccp_v4_pernet_id __read_mostly;
-
-/*
- * The per-net v4_ctl_sk socket is used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
-{
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct inet_sock *inet = inet_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- __be16 orig_sport, orig_dport;
- __be32 daddr, nexthop;
- struct flowi4 *fl4;
- struct rtable *rt;
- int err;
- struct ip_options_rcu *inet_opt;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < sizeof(struct sockaddr_in))
- return -EINVAL;
-
- if (usin->sin_family != AF_INET)
- return -EAFNOSUPPORT;
-
- nexthop = daddr = usin->sin_addr.s_addr;
-
- inet_opt = rcu_dereference_protected(inet->inet_opt,
- lockdep_sock_is_held(sk));
- if (inet_opt != NULL && inet_opt->opt.srr) {
- if (daddr == 0)
- return -EINVAL;
- nexthop = inet_opt->opt.faddr;
- }
-
- orig_sport = inet->inet_sport;
- orig_dport = usin->sin_port;
- fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
- sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport,
- orig_dport, sk);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
-
- if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
- ip_rt_put(rt);
- return -ENETUNREACH;
- }
-
- if (inet_opt == NULL || !inet_opt->opt.srr)
- daddr = fl4->daddr;
-
- if (inet->inet_saddr == 0) {
- err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
- if (err) {
- ip_rt_put(rt);
- return err;
- }
- } else {
- sk_rcv_saddr_set(sk, inet->inet_saddr);
- }
-
- inet->inet_dport = usin->sin_port;
- sk_daddr_set(sk, daddr);
-
- inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet_opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- /*
- * Socket identity is still unknown (sport may be zero).
- * However we set state to DCCP_REQUESTING and not releasing socket
- * lock select source port, enter ourselves into the hash tables and
- * complete initialization after this.
- */
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet_hash_connect(&dccp_death_row, sk);
- if (err != 0)
- goto failure;
-
- rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
- inet->inet_sport, inet->inet_dport, sk);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
- goto failure;
- }
- /* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->dst);
-
- dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- inet->inet_dport);
- atomic_set(&inet->inet_id, get_random_u16());
-
- err = dccp_connect(sk);
- rt = NULL;
- if (err != 0)
- goto failure;
-out:
- return err;
-failure:
- /*
- * This unhashes the socket and releases the local port, if necessary.
- */
- dccp_set_state(sk, DCCP_CLOSED);
- inet_bhash2_reset_saddr(sk);
- ip_rt_put(rt);
- sk->sk_route_caps = 0;
- inet->inet_dport = 0;
- goto out;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_connect);
-
-/*
- * This routine does path mtu discovery as defined in RFC1191.
- */
-static inline void dccp_do_pmtu_discovery(struct sock *sk,
- const struct iphdr *iph,
- u32 mtu)
-{
- struct dst_entry *dst;
- const struct inet_sock *inet = inet_sk(sk);
- const struct dccp_sock *dp = dccp_sk(sk);
-
- /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
- * send out by Linux are always < 576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- dst = inet_csk_update_pmtu(sk, mtu);
- if (!dst)
- return;
-
- /* Something is about to be wrong... Remember soft error
- * for the case, if this connection will not able to recover.
- */
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
- WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
-
- mtu = dst_mtu(dst);
-
- if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- ip_sk_accept_pmtu(sk) &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- dccp_sync_mss(sk, mtu);
-
- /*
- * From RFC 4340, sec. 14.1:
- *
- * DCCP-Sync packets are the best choice for upward
- * probing, since DCCP-Sync probes do not risk application
- * data loss.
- */
- dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
- } /* else let the usual retransmit timer handle it */
-}
-
-static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
-{
- struct dst_entry *dst = __sk_dst_check(sk, 0);
-
- if (dst)
- dst->ops->redirect(dst, sk, skb);
-}
-
-void dccp_req_err(struct sock *sk, u64 seq)
- {
- struct request_sock *req = inet_reqsk(sk);
- struct net *net = sock_net(sk);
-
- /*
- * ICMPs are not backlogged, hence we cannot get an established
- * socket here.
- */
- if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- } else {
- /*
- * Still in RESPOND, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- }
- reqsk_put(req);
-}
-EXPORT_SYMBOL(dccp_req_err);
-
-/*
- * This routine is called by the ICMP module when it gets some sort of error
- * condition. If err < 0 then the socket should be closed and the error
- * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
- * After adjustment header points to the first 8 bytes of the tcp header. We
- * need to find the appropriate port.
- *
- * The locking strategy used here is very "optimistic". When someone else
- * accesses the socket the ICMP is just dropped and for some paths there is no
- * check at all. A more general error queue to queue errors for later handling
- * is probably better.
- */
-static int dccp_v4_err(struct sk_buff *skb, u32 info)
-{
- const struct iphdr *iph = (struct iphdr *)skb->data;
- const u8 offset = iph->ihl << 2;
- const struct dccp_hdr *dh;
- struct dccp_sock *dp;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct sock *sk;
- __u64 seq;
- int err;
- struct net *net = dev_net(skb->dev);
-
- if (!pskb_may_pull(skb, offset + sizeof(*dh)))
- return -EINVAL;
- dh = (struct dccp_hdr *)(skb->data + offset);
- if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh)))
- return -EINVAL;
- iph = (struct iphdr *)skb->data;
- dh = (struct dccp_hdr *)(skb->data + offset);
-
- sk = __inet_lookup_established(net, &dccp_hashinfo,
- iph->daddr, dh->dccph_dport,
- iph->saddr, ntohs(dh->dccph_sport),
- inet_iif(skb), 0);
- if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return -ENOENT;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return 0;
- }
- seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- dccp_req_err(sk, seq);
- return 0;
- }
-
- bh_lock_sock(sk);
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- if (sock_owned_by_user(sk))
- __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- dp = dccp_sk(sk);
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
- !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- switch (type) {
- case ICMP_REDIRECT:
- if (!sock_owned_by_user(sk))
- dccp_do_redirect(skb, sk);
- goto out;
- case ICMP_SOURCE_QUENCH:
- /* Just silently ignore these. */
- goto out;
- case ICMP_PARAMETERPROB:
- err = EPROTO;
- break;
- case ICMP_DEST_UNREACH:
- if (code > NR_ICMP_UNREACH)
- goto out;
-
- if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- if (!sock_owned_by_user(sk))
- dccp_do_pmtu_discovery(sk, iph, info);
- goto out;
- }
-
- err = icmp_err_convert[code].errno;
- break;
- case ICMP_TIME_EXCEEDED:
- err = EHOSTUNREACH;
- break;
- default:
- goto out;
- }
-
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- case DCCP_RESPOND:
- if (!sock_owned_by_user(sk)) {
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
-
- sk_error_report(sk);
-
- dccp_done(sk);
- } else {
- WRITE_ONCE(sk->sk_err_soft, err);
- }
- goto out;
- }
-
- /* If we've already connected we will keep trying
- * until we time out, or the user gives up.
- *
- * rfc1122 4.2.3.9 allows to consider as hard errors
- * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
- * but it is obsoleted by pmtu discovery).
- *
- * Note, that in modern internet, where routing is unreliable
- * and in each dark corner broken firewalls sit, sending random
- * errors ordered by their masters even this two messages finally lose
- * their original sense (even Linux sends invalid PORT_UNREACHs)
- *
- * Now we are in compliance with RFCs.
- * --ANK (980905)
- */
-
- if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) {
- sk->sk_err = err;
- sk_error_report(sk);
- } else { /* Only an error on timeout */
- WRITE_ONCE(sk->sk_err_soft, err);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
- return 0;
-}
-
-static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
- __be32 src, __be32 dst)
-{
- return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
-{
- const struct inet_sock *inet = inet_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb,
- inet->inet_saddr,
- inet->inet_daddr);
-}
-EXPORT_SYMBOL_GPL(dccp_v4_send_check);
-
-static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
-{
- return secure_dccp_sequence_number(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport);
-}
-
-/*
- * The three way handshake has completed - we got a valid ACK or DATAACK -
- * now create the new socket.
- *
- * This is the equivalent of TCP's tcp_v4_syn_recv_sock
- */
-struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req)
-{
- struct inet_request_sock *ireq;
- struct inet_sock *newinet;
- struct sock *newsk;
-
- if (sk_acceptq_is_full(sk))
- goto exit_overflow;
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto exit_nonewsk;
-
- newinet = inet_sk(newsk);
- ireq = inet_rsk(req);
- RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
- newinet->mc_index = inet_iif(skb);
- newinet->mc_ttl = ip_hdr(skb)->ttl;
- atomic_set(&newinet->inet_id, get_random_u16());
-
- if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
- goto put_and_exit;
-
- sk_setup_caps(newsk, dst);
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- if (__inet_inherit_port(sk, newsk) < 0)
- goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
- if (*own_req)
- ireq->ireq_opt = NULL;
- else
- newinet->inet_opt = NULL;
- return newsk;
-
-exit_overflow:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-exit_nonewsk:
- dst_release(dst);
-exit:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
- return NULL;
-put_and_exit:
- newinet->inet_opt = NULL;
- inet_csk_prepare_forced_close(newsk);
- dccp_done(newsk);
- goto exit;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
-
-static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
- struct sk_buff *skb)
-{
- struct rtable *rt;
- const struct iphdr *iph = ip_hdr(skb);
- struct flowi4 fl4 = {
- .flowi4_oif = inet_iif(skb),
- .daddr = iph->saddr,
- .saddr = iph->daddr,
- .flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))),
- .flowi4_scope = ip_sock_rt_scope(sk),
- .flowi4_proto = sk->sk_protocol,
- .fl4_sport = dccp_hdr(skb)->dccph_dport,
- .fl4_dport = dccp_hdr(skb)->dccph_sport,
- };
-
- security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
- rt = ip_route_output_flow(net, &fl4, sk);
- if (IS_ERR(rt)) {
- IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
-
- return &rt->dst;
-}
-
-static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req)
-{
- int err = -1;
- struct sk_buff *skb;
- struct dst_entry *dst;
- struct flowi4 fl4;
-
- dst = inet_csk_route_req(sk, &fl4, req);
- if (dst == NULL)
- goto out;
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
- ireq->ir_rmt_addr);
- rcu_read_lock();
- err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt),
- READ_ONCE(inet_sk(sk)->tos));
- rcu_read_unlock();
- err = net_xmit_eval(err);
- }
-
-out:
- dst_release(dst);
- return err;
-}
-
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
- enum sk_rst_reason reason)
-{
- int err;
- const struct iphdr *rxiph;
- struct sk_buff *skb;
- struct dst_entry *dst;
- struct net *net = dev_net(skb_dst(rxskb)->dev);
- struct dccp_v4_pernet *pn;
- struct sock *ctl_sk;
-
- /* Never send a reset in response to a reset. */
- if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
- return;
-
- pn = net_generic(net, dccp_v4_pernet_id);
- ctl_sk = pn->v4_ctl_sk;
- dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
- if (dst == NULL)
- return;
-
- skb = dccp_ctl_make_reset(ctl_sk, rxskb);
- if (skb == NULL)
- goto out;
-
- rxiph = ip_hdr(rxskb);
- dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
- rxiph->daddr);
- skb_dst_set(skb, dst_clone(dst));
-
- local_bh_disable();
- bh_lock_sock(ctl_sk);
- err = ip_build_and_send_pkt(skb, ctl_sk,
- rxiph->daddr, rxiph->saddr, NULL,
- inet_sk(ctl_sk)->tos);
- bh_unlock_sock(ctl_sk);
-
- if (net_xmit_eval(err) == 0) {
- __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
- }
- local_bh_enable();
-out:
- dst_release(dst);
-}
-
-static void dccp_v4_reqsk_destructor(struct request_sock *req)
-{
- dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
- kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
-}
-
-void dccp_syn_ack_timeout(const struct request_sock *req)
-{
-}
-EXPORT_SYMBOL(dccp_syn_ack_timeout);
-
-static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
- .family = PF_INET,
- .obj_size = sizeof(struct dccp_request_sock),
- .rtx_syn_ack = dccp_v4_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v4_reqsk_destructor,
- .send_reset = dccp_v4_ctl_send_reset,
- .syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct inet_request_sock *ireq;
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
- /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
- if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
- return 0; /* discard, don't send a reset here */
-
- if (dccp_bad_service_code(sk, service)) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- if (sk_acceptq_is_full(sk))
- goto drop;
-
- req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
- if (req == NULL)
- goto drop;
-
- if (dccp_reqsk_init(req, dccp_sk(sk), skb))
- goto drop_and_free;
-
- dreq = dccp_rsk(req);
- if (dccp_parse_options(sk, dreq, skb))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
- sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->ir_mark = inet_request_mark(sk, skb);
- ireq->ireq_family = AF_INET;
- ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
- */
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_gsr = dreq->dreq_isr;
- dreq->dreq_iss = dccp_v4_init_sequence(skb);
- dreq->dreq_gss = dreq->dreq_iss;
- dreq->dreq_service = service;
-
- if (dccp_v4_send_response(sk, req))
- goto drop_and_free;
-
- if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT)))
- reqsk_free(req);
- else
- reqsk_put(req);
-
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- return -1;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dh, skb->len))
- goto reset;
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
-
- if (dccp_rcv_state_process(sk, skb, dh, skb->len))
- goto reset;
- return 0;
-
-reset:
- dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
- kfree_skb(skb);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
-
-/**
- * dccp_invalid_packet - check for malformed packets
- * @skb: Packet to validate
- *
- * Implements RFC 4340, 8.5: Step 1: Check header basics
- * Packets that fail these checks are ignored and do not receive Resets.
- */
-int dccp_invalid_packet(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- unsigned int cscov;
- u8 dccph_doff;
-
- if (skb->pkt_type != PACKET_HOST)
- return 1;
-
- /* If the packet is shorter than 12 bytes, drop packet and return */
- if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
- DCCP_WARN("pskb_may_pull failed\n");
- return 1;
- }
-
- dh = dccp_hdr(skb);
-
- /* If P.type is not understood, drop packet and return */
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- DCCP_WARN("invalid packet type\n");
- return 1;
- }
-
- /*
- * If P.Data Offset is too small for packet type, drop packet and return
- */
- dccph_doff = dh->dccph_doff;
- if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
- return 1;
- }
- /*
- * If P.Data Offset is too large for packet, drop packet and return
- */
- if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
- DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
- return 1;
- }
- dh = dccp_hdr(skb);
- /*
- * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
- * has short sequence numbers), drop packet and return
- */
- if ((dh->dccph_type < DCCP_PKT_DATA ||
- dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
- DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
- dccp_packet_name(dh->dccph_type));
- return 1;
- }
-
- /*
- * If P.CsCov is too large for the packet size, drop packet and return.
- * This must come _before_ checksumming (not as RFC 4340 suggests).
- */
- cscov = dccp_csum_coverage(skb);
- if (cscov > skb->len) {
- DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
- dh->dccph_cscov, skb->len);
- return 1;
- }
-
- /* If header checksum is incorrect, drop packet and return.
- * (This step is completed in the AF-dependent functions.) */
- skb->csum = skb_checksum(skb, 0, cscov, 0);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_invalid_packet);
-
-/* this is called when real data arrives */
-static int dccp_v4_rcv(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- const struct iphdr *iph;
- bool refcounted;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- iph = ip_hdr(skb);
- /* Step 1: If header checksum is incorrect, drop packet and return */
- if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
- dccp_packet_name(dh->dccph_type),
- &iph->saddr, ntohs(dh->dccph_sport),
- &iph->daddr, ntohs(dh->dccph_dport),
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
-
- if (dccp_packet_without_ack(skb)) {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- dccp_pr_debug_cat("\n");
- } else {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
- dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- }
-
-lookup:
- sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
- dh->dccph_sport, dh->dccph_dport, 0, &refcounted);
- if (!sk) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk;
-
- sk = req->rsk_listener;
- if (unlikely(sk->sk_state != DCCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
- }
- sock_hold(sk);
- refcounted = true;
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk) {
- reqsk_put(req);
- goto discard_and_relse;
- }
- if (nsk == sk) {
- reqsk_put(req);
- } else if (dccp_child_process(sk, nsk, skb)) {
- dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
- goto discard_and_relse;
- } else {
- sock_put(sk);
- return 0;
- }
- }
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: "Such packets SHOULD be reported using Data Dropped
- * options (Section 11.7) with Drop Code 0, Protocol
- * Constraints." */
- goto discard_and_relse;
- }
-
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
- nf_reset_ct(skb);
-
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted);
-
-no_dccp_socket:
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- if (refcounted)
- sock_put(sk);
- goto discard_it;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v4_conn_request,
- .syn_recv_sock = dccp_v4_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ip_setsockopt,
- .getsockopt = ip_getsockopt,
-};
-
-static int dccp_v4_init_sock(struct sock *sk)
-{
- static __u8 dccp_v4_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v4_ctl_sock_initialized))
- dccp_v4_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
- }
-
- return err;
-}
-
-static struct timewait_sock_ops dccp_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct inet_timewait_sock),
-};
-
-static struct proto dccp_v4_prot = {
- .name = "DCCP",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v4_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v4_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v4_do_rcv,
- .hash = inet_hash,
- .unhash = inet_unhash,
- .accept = inet_csk_accept,
- .get_port = inet_csk_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp_sock),
- .slab_flags = SLAB_TYPESAFE_BY_RCU,
- .rsk_prot = &dccp_request_sock_ops,
- .twsk_prot = &dccp_timewait_sock_ops,
- .h.hashinfo = &dccp_hashinfo,
-};
-
-static const struct net_protocol dccp_v4_protocol = {
- .handler = dccp_v4_rcv,
- .err_handler = dccp_v4_err,
- .no_policy = 1,
- .icmp_strict_tag_validation = 1,
-};
-
-static const struct proto_ops inet_dccp_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
- .poll = dccp_poll,
- .ioctl = inet_ioctl,
- .gettstamp = sock_gettstamp,
- /* FIXME: work on inet_listen to rename it to sock_common_listen */
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
-};
-
-static struct inet_protosw dccp_v4_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v4_prot,
- .ops = &inet_dccp_ops,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __net_init dccp_v4_init_net(struct net *net)
-{
- struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
-
- if (dccp_hashinfo.bhash == NULL)
- return -ESOCKTNOSUPPORT;
-
- return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET,
- SOCK_DCCP, IPPROTO_DCCP, net);
-}
-
-static void __net_exit dccp_v4_exit_net(struct net *net)
-{
- struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
-
- inet_ctl_sock_destroy(pn->v4_ctl_sk);
-}
-
-static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&dccp_hashinfo);
-}
-
-static struct pernet_operations dccp_v4_ops = {
- .init = dccp_v4_init_net,
- .exit = dccp_v4_exit_net,
- .exit_batch = dccp_v4_exit_batch,
- .id = &dccp_v4_pernet_id,
- .size = sizeof(struct dccp_v4_pernet),
-};
-
-static int __init dccp_v4_init(void)
-{
- int err = proto_register(&dccp_v4_prot, 1);
-
- if (err)
- goto out;
-
- inet_register_protosw(&dccp_v4_protosw);
-
- err = register_pernet_subsys(&dccp_v4_ops);
- if (err)
- goto out_destroy_ctl_sock;
-
- err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- if (err)
- goto out_proto_unregister;
-
-out:
- return err;
-out_proto_unregister:
- unregister_pernet_subsys(&dccp_v4_ops);
-out_destroy_ctl_sock:
- inet_unregister_protosw(&dccp_v4_protosw);
- proto_unregister(&dccp_v4_prot);
- goto out;
-}
-
-static void __exit dccp_v4_exit(void)
-{
- inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- unregister_pernet_subsys(&dccp_v4_ops);
- inet_unregister_protosw(&dccp_v4_protosw);
- proto_unregister(&dccp_v4_prot);
-}
-
-module_init(dccp_v4_init);
-module_exit(dccp_v4_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
deleted file mode 100644
index e24dbffabfc1..000000000000
--- a/net/dccp/ipv6.c
+++ /dev/null
@@ -1,1174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * DCCP over IPv6
- * Linux INET6 implementation
- *
- * Based on net/dccp6/ipv6.c
- *
- * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/xfrm.h>
-#include <linux/string.h>
-
-#include <net/addrconf.h>
-#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/inet6_connection_sock.h>
-#include <net/inet6_hashtables.h>
-#include <net/ip6_route.h>
-#include <net/ipv6.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/ip6_checksum.h>
-#include <net/xfrm.h>
-#include <net/secure_seq.h>
-#include <net/netns/generic.h>
-#include <net/sock.h>
-#include <net/rstreason.h>
-
-#include "dccp.h"
-#include "ipv6.h"
-#include "feat.h"
-
-struct dccp_v6_pernet {
- struct sock *v6_ctl_sk;
-};
-
-static unsigned int dccp_v6_pernet_id __read_mostly;
-
-/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
-
-/* add pseudo-header to DCCP checksum stored in skb->csum */
-static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
-}
-
-static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
-{
- return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
- ipv6_hdr(skb)->saddr.s6_addr32,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport );
-
-}
-
-static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info)
-{
- const struct ipv6hdr *hdr;
- const struct dccp_hdr *dh;
- struct dccp_sock *dp;
- struct ipv6_pinfo *np;
- struct sock *sk;
- int err;
- __u64 seq;
- struct net *net = dev_net(skb->dev);
-
- if (!pskb_may_pull(skb, offset + sizeof(*dh)))
- return -EINVAL;
- dh = (struct dccp_hdr *)(skb->data + offset);
- if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh)))
- return -EINVAL;
- hdr = (const struct ipv6hdr *)skb->data;
- dh = (struct dccp_hdr *)(skb->data + offset);
-
- sk = __inet6_lookup_established(net, &dccp_hashinfo,
- &hdr->daddr, dh->dccph_dport,
- &hdr->saddr, ntohs(dh->dccph_sport),
- inet6_iif(skb), 0);
-
- if (!sk) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return -ENOENT;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return 0;
- }
- seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- dccp_req_err(sk, seq);
- return 0;
- }
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- dp = dccp_sk(sk);
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
- !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- np = inet6_sk(sk);
-
- if (type == NDISC_REDIRECT) {
- if (!sock_owned_by_user(sk)) {
- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst)
- dst->ops->redirect(dst, sk, skb);
- }
- goto out;
- }
-
- if (type == ICMPV6_PKT_TOOBIG) {
- struct dst_entry *dst = NULL;
-
- if (!ip6_sk_accept_pmtu(sk))
- goto out;
-
- if (sock_owned_by_user(sk))
- goto out;
- if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
- goto out;
-
- dst = inet6_csk_update_pmtu(sk, ntohl(info));
- if (!dst)
- goto out;
-
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
- dccp_sync_mss(sk, dst_mtu(dst));
- goto out;
- }
-
- icmpv6_err_convert(type, code, &err);
-
- /* Might be for an request_sock */
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- case DCCP_RESPOND: /* Cannot happen.
- It can, it SYNs are crossed. --ANK */
- if (!sock_owned_by_user(sk)) {
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
- /*
- * Wake people up to see the error
- * (see connect in sock.c)
- */
- sk_error_report(sk);
- dccp_done(sk);
- } else {
- WRITE_ONCE(sk->sk_err_soft, err);
- }
- goto out;
- }
-
- if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
- sk->sk_err = err;
- sk_error_report(sk);
- } else {
- WRITE_ONCE(sk->sk_err_soft, err);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
- return 0;
-}
-
-
-static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *skb;
- struct in6_addr *final_p, final;
- struct flowi6 fl6;
- int err = -1;
- struct dst_entry *dst;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.daddr = ireq->ir_v6_rmt_addr;
- fl6.saddr = ireq->ir_v6_loc_addr;
- fl6.flowlabel = 0;
- fl6.flowi6_oif = ireq->ir_iif;
- fl6.fl6_dport = ireq->ir_rmt_port;
- fl6.fl6_sport = htons(ireq->ir_num);
- security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
-
-
- rcu_read_lock();
- final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
- rcu_read_unlock();
-
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- dst = NULL;
- goto done;
- }
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- struct dccp_hdr *dh = dccp_hdr(skb);
- struct ipv6_txoptions *opt;
-
- dh->dccph_checksum = dccp_v6_csum_finish(skb,
- &ireq->ir_v6_loc_addr,
- &ireq->ir_v6_rmt_addr);
- fl6.daddr = ireq->ir_v6_rmt_addr;
- rcu_read_lock();
- opt = ireq->ipv6_opt;
- if (!opt)
- opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt,
- np->tclass, READ_ONCE(sk->sk_priority));
- rcu_read_unlock();
- err = net_xmit_eval(err);
- }
-
-done:
- dst_release(dst);
- return err;
-}
-
-static void dccp_v6_reqsk_destructor(struct request_sock *req)
-{
- dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
- kfree(inet_rsk(req)->ipv6_opt);
- kfree_skb(inet_rsk(req)->pktopts);
-}
-
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
- enum sk_rst_reason reason)
-{
- const struct ipv6hdr *rxip6h;
- struct sk_buff *skb;
- struct flowi6 fl6;
- struct net *net = dev_net(skb_dst(rxskb)->dev);
- struct dccp_v6_pernet *pn;
- struct sock *ctl_sk;
- struct dst_entry *dst;
-
- if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (!ipv6_unicast_destination(rxskb))
- return;
-
- pn = net_generic(net, dccp_v6_pernet_id);
- ctl_sk = pn->v6_ctl_sk;
- skb = dccp_ctl_make_reset(ctl_sk, rxskb);
- if (skb == NULL)
- return;
-
- rxip6h = ipv6_hdr(rxskb);
- dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
- &rxip6h->daddr);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.daddr = rxip6h->saddr;
- fl6.saddr = rxip6h->daddr;
-
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.flowi6_oif = inet6_iif(rxskb);
- fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
- fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6));
-
- /* sk = NULL, but it is safe for now. RST socket required. */
- dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
- if (!IS_ERR(dst)) {
- skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0);
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
- return;
- }
-
- kfree_skb(skb);
-}
-
-static struct request_sock_ops dccp6_request_sock_ops = {
- .family = AF_INET6,
- .obj_size = sizeof(struct dccp6_request_sock),
- .rtx_syn_ack = dccp_v6_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v6_reqsk_destructor,
- .send_reset = dccp_v6_ctl_send_reset,
- .syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- struct inet_request_sock *ireq;
- struct ipv6_pinfo *np = inet6_sk(sk);
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_conn_request(sk, skb);
-
- if (!ipv6_unicast_destination(skb))
- return 0; /* discard, don't send a reset here */
-
- if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
- __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
- return 0;
- }
-
- if (dccp_bad_service_code(sk, service)) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * There are no SYN attacks on IPv6, yet...
- */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- if (sk_acceptq_is_full(sk))
- goto drop;
-
- req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
- if (req == NULL)
- goto drop;
-
- if (dccp_reqsk_init(req, dccp_sk(sk), skb))
- goto drop_and_free;
-
- dreq = dccp_rsk(req);
- if (dccp_parse_options(sk, dreq, skb))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
- ireq->ir_rmt_addr = LOOPBACK4_IPV6;
- ireq->ir_loc_addr = LOOPBACK4_IPV6;
-
- ireq->ireq_family = AF_INET6;
- ireq->ir_mark = inet_request_mark(sk, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
- refcount_inc(&skb->users);
- ireq->pktopts = skb;
- }
- ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if);
-
- /* So that link locals have meaning */
- if (!ireq->ir_iif &&
- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq->ir_iif = inet6_iif(skb);
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
- */
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_gsr = dreq->dreq_isr;
- dreq->dreq_iss = dccp_v6_init_sequence(skb);
- dreq->dreq_gss = dreq->dreq_iss;
- dreq->dreq_service = service;
-
- if (dccp_v6_send_response(sk, req))
- goto drop_and_free;
-
- if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT)))
- reqsk_free(req);
- else
- reqsk_put(req);
-
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- return -1;
-}
-
-static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *newnp;
- const struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_txoptions *opt;
- struct inet_sock *newinet;
- struct dccp6_sock *newdp6;
- struct sock *newsk;
-
- if (skb->protocol == htons(ETH_P_IP)) {
- /*
- * v6 mapped
- */
- newsk = dccp_v4_request_recv_sock(sk, skb, req, dst,
- req_unhash, own_req);
- if (newsk == NULL)
- return NULL;
-
- newdp6 = (struct dccp6_sock *)newsk;
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- newnp->saddr = newsk->sk_v6_rcv_saddr;
-
- inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
- newsk->sk_backlog_rcv = dccp_v4_do_rcv;
- newnp->pktoptions = NULL;
- newnp->opt = NULL;
- newnp->ipv6_mc_list = NULL;
- newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
- newnp->mcast_oif = inet_iif(skb);
- newnp->mcast_hops = ip_hdr(skb)->ttl;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks count
- * here, dccp_create_openreq_child now does this for us, see the comment in
- * that function for the gory details. -acme
- */
-
- /* It is tricky place. Until this moment IPv4 tcp
- worked with IPv6 icsk.icsk_af_ops.
- Sync it now.
- */
- dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
-
- return newsk;
- }
-
-
- if (sk_acceptq_is_full(sk))
- goto out_overflow;
-
- if (!dst) {
- struct flowi6 fl6;
-
- dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP);
- if (!dst)
- goto out;
- }
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto out_nonewsk;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks
- * count here, dccp_create_openreq_child now does this for us, see the
- * comment in that function for the gory details. -acme
- */
-
- ip6_dst_store(newsk, dst, NULL, NULL);
- newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
- NETIF_F_TSO);
- newdp6 = (struct dccp6_sock *)newsk;
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- newnp->saddr = ireq->ir_v6_loc_addr;
-
- /* Now IPv6 options...
-
- First: no IPv4 options.
- */
- newinet->inet_opt = NULL;
-
- /* Clone RX bits */
- newnp->rxopt.all = np->rxopt.all;
-
- newnp->ipv6_mc_list = NULL;
- newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
- newnp->pktoptions = NULL;
- newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
-
- /*
- * Clone native IPv6 options from listening socket (if any)
- *
- * Yes, keeping reference count would be much more clever, but we make
- * one more one thing there: reattach optmem to newsk.
- */
- opt = ireq->ipv6_opt;
- if (!opt)
- opt = rcu_dereference(np->opt);
- if (opt) {
- opt = ipv6_dup_options(newsk, opt);
- RCU_INIT_POINTER(newnp->opt, opt);
- }
- inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (opt)
- inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
- opt->opt_flen;
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- if (__inet_inherit_port(sk, newsk) < 0) {
- inet_csk_prepare_forced_close(newsk);
- dccp_done(newsk);
- goto out;
- }
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
- /* Clone pktoptions received with SYN, if we own the req */
- if (*own_req && ireq->pktopts) {
- newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
- consume_skb(ireq->pktopts);
- ireq->pktopts = NULL;
- }
-
- return newsk;
-
-out_overflow:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out_nonewsk:
- dst_release(dst);
-out:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
- return NULL;
-}
-
-/* The socket must have it's spinlock held when we get
- * here.
- *
- * We have a potential double-lock case here, so even when
- * doing backlog processing we use the BH locking scheme.
- * This is because we cannot sleep with the original spinlock
- * held.
- */
-static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *opt_skb = NULL;
-
- /* Imagine: socket is IPv6. IPv4 packet arrives,
- goes to IPv4 receive handler and backlogged.
- From backlog it always goes here. Kerboom...
- Fortunately, dccp_rcv_established and rcv_established
- handle them correctly, but it is not case with
- dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
- */
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_do_rcv(sk, skb);
-
- if (sk_filter(sk, skb))
- goto discard;
-
- /*
- * socket locking is here for SMP purposes as backlog rcv is currently
- * called with bh processing disabled.
- */
-
- /* Do Stevens' IPV6_PKTOPTIONS.
-
- Yes, guys, it is the only place in our code, where we
- may make it not affecting IPv4.
- The rest of code is protocol independent,
- and I do not like idea to uglify IPv4.
-
- Actually, all the idea behind IPV6_PKTOPTIONS
- looks not very well thought. For now we latch
- options, received in the last packet, enqueued
- by tcp. Feel free to propose better solution.
- --ANK (980728)
- */
- if (np->rxopt.all && sk->sk_state != DCCP_LISTEN)
- opt_skb = skb_clone_and_charge_r(skb, sk);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb)
- goto ipv6_pktoptions;
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
-
- if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb)
- goto ipv6_pktoptions;
- return 0;
-
-reset:
- dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-discard:
- if (opt_skb != NULL)
- __kfree_skb(opt_skb);
- kfree_skb(skb);
- return 0;
-
-/* Handling IPV6_PKTOPTIONS skb the similar
- * way it's done for net/ipv6/tcp_ipv6.c
- */
-ipv6_pktoptions:
- if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) {
- if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
- WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb));
- if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
- if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
- np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
- if (inet6_test_bit(REPFLOW, sk))
- np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
- if (ipv6_opt_accepted(sk, opt_skb,
- &DCCP_SKB_CB(opt_skb)->header.h6)) {
- memmove(IP6CB(opt_skb),
- &DCCP_SKB_CB(opt_skb)->header.h6,
- sizeof(struct inet6_skb_parm));
- opt_skb = xchg(&np->pktoptions, opt_skb);
- } else {
- __kfree_skb(opt_skb);
- opt_skb = xchg(&np->pktoptions, NULL);
- }
- }
-
- kfree_skb(opt_skb);
- return 0;
-}
-
-static int dccp_v6_rcv(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- bool refcounted;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- /* Step 1: If header checksum is incorrect, drop packet and return. */
- if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- if (dccp_packet_without_ack(skb))
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- else
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
-
-lookup:
- sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
- dh->dccph_sport, dh->dccph_dport,
- inet6_iif(skb), 0, &refcounted);
- if (!sk) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk;
-
- sk = req->rsk_listener;
- if (unlikely(sk->sk_state != DCCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
- }
- sock_hold(sk);
- refcounted = true;
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk) {
- reqsk_put(req);
- goto discard_and_relse;
- }
- if (nsk == sk) {
- reqsk_put(req);
- } else if (dccp_child_process(sk, nsk, skb)) {
- dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
- goto discard_and_relse;
- } else {
- sock_put(sk);
- return 0;
- }
- }
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
- goto discard_and_relse;
- }
-
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
- nf_reset_ct(skb);
-
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4,
- refcounted) ? -1 : 0;
-
-no_dccp_socket:
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- if (refcounted)
- sock_put(sk);
- goto discard_it;
-}
-
-static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
-{
- struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p, final;
- struct ipv6_txoptions *opt;
- struct flowi6 fl6;
- struct dst_entry *dst;
- int addr_type;
- int err;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- if (usin->sin6_family != AF_INET6)
- return -EAFNOSUPPORT;
-
- memset(&fl6, 0, sizeof(fl6));
-
- if (inet6_test_bit(SNDFLOW, sk)) {
- fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl6.flowlabel);
- if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
- struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (IS_ERR(flowlabel))
- return -EINVAL;
- fl6_sock_release(flowlabel);
- }
- }
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 1;
-
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if (addr_type & IPV6_ADDR_MULTICAST)
- return -ENETUNREACH;
-
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- usin->sin6_scope_id) {
- /* If interface is set while binding, indices
- * must coincide.
- */
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id)
- return -EINVAL;
-
- sk->sk_bound_dev_if = usin->sin6_scope_id;
- }
-
- /* Connect to link-local address requires an interface */
- if (!sk->sk_bound_dev_if)
- return -EINVAL;
- }
-
- sk->sk_v6_daddr = usin->sin6_addr;
- np->flow_label = fl6.flowlabel;
-
- /*
- * DCCP over IPv4
- */
- if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = icsk->icsk_ext_hdr_len;
- struct sockaddr_in sin;
-
- net_dbg_ratelimited("connect: ipv4 mapped\n");
-
- if (ipv6_only_sock(sk))
- return -ENETUNREACH;
-
- sin.sin_family = AF_INET;
- sin.sin_port = usin->sin6_port;
- sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
-
- icsk->icsk_af_ops = &dccp_ipv6_mapped;
- sk->sk_backlog_rcv = dccp_v4_do_rcv;
-
- err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
- if (err) {
- icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &dccp_ipv6_af_ops;
- sk->sk_backlog_rcv = dccp_v6_do_rcv;
- goto failure;
- }
- np->saddr = sk->sk_v6_rcv_saddr;
- return err;
- }
-
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
- saddr = &sk->sk_v6_rcv_saddr;
-
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.daddr = sk->sk_v6_daddr;
- fl6.saddr = saddr ? *saddr : np->saddr;
- fl6.flowi6_oif = sk->sk_bound_dev_if;
- fl6.fl6_dport = usin->sin6_port;
- fl6.fl6_sport = inet->inet_sport;
- security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
-
- opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
- final_p = fl6_update_dst(&fl6, opt, &final);
-
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- goto failure;
- }
-
- if (saddr == NULL) {
- saddr = &fl6.saddr;
-
- err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
- if (err)
- goto failure;
- }
-
- /* set the source address */
- np->saddr = *saddr;
- inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-
- ip6_dst_store(sk, dst, NULL, NULL);
-
- icsk->icsk_ext_hdr_len = 0;
- if (opt)
- icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
-
- inet->inet_dport = usin->sin6_port;
-
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet6_hash_connect(&dccp_death_row, sk);
- if (err)
- goto late_failure;
-
- dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport);
- err = dccp_connect(sk);
- if (err)
- goto late_failure;
-
- return 0;
-
-late_failure:
- dccp_set_state(sk, DCCP_CLOSED);
- inet_bhash2_reset_saddr(sk);
- __sk_dst_reset(sk);
-failure:
- inet->inet_dport = 0;
- sk->sk_route_caps = 0;
- return err;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
- .queue_xmit = inet6_csk_xmit,
- .send_check = dccp_v6_send_check,
- .rebuild_header = inet6_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct ipv6hdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
-};
-
-/*
- * DCCP over IPv4 via INET6 API
- */
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
-};
-
-static void dccp_v6_sk_destruct(struct sock *sk)
-{
- dccp_destruct_common(sk);
- inet6_sock_destruct(sk);
-}
-
-/* NOTE: A lot of things set to zero explicitly by call to
- * sk_alloc() so need not be done here.
- */
-static int dccp_v6_init_sock(struct sock *sk)
-{
- static __u8 dccp_v6_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v6_ctl_sock_initialized))
- dccp_v6_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
- sk->sk_destruct = dccp_v6_sk_destruct;
- }
-
- return err;
-}
-
-static struct timewait_sock_ops dccp6_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
-};
-
-static struct proto dccp_v6_prot = {
- .name = "DCCPv6",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v6_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v6_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v6_do_rcv,
- .hash = inet6_hash,
- .unhash = inet_unhash,
- .accept = inet_csk_accept,
- .get_port = inet_csk_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp6_sock),
- .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6),
- .slab_flags = SLAB_TYPESAFE_BY_RCU,
- .rsk_prot = &dccp6_request_sock_ops,
- .twsk_prot = &dccp6_timewait_sock_ops,
- .h.hashinfo = &dccp_hashinfo,
-};
-
-static const struct inet6_protocol dccp_v6_protocol = {
- .handler = dccp_v6_rcv,
- .err_handler = dccp_v6_err,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
-};
-
-static const struct proto_ops inet6_dccp_ops = {
- .family = PF_INET6,
- .owner = THIS_MODULE,
- .release = inet6_release,
- .bind = inet6_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet6_getname,
- .poll = dccp_poll,
- .ioctl = inet6_ioctl,
- .gettstamp = sock_gettstamp,
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = inet6_compat_ioctl,
-#endif
-};
-
-static struct inet_protosw dccp_v6_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v6_prot,
- .ops = &inet6_dccp_ops,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __net_init dccp_v6_init_net(struct net *net)
-{
- struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id);
-
- if (dccp_hashinfo.bhash == NULL)
- return -ESOCKTNOSUPPORT;
-
- return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6,
- SOCK_DCCP, IPPROTO_DCCP, net);
-}
-
-static void __net_exit dccp_v6_exit_net(struct net *net)
-{
- struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id);
-
- inet_ctl_sock_destroy(pn->v6_ctl_sk);
-}
-
-static struct pernet_operations dccp_v6_ops = {
- .init = dccp_v6_init_net,
- .exit = dccp_v6_exit_net,
- .id = &dccp_v6_pernet_id,
- .size = sizeof(struct dccp_v6_pernet),
-};
-
-static int __init dccp_v6_init(void)
-{
- int err = proto_register(&dccp_v6_prot, 1);
-
- if (err)
- goto out;
-
- inet6_register_protosw(&dccp_v6_protosw);
-
- err = register_pernet_subsys(&dccp_v6_ops);
- if (err)
- goto out_destroy_ctl_sock;
-
- err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- if (err)
- goto out_unregister_proto;
-
-out:
- return err;
-out_unregister_proto:
- unregister_pernet_subsys(&dccp_v6_ops);
-out_destroy_ctl_sock:
- inet6_unregister_protosw(&dccp_v6_protosw);
- proto_unregister(&dccp_v6_prot);
- goto out;
-}
-
-static void __exit dccp_v6_exit(void)
-{
- inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- unregister_pernet_subsys(&dccp_v6_ops);
- inet6_unregister_protosw(&dccp_v6_protosw);
- proto_unregister(&dccp_v6_prot);
-}
-
-module_init(dccp_v6_init);
-module_exit(dccp_v6_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
deleted file mode 100644
index c5d14c48def1..000000000000
--- a/net/dccp/ipv6.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _DCCP_IPV6_H
-#define _DCCP_IPV6_H
-/*
- * net/dccp/ipv6.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- */
-
-#include <linux/dccp.h>
-#include <linux/ipv6.h>
-
-struct dccp6_sock {
- struct dccp_sock dccp;
- struct ipv6_pinfo inet6;
-};
-
-struct dccp6_request_sock {
- struct dccp_request_sock dccp;
-};
-
-struct dccp6_timewait_sock {
- struct inet_timewait_sock inet;
-};
-
-#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
deleted file mode 100644
index fecc8190064f..000000000000
--- a/net/dccp/minisocks.c
+++ /dev/null
@@ -1,266 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/minisocks.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/timer.h>
-
-#include <net/sock.h>
-#include <net/xfrm.h>
-#include <net/inet_timewait_sock.h>
-#include <net/rstreason.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct inet_timewait_death_row dccp_death_row = {
- .tw_refcount = REFCOUNT_INIT(1),
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .hashinfo = &dccp_hashinfo,
-};
-
-EXPORT_SYMBOL_GPL(dccp_death_row);
-
-void dccp_time_wait(struct sock *sk, int state, int timeo)
-{
- struct inet_timewait_sock *tw;
-
- tw = inet_twsk_alloc(sk, &dccp_death_row, state);
-
- if (tw != NULL) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == PF_INET6) {
- tw->tw_v6_daddr = sk->sk_v6_daddr;
- tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
- tw->tw_ipv6only = sk->sk_ipv6only;
- }
-#endif
-
- /* Get the TIME_WAIT timeout firing. */
- if (timeo < rto)
- timeo = rto;
-
- if (state == DCCP_TIME_WAIT)
- timeo = DCCP_TIMEWAIT_LEN;
-
- /* Linkage updates.
- * Note that access to tw after this point is illegal.
- */
- inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo);
- } else {
- /* Sorry, if we're out of memory, just CLOSE this
- * socket up. We've got bigger problems than
- * non-graceful socket closings.
- */
- DCCP_WARN("time wait bucket table overflow\n");
- }
-
- dccp_done(sk);
-}
-
-struct sock *dccp_create_openreq_child(const struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb)
-{
- /*
- * Step 3: Process LISTEN state
- *
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- */
- struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
-
- if (newsk != NULL) {
- struct dccp_request_sock *dreq = dccp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct dccp_sock *newdp = dccp_sk(newsk);
-
- newdp->dccps_role = DCCP_ROLE_SERVER;
- newdp->dccps_hc_rx_ackvec = NULL;
- newdp->dccps_service_list = NULL;
- newdp->dccps_hc_rx_ccid = NULL;
- newdp->dccps_hc_tx_ccid = NULL;
- newdp->dccps_service = dreq->dreq_service;
- newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
- newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
- newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
-
- INIT_LIST_HEAD(&newdp->dccps_featneg);
- /*
- * Step 3: Process LISTEN state
- *
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR from packet (or Init Cookies)
- *
- * Setting AWL/AWH and SWL/SWH happens as part of the feature
- * activation below, as these windows all depend on the local
- * and remote Sequence Window feature values (7.5.2).
- */
- newdp->dccps_iss = dreq->dreq_iss;
- newdp->dccps_gss = dreq->dreq_gss;
- newdp->dccps_gar = newdp->dccps_iss;
- newdp->dccps_isr = dreq->dreq_isr;
- newdp->dccps_gsr = dreq->dreq_gsr;
-
- /*
- * Activate features: initialise CCIDs, sequence windows etc.
- */
- if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
- sk_free_unlock_clone(newsk);
- return NULL;
- }
- dccp_init_xmit_timers(newsk);
-
- __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS);
- }
- return newsk;
-}
-
-EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
-
-/*
- * Process an incoming packet for RESPOND sockets represented
- * as an request_sock.
- */
-struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req)
-{
- struct sock *child = NULL;
- struct dccp_request_sock *dreq = dccp_rsk(req);
- bool own_req;
-
- /* TCP/DCCP listeners became lockless.
- * DCCP stores complex state in its request_sock, so we need
- * a protection for them, now this code runs without being protected
- * by the parent (listener) lock.
- */
- spin_lock_bh(&dreq->dreq_lock);
-
- /* Check for retransmitted REQUEST */
- if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
-
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
- dccp_pr_debug("Retransmitted REQUEST\n");
- dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
- /*
- * Send another RESPONSE packet
- * To protect against Request floods, increment retrans
- * counter (backoff, monitored by dccp_response_timer).
- */
- inet_rtx_syn_ack(sk, req);
- }
- /* Network Duplicate, discard packet */
- goto out;
- }
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
-
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
- dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
- goto drop;
-
- /* Invalid ACK */
- if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dreq->dreq_iss, dreq->dreq_gss)) {
- dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu, dreq_gss=%llu\n",
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dreq->dreq_iss,
- (unsigned long long) dreq->dreq_gss);
- goto drop;
- }
-
- if (dccp_parse_options(sk, dreq, skb))
- goto drop;
-
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
- req, &own_req);
- if (child) {
- child = inet_csk_complete_hashdance(sk, child, req, own_req);
- goto out;
- }
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-drop:
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
- req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-
- inet_csk_reqsk_queue_drop(sk, req);
-out:
- spin_unlock_bh(&dreq->dreq_lock);
- return child;
-}
-
-EXPORT_SYMBOL_GPL(dccp_check_req);
-
-/*
- * Queue segment on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket.
- */
-int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb)
- __releases(child)
-{
- int ret = 0;
- const int state = child->sk_state;
-
- if (!sock_owned_by_user(child)) {
- ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
- skb->len);
-
- /* Wakeup parent, send SIGIO */
- if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent);
- } else {
- /* Alas, it is possible again, because we do lookup
- * in main socket hash table and lock on listening
- * socket does not protect us more.
- */
- __sk_add_backlog(child, skb);
- }
-
- bh_unlock_sock(child);
- sock_put(child);
- return ret;
-}
-
-EXPORT_SYMBOL_GPL(dccp_child_process);
-
-void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *rsk)
-{
- DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
-
-int dccp_reqsk_init(struct request_sock *req,
- struct dccp_sock const *dp, struct sk_buff const *skb)
-{
- struct dccp_request_sock *dreq = dccp_rsk(req);
-
- spin_lock_init(&dreq->dreq_lock);
- inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
- inet_rsk(req)->acked = 0;
- dreq->dreq_timestamp_echo = 0;
-
- /* inherit feature negotiation options from listening socket */
- return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
deleted file mode 100644
index db62d4767024..000000000000
--- a/net/dccp/options.c
+++ /dev/null
@@ -1,609 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/options.c
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- */
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/unaligned.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-u64 dccp_decode_value_var(const u8 *bf, const u8 len)
-{
- u64 value = 0;
-
- if (len >= DCCP_OPTVAL_MAXLEN)
- value += ((u64)*bf++) << 40;
- if (len > 4)
- value += ((u64)*bf++) << 32;
- if (len > 3)
- value += ((u64)*bf++) << 24;
- if (len > 2)
- value += ((u64)*bf++) << 16;
- if (len > 1)
- value += ((u64)*bf++) << 8;
- if (len > 0)
- value += *bf;
-
- return value;
-}
-
-/**
- * dccp_parse_options - Parse DCCP options present in @skb
- * @sk: client|server|listening dccp socket (when @dreq != NULL)
- * @dreq: request socket to use during connection setup, or NULL
- * @skb: frame to parse
- */
-int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr = options;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- struct dccp_options_received *opt_recv = &dp->dccps_options_received;
- unsigned char opt, len;
- unsigned char *value;
- u32 elapsed_time;
- __be32 opt_val;
- int rc;
- int mandatory = 0;
-
- memset(opt_recv, 0, sizeof(*opt_recv));
-
- opt = len = 0;
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_nonsensical_length;
-
- len = *opt_ptr++;
- if (len < 2)
- goto out_nonsensical_length;
- /*
- * Remove the type and len fields, leaving
- * just the value size
- */
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
-
- if (opt_ptr > opt_end)
- goto out_nonsensical_length;
- }
-
- /*
- * CCID-specific options are ignored during connection setup, as
- * negotiation may still be in progress (see RFC 4340, 10.3).
- * The same applies to Ack Vectors, as these depend on the CCID.
- */
- if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
- opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
- goto ignore_option;
-
- switch (opt) {
- case DCCPO_PADDING:
- break;
- case DCCPO_MANDATORY:
- if (mandatory)
- goto out_invalid_option;
- if (pkt_type != DCCP_PKT_DATA)
- mandatory = 1;
- break;
- case DCCPO_NDP_COUNT:
- if (len > 6)
- goto out_invalid_option;
-
- opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
- dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
- (unsigned long long)opt_recv->dccpor_ndp);
- break;
- case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
- if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
- break;
- if (len == 0)
- goto out_invalid_option;
- rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
- *value, value + 1, len - 1);
- if (rc)
- goto out_featneg_failed;
- break;
- case DCCPO_TIMESTAMP:
- if (len != 4)
- goto out_invalid_option;
- /*
- * RFC 4340 13.1: "The precise time corresponding to
- * Timestamp Value zero is not specified". We use
- * zero to indicate absence of a meaningful timestamp.
- */
- opt_val = get_unaligned((__be32 *)value);
- if (unlikely(opt_val == 0)) {
- DCCP_WARN("Timestamp with zero value\n");
- break;
- }
-
- if (dreq != NULL) {
- dreq->dreq_timestamp_echo = ntohl(opt_val);
- dreq->dreq_timestamp_time = dccp_timestamp();
- } else {
- opt_recv->dccpor_timestamp =
- dp->dccps_timestamp_echo = ntohl(opt_val);
- dp->dccps_timestamp_time = dccp_timestamp();
- }
- dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
- dccp_role(sk), ntohl(opt_val),
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- /* schedule an Ack in case this sender is quiescent */
- inet_csk_schedule_ack(sk);
- break;
- case DCCPO_TIMESTAMP_ECHO:
- if (len != 4 && len != 6 && len != 8)
- goto out_invalid_option;
-
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
-
- dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
- "ackno=%llu", dccp_role(sk),
- opt_recv->dccpor_timestamp_echo,
- len + 2,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
-
- value += 4;
-
- if (len == 4) { /* no elapsed time included */
- dccp_pr_debug_cat("\n");
- break;
- }
-
- if (len == 6) { /* 2-byte elapsed time */
- __be16 opt_val2 = get_unaligned((__be16 *)value);
- elapsed_time = ntohs(opt_val2);
- } else { /* 4-byte elapsed time */
- opt_val = get_unaligned((__be32 *)value);
- elapsed_time = ntohl(opt_val);
- }
-
- dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
-
- /* Give precedence to the biggest ELAPSED_TIME */
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
- break;
- case DCCPO_ELAPSED_TIME:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
- break;
-
- if (len == 2) {
- __be16 opt_val2 = get_unaligned((__be16 *)value);
- elapsed_time = ntohs(opt_val2);
- } else if (len == 4) {
- opt_val = get_unaligned((__be32 *)value);
- elapsed_time = ntohl(opt_val);
- } else {
- goto out_invalid_option;
- }
-
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
-
- dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
- dccp_role(sk), elapsed_time);
- break;
- case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
- if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- pkt_type, opt, value, len))
- goto out_invalid_option;
- break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
- break;
- /*
- * Ack vectors are processed by the TX CCID if it is
- * interested. The RX CCID need not parse Ack Vectors,
- * since it is only interested in clearing old state.
- */
- fallthrough;
- case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
- if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- pkt_type, opt, value, len))
- goto out_invalid_option;
- break;
- default:
- DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
- "implemented, ignoring", sk, opt, len);
- break;
- }
-ignore_option:
- if (opt != DCCPO_MANDATORY)
- mandatory = 0;
- }
-
- /* mandatory was the last byte in option list -> reset connection */
- if (mandatory)
- goto out_invalid_option;
-
-out_nonsensical_length:
- /* RFC 4340, 5.8: ignore option and all remaining option space */
- return 0;
-
-out_invalid_option:
- DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
- rc = DCCP_RESET_CODE_OPTION_ERROR;
-out_featneg_failed:
- DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
- DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
- DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
- DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
- DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
- return -1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_parse_options);
-
-void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
-{
- if (len >= DCCP_OPTVAL_MAXLEN)
- *to++ = (value & 0xFF0000000000ull) >> 40;
- if (len > 4)
- *to++ = (value & 0xFF00000000ull) >> 32;
- if (len > 3)
- *to++ = (value & 0xFF000000) >> 24;
- if (len > 2)
- *to++ = (value & 0xFF0000) >> 16;
- if (len > 1)
- *to++ = (value & 0xFF00) >> 8;
- if (len > 0)
- *to++ = (value & 0xFF);
-}
-
-static inline u8 dccp_ndp_len(const u64 ndp)
-{
- if (likely(ndp <= 0xFF))
- return 1;
- return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
-}
-
-int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
- const void *value, const unsigned char len)
-{
- unsigned char *to;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
-
- to = skb_push(skb, len + 2);
- *to++ = option;
- *to++ = len + 2;
-
- memcpy(to, value, len);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option);
-
-static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- u64 ndp = dp->dccps_ndp_count;
-
- if (dccp_non_data_packet(skb))
- ++dp->dccps_ndp_count;
- else
- dp->dccps_ndp_count = 0;
-
- if (ndp > 0) {
- unsigned char *ptr;
- const int ndp_len = dccp_ndp_len(ndp);
- const int len = ndp_len + 2;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- ptr = skb_push(skb, len);
- *ptr++ = DCCPO_NDP_COUNT;
- *ptr++ = len;
- dccp_encode_value_var(ndp, ptr, ndp_len);
- }
-
- return 0;
-}
-
-static inline int dccp_elapsed_time_len(const u32 elapsed_time)
-{
- return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
-}
-
-static int dccp_insert_option_timestamp(struct sk_buff *skb)
-{
- __be32 now = htonl(dccp_timestamp());
- /* yes this will overflow but that is the point as we want a
- * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
-
- return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
-}
-
-static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
- struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- __be32 tstamp_echo;
- unsigned char *to;
- u32 elapsed_time, elapsed_time_len, len;
-
- if (dreq != NULL) {
- elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
- tstamp_echo = htonl(dreq->dreq_timestamp_echo);
- dreq->dreq_timestamp_echo = 0;
- } else {
- elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
- tstamp_echo = htonl(dp->dccps_timestamp_echo);
- dp->dccps_timestamp_echo = 0;
- }
-
- elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- len = 6 + elapsed_time_len;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_TIMESTAMP_ECHO;
- *to++ = len;
-
- memcpy(to, &tstamp_echo, 4);
- to += 4;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else if (elapsed_time_len == 4) {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- return 0;
-}
-
-static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const u16 buflen = dccp_ackvec_buflen(av);
- /* Figure out how many options do we need to represent the ackvec */
- const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
- u16 len = buflen + 2 * nr_opts;
- u8 i, nonce = 0;
- const unsigned char *tail, *from;
- unsigned char *to;
-
- if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
- dccp_packet_name(dcb->dccpd_type));
- return -1;
- }
- /*
- * Since Ack Vectors are variable-length, we can not always predict
- * their size. To catch exception cases where the space is running out
- * on the skb, a separate Sync is scheduled to carry the Ack Vector.
- */
- if (len > DCCPAV_MIN_OPTLEN &&
- len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
- DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
- "MPS=%u ==> reduce payload size?\n", len, skb->len,
- dcb->dccpd_opt_len, dp->dccps_mss_cache);
- dp->dccps_sync_scheduled = 1;
- return 0;
- }
- dcb->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- len = buflen;
- from = av->av_buf + av->av_buf_head;
- tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-
- for (i = 0; i < nr_opts; ++i) {
- int copylen = len;
-
- if (len > DCCP_SINGLE_OPT_MAXLEN)
- copylen = DCCP_SINGLE_OPT_MAXLEN;
-
- /*
- * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
- * its type; ack_nonce is the sum of all individual buf_nonce's.
- */
- nonce ^= av->av_buf_nonce[i];
-
- *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
- *to++ = copylen + 2;
-
- /* Check if buf_head wraps */
- if (from + copylen > tail) {
- const u16 tailsize = tail - from;
-
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- copylen -= tailsize;
- from = av->av_buf;
- }
-
- memcpy(to, from, copylen);
- from += copylen;
- to += copylen;
- len -= copylen;
- }
- /*
- * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
- */
- if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
- return -ENOBUFS;
- return 0;
-}
-
-/**
- * dccp_insert_option_mandatory - Mandatory option (5.8.2)
- * @skb: frame into which to insert option
- *
- * Note that since we are using skb_push, this function needs to be called
- * _after_ inserting the option it is supposed to influence (stack order).
- */
-int dccp_insert_option_mandatory(struct sk_buff *skb)
-{
- if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len++;
- *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY;
- return 0;
-}
-
-/**
- * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
- * @skb: frame to insert feature negotiation option into
- * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
- * @feat: one out of %dccp_feature_numbers
- * @val: NN value or SP array (preferred element first) to copy
- * @len: true length of @val in bytes (excluding first element repetition)
- * @repeat_first: whether to copy the first element of @val twice
- *
- * The last argument is used to construct Confirm options, where the preferred
- * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
- * lists are kept such that the preferred entry is always first, so we only need
- * to copy twice, and avoid the overhead of cloning into a bigger array.
- */
-int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
- u8 *val, u8 len, bool repeat_first)
-{
- u8 tot_len, *to;
-
- /* take the `Feature' field and possible repetition into account */
- if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
- DCCP_WARN("length %u for feature %u too large\n", len, feat);
- return -1;
- }
-
- if (unlikely(val == NULL || len == 0))
- len = repeat_first = false;
- tot_len = 3 + repeat_first + len;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("packet too small for feature %d option!\n", feat);
- return -1;
- }
- DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
-
- to = skb_push(skb, tot_len);
- *to++ = type;
- *to++ = tot_len;
- *to++ = feat;
-
- if (repeat_first)
- *to++ = *val;
- if (len)
- memcpy(to, val, len);
- return 0;
-}
-
-/* The length of all options needs to be a multiple of 4 (5.8) */
-static void dccp_insert_option_padding(struct sk_buff *skb)
-{
- int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
-
- if (padding != 0) {
- padding = 4 - padding;
- memset(skb_push(skb, padding), 0, padding);
- DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
- }
-}
-
-int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
-
- if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
- return -1;
-
- if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
-
- /* Feature Negotiation */
- if (dccp_feat_insert_opts(dp, NULL, skb))
- return -1;
-
- if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
- /*
- * Obtain RTT sample from Request/Response exchange.
- * This is currently used for TFRC initialisation.
- */
- if (dccp_insert_option_timestamp(skb))
- return -1;
-
- } else if (dccp_ackvec_pending(sk) &&
- dccp_insert_option_ackvec(sk, skb)) {
- return -1;
- }
- }
-
- if (dp->dccps_hc_rx_insert_options) {
- if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
- return -1;
- dp->dccps_hc_rx_insert_options = 0;
- }
-
- if (dp->dccps_timestamp_echo != 0 &&
- dccp_insert_option_timestamp_echo(dp, NULL, skb))
- return -1;
-
- dccp_insert_option_padding(skb);
- return 0;
-}
-
-int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
-{
- DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
-
- if (dccp_feat_insert_opts(NULL, dreq, skb))
- return -1;
-
- /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
- if (dccp_insert_option_timestamp(skb))
- return -1;
-
- if (dreq->dreq_timestamp_echo != 0 &&
- dccp_insert_option_timestamp_echo(NULL, dreq, skb))
- return -1;
-
- dccp_insert_option_padding(skb);
- return 0;
-}
diff --git a/net/dccp/output.c b/net/dccp/output.c
deleted file mode 100644
index 39cf3430177a..000000000000
--- a/net/dccp/output.c
+++ /dev/null
@@ -1,708 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/output.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/sched/signal.h>
-
-#include <net/inet_sock.h>
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-static inline void dccp_event_ack_sent(struct sock *sk)
-{
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-}
-
-/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
-static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
-{
- skb_set_owner_w(skb, sk);
- WARN_ON(sk->sk_send_head);
- sk->sk_send_head = skb;
- return skb_clone(sk->sk_send_head, gfp_any());
-}
-
-/*
- * All SKB's seen here are completely headerless. It is our
- * job to build the DCCP header, and pass the packet down to
- * IP so it can do the same plus pass the packet off to the
- * device.
- */
-static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
-{
- if (likely(skb != NULL)) {
- struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- struct dccp_hdr *dh;
- /* XXX For now we're using only 48 bits sequence numbers */
- const u32 dccp_header_size = sizeof(*dh) +
- sizeof(struct dccp_hdr_ext) +
- dccp_packet_hdr_len(dcb->dccpd_type);
- int err, set_ack = 1;
- u64 ackno = dp->dccps_gsr;
- /*
- * Increment GSS here already in case the option code needs it.
- * Update GSS for real only if option processing below succeeds.
- */
- dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_DATA:
- set_ack = 0;
- fallthrough;
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_RESET:
- break;
-
- case DCCP_PKT_REQUEST:
- set_ack = 0;
- /* Use ISS on the first (non-retransmitted) Request. */
- if (icsk->icsk_retransmits == 0)
- dcb->dccpd_seq = dp->dccps_iss;
- fallthrough;
-
- case DCCP_PKT_SYNC:
- case DCCP_PKT_SYNCACK:
- ackno = dcb->dccpd_ack_seq;
- fallthrough;
- default:
- /*
- * Set owner/destructor: some skbs are allocated via
- * alloc_skb (e.g. when retransmission may happen).
- * Only Data, DataAck, and Reset packets should come
- * through here with skb->sk set.
- */
- WARN_ON(skb->sk);
- skb_set_owner_w(skb, sk);
- break;
- }
-
- if (dccp_insert_options(sk, skb)) {
- kfree_skb(skb);
- return -EPROTO;
- }
-
-
- /* Build DCCP header and checksum it. */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_type = dcb->dccpd_type;
- dh->dccph_sport = inet->inet_sport;
- dh->dccph_dport = inet->inet_dport;
- dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
- dh->dccph_ccval = dcb->dccpd_ccval;
- dh->dccph_cscov = dp->dccps_pcslen;
- /* XXX For now we're using only 48 bits sequence numbers */
- dh->dccph_x = 1;
-
- dccp_update_gss(sk, dcb->dccpd_seq);
- dccp_hdr_set_seq(dh, dp->dccps_gss);
- if (set_ack)
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_REQUEST:
- dccp_hdr_request(skb)->dccph_req_service =
- dp->dccps_service;
- /*
- * Limit Ack window to ISS <= P.ackno <= GSS, so that
- * only Responses to Requests we sent are considered.
- */
- dp->dccps_awl = dp->dccps_iss;
- break;
- case DCCP_PKT_RESET:
- dccp_hdr_reset(skb)->dccph_reset_code =
- dcb->dccpd_reset_code;
- break;
- }
-
- icsk->icsk_af_ops->send_check(sk, skb);
-
- if (set_ack)
- dccp_event_ack_sent(sk);
-
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
- return net_xmit_eval(err);
- }
- return -ENOBUFS;
-}
-
-/**
- * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
- * @dp: socket to find packet size limits of
- *
- * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
- * since the RX CCID is restricted to feedback packets (Acks), which are small
- * in comparison with the data traffic. A value of 0 means "no current CCMPS".
- */
-static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
-{
- const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
-
- if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
- return 0;
- return tx_ccid->ccid_ops->ccid_ccmps;
-}
-
-unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- u32 ccmps = dccp_determine_ccmps(dp);
- u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
-
- /* Account for header lengths and IPv4/v6 option overhead */
- cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
- sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
-
- /*
- * Leave enough headroom for common DCCP header options.
- * This only considers options which may appear on DCCP-Data packets, as
- * per table 3 in RFC 4340, 5.8. When running out of space for other
- * options (eg. Ack Vector which can take up to 255 bytes), it is better
- * to schedule a separate Ack. Thus we leave headroom for the following:
- * - 1 byte for Slow Receiver (11.6)
- * - 6 bytes for Timestamp (13.1)
- * - 10 bytes for Timestamp Echo (13.3)
- * - 8 bytes for NDP count (7.7, when activated)
- * - 6 bytes for Data Checksum (9.3)
- * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
- */
- cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
- (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
-
- /* And store cached results */
- icsk->icsk_pmtu_cookie = pmtu;
- WRITE_ONCE(dp->dccps_mss_cache, cur_mps);
-
- return cur_mps;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sync_mss);
-
-void dccp_write_space(struct sock *sk)
-{
- struct socket_wq *wq;
-
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
- wake_up_interruptible(&wq->wait);
- /* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
-
- rcu_read_unlock();
-}
-
-/**
- * dccp_wait_for_ccid - Await CCID send permission
- * @sk: socket to wait for
- * @delay: timeout in jiffies
- *
- * This is used by CCIDs which need to delay the send time in process context.
- */
-static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
-{
- DEFINE_WAIT(wait);
- long remaining;
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- sk->sk_write_pending++;
- release_sock(sk);
-
- remaining = schedule_timeout(delay);
-
- lock_sock(sk);
- sk->sk_write_pending--;
- finish_wait(sk_sleep(sk), &wait);
-
- if (signal_pending(current) || sk->sk_err)
- return -1;
- return remaining;
-}
-
-/**
- * dccp_xmit_packet - Send data packet under control of CCID
- * @sk: socket to send data packet on
- *
- * Transmits next-queued payload and informs CCID to account for the packet.
- */
-static void dccp_xmit_packet(struct sock *sk)
-{
- int err, len;
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb = dccp_qpolicy_pop(sk);
-
- if (unlikely(skb == NULL))
- return;
- len = skb->len;
-
- if (sk->sk_state == DCCP_PARTOPEN) {
- const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
- /*
- * See 8.1.5 - Handshake Completion.
- *
- * For robustness we resend Confirm options until the client has
- * entered OPEN. During the initial feature negotiation, the MPS
- * is smaller than usual, reduced by the Change/Confirm options.
- */
- if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
- DCCP_WARN("Payload too large (%d) for featneg.\n", len);
- dccp_send_ack(sk);
- dccp_feat_list_purge(&dp->dccps_featneg);
- }
-
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk)) {
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
- } else {
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
- }
-
- err = dccp_transmit_skb(sk, skb);
- if (err)
- dccp_pr_debug("transmit_skb() returned err=%d\n", err);
- /*
- * Register this one as sent even if an error occurred. To the remote
- * end a local packet drop is indistinguishable from network loss, i.e.
- * any local drop will eventually be reported via receiver feedback.
- */
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
-
- /*
- * If the CCID needs to transfer additional header options out-of-band
- * (e.g. Ack Vectors or feature-negotiation options), it activates this
- * flag to schedule a Sync. The Sync will automatically incorporate all
- * currently pending header options, thus clearing the backlog.
- */
- if (dp->dccps_sync_scheduled)
- dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
-}
-
-/**
- * dccp_flush_write_queue - Drain queue at end of connection
- * @sk: socket to be drained
- * @time_budget: time allowed to drain the queue
- *
- * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
- * happen that the TX queue is not empty at the end of a connection. We give the
- * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
- * returns with a non-empty write queue, it will be purged later.
- */
-void dccp_flush_write_queue(struct sock *sk, long *time_budget)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- long delay, rc;
-
- while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- switch (ccid_packet_dequeue_eval(rc)) {
- case CCID_PACKET_WILL_DEQUEUE_LATER:
- /*
- * If the CCID determines when to send, the next sending
- * time is unknown or the CCID may not even send again
- * (e.g. remote host crashes or lost Ack packets).
- */
- DCCP_WARN("CCID did not manage to send all packets\n");
- return;
- case CCID_PACKET_DELAY:
- delay = msecs_to_jiffies(rc);
- if (delay > *time_budget)
- return;
- rc = dccp_wait_for_ccid(sk, delay);
- if (rc < 0)
- return;
- *time_budget -= (delay - rc);
- /* check again if we can send now */
- break;
- case CCID_PACKET_SEND_AT_ONCE:
- dccp_xmit_packet(sk);
- break;
- case CCID_PACKET_ERR:
- skb_dequeue(&sk->sk_write_queue);
- kfree_skb(skb);
- dccp_pr_debug("packet discarded due to err=%ld\n", rc);
- }
- }
-}
-
-void dccp_write_xmit(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
-
- while ((skb = dccp_qpolicy_top(sk))) {
- int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- switch (ccid_packet_dequeue_eval(rc)) {
- case CCID_PACKET_WILL_DEQUEUE_LATER:
- return;
- case CCID_PACKET_DELAY:
- sk_reset_timer(sk, &dp->dccps_xmit_timer,
- jiffies + msecs_to_jiffies(rc));
- return;
- case CCID_PACKET_SEND_AT_ONCE:
- dccp_xmit_packet(sk);
- break;
- case CCID_PACKET_ERR:
- dccp_qpolicy_drop(sk, skb);
- dccp_pr_debug("packet discarded due to err=%d\n", rc);
- }
- }
-}
-
-/**
- * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
- * @sk: socket to perform retransmit on
- *
- * There are only four retransmittable packet types in DCCP:
- * - Request in client-REQUEST state (sec. 8.1.1),
- * - CloseReq in server-CLOSEREQ state (sec. 8.3),
- * - Close in node-CLOSING state (sec. 8.3),
- * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
- * This function expects sk->sk_send_head to contain the original skb.
- */
-int dccp_retransmit_skb(struct sock *sk)
-{
- WARN_ON(sk->sk_send_head == NULL);
-
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
- return -EHOSTUNREACH; /* Routing failure or similar. */
-
- /* this count is used to distinguish original and retransmitted skb */
- inet_csk(sk)->icsk_retransmits++;
-
- return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
-}
-
-struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
-{
- struct dccp_hdr *dh;
- struct dccp_request_sock *dreq;
- const u32 dccp_header_size = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_response);
- struct sk_buff *skb;
-
- /* sk is marked const to clearly express we dont hold socket lock.
- * sock_wmalloc() will atomically change sk->sk_wmem_alloc,
- * it is safe to promote sk to non const.
- */
- skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1,
- GFP_ATOMIC);
- if (!skb)
- return NULL;
-
- skb_reserve(skb, MAX_DCCP_HEADER);
-
- skb_dst_set(skb, dst_clone(dst));
-
- dreq = dccp_rsk(req);
- if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
- dccp_inc_seqno(&dreq->dreq_gss);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
-
- /* Resolve feature dependencies resulting from choice of CCID */
- if (dccp_feat_server_ccid_dependencies(dreq))
- goto response_failed;
-
- if (dccp_insert_options_rsk(dreq, skb))
- goto response_failed;
-
- /* Build and checksum header */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
-
- dh->dccph_sport = htons(inet_rsk(req)->ir_num);
- dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
- dh->dccph_doff = (dccp_header_size +
- DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
- dh->dccph_type = DCCP_PKT_RESPONSE;
- dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_gss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
- dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
-
- dccp_csum_outgoing(skb);
-
- /* We use `acked' to remember that a Response was already sent. */
- inet_rsk(req)->acked = 1;
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- return skb;
-response_failed:
- kfree_skb(skb);
- return NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_make_response);
-
-/* answer offending packet in @rcv_skb with Reset from control socket @ctl */
-struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
-{
- struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
- const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct dccp_hdr_reset *dhr;
- struct sk_buff *skb;
-
- skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- skb_reserve(skb, sk->sk_prot->max_header);
-
- /* Swap the send and the receive. */
- dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
-
- dhr = dccp_hdr_reset(skb);
- dhr->dccph_reset_code = dcb->dccpd_reset_code;
-
- switch (dcb->dccpd_reset_code) {
- case DCCP_RESET_CODE_PACKET_ERROR:
- dhr->dccph_reset_data[0] = rxdh->dccph_type;
- break;
- case DCCP_RESET_CODE_OPTION_ERROR:
- case DCCP_RESET_CODE_MANDATORY_ERROR:
- memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
- break;
- }
- /*
- * From RFC 4340, 8.3.1:
- * If P.ackno exists, set R.seqno := P.ackno + 1.
- * Else set R.seqno := 0.
- */
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
-
- dccp_csum_outgoing(skb);
- return skb;
-}
-
-EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
-
-/* send Reset on established socket, to close or abort the connection */
-int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
-{
- struct sk_buff *skb;
- /*
- * FIXME: what if rebuild_header fails?
- * Should we be doing a rebuild_header here?
- */
- int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
-
- if (err != 0)
- return err;
-
- skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
- if (skb == NULL)
- return -ENOBUFS;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
- DCCP_SKB_CB(skb)->dccpd_reset_code = code;
-
- return dccp_transmit_skb(sk, skb);
-}
-
-/*
- * Do all connect socket setups that can be done AF independent.
- */
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct dccp_sock *dp = dccp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- sk->sk_err = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- dccp_sync_mss(sk, dst_mtu(dst));
-
- /* do not connect if feature negotiation setup fails */
- if (dccp_feat_finalise_settings(dccp_sk(sk)))
- return -EPROTO;
-
- /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
- dp->dccps_gar = dp->dccps_iss;
-
- skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
- if (unlikely(skb == NULL))
- return -ENOBUFS;
-
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
-
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
-
- dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
- DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
-
- /* Timer for repeating the REQUEST until an answer. */
- icsk->icsk_retransmits = 0;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, DCCP_RTO_MAX);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_connect);
-
-void dccp_send_ack(struct sock *sk)
-{
- /* If we have been reset, we may not send again. */
- if (sk->sk_state != DCCP_CLOSED) {
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
- GFP_ATOMIC);
-
- if (skb == NULL) {
- inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX,
- DCCP_RTO_MAX);
- return;
- }
-
- /* Reserve space for headers */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
- dccp_transmit_skb(sk, skb);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_ack);
-
-#if 0
-/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
-void dccp_send_delayed_ack(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- /*
- * FIXME: tune this timer. elapsed time fixes the skew, so no problem
- * with using 2s, and active senders also piggyback the ACK into a
- * DATAACK packet, so this is really for quiescent senders.
- */
- unsigned long timeout = jiffies + 2 * HZ;
-
- /* Use new timeout only if there wasn't a older one earlier. */
- if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- *
- * FIXME: check the "about to expire" part
- */
- if (icsk->icsk_ack.blocked) {
- dccp_send_ack(sk);
- return;
- }
-
- if (!time_before(timeout, icsk_delack_timeout(icsk)))
- timeout = icsk_delack_timeout(icsk);
- }
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
-}
-#endif
-
-void dccp_send_sync(struct sock *sk, const u64 ackno,
- const enum dccp_pkt_type pkt_type)
-{
- /*
- * We are not putting this on the write queue, so
- * dccp_transmit_skb() will set the ownership to this
- * sock.
- */
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
-
- if (skb == NULL) {
- /* FIXME: how to make sure the sync is sent? */
- DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
- return;
- }
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
- DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
-
- /*
- * Clear the flag in case the Sync was scheduled for out-of-band data,
- * such as carrying a long Ack Vector.
- */
- dccp_sk(sk)->dccps_sync_scheduled = 0;
-
- dccp_transmit_skb(sk, skb);
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_sync);
-
-/*
- * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
- * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
- * any circumstances.
- */
-void dccp_send_close(struct sock *sk, const int active)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
-
- skb = alloc_skb(sk->sk_prot->max_header, prio);
- if (skb == NULL)
- return;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
- else
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
-
- if (active) {
- skb = dccp_skb_entail(sk, skb);
- /*
- * Retransmission timer for active-close: RFC 4340, 8.3 requires
- * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
- * state can be left. The initial timeout is 2 RTTs.
- * Since RTT measurement is done by the CCIDs, there is no easy
- * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
- * is too low (200ms); we use a high value to avoid unnecessary
- * retransmissions when the link RTT is > 0.2 seconds.
- * FIXME: Let main module sample RTTs and use that instead.
- */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
- }
- dccp_transmit_skb(sk, skb);
-}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
deleted file mode 100644
index fcc5c9d64f46..000000000000
--- a/net/dccp/proto.c
+++ /dev/null
@@ -1,1293 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/proto.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <net/checksum.h>
-
-#include <net/inet_sock.h>
-#include <net/inet_common.h>
-#include <net/sock.h>
-#include <net/xfrm.h>
-
-#include <asm/ioctls.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/poll.h>
-
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
-
-EXPORT_SYMBOL_GPL(dccp_statistics);
-
-DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
-EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);
-
-struct inet_hashinfo dccp_hashinfo;
-EXPORT_SYMBOL_GPL(dccp_hashinfo);
-
-/* the maximum queue length for tx in packets. 0 is no limit */
-int sysctl_dccp_tx_qlen __read_mostly = 5;
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-static const char *dccp_state_name(const int state)
-{
- static const char *const dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
- [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-#endif
-
-void dccp_set_state(struct sock *sk, const int state)
-{
- const int oldstate = sk->sk_state;
-
- dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk,
- dccp_state_name(oldstate), dccp_state_name(state));
- WARN_ON(state == oldstate);
-
- switch (state) {
- case DCCP_OPEN:
- if (oldstate != DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
- /* Client retransmits all Confirm options until entering OPEN */
- if (oldstate == DCCP_PARTOPEN)
- dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
- break;
-
- case DCCP_CLOSED:
- if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
- oldstate == DCCP_CLOSING)
- DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
-
- sk->sk_prot->unhash(sk);
- if (inet_csk(sk)->icsk_bind_hash != NULL &&
- !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
- inet_put_port(sk);
- fallthrough;
- default:
- if (oldstate == DCCP_OPEN)
- DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
- }
-
- /* Change state AFTER socket is unhashed to avoid closed
- * socket sitting in hash tables.
- */
- inet_sk_set_state(sk, state);
-}
-
-EXPORT_SYMBOL_GPL(dccp_set_state);
-
-static void dccp_finish_passive_close(struct sock *sk)
-{
- switch (sk->sk_state) {
- case DCCP_PASSIVE_CLOSE:
- /* Node (client or server) has received Close packet. */
- dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_set_state(sk, DCCP_CLOSED);
- break;
- case DCCP_PASSIVE_CLOSEREQ:
- /*
- * Client received CloseReq. We set the `active' flag so that
- * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
- */
- dccp_send_close(sk, 1);
- dccp_set_state(sk, DCCP_CLOSING);
- }
-}
-
-void dccp_done(struct sock *sk)
-{
- dccp_set_state(sk, DCCP_CLOSED);
- dccp_clear_xmit_timers(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- else
- inet_csk_destroy_sock(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_done);
-
-const char *dccp_packet_name(const int type)
-{
- static const char *const dccp_packet_names[] = {
- [DCCP_PKT_REQUEST] = "REQUEST",
- [DCCP_PKT_RESPONSE] = "RESPONSE",
- [DCCP_PKT_DATA] = "DATA",
- [DCCP_PKT_ACK] = "ACK",
- [DCCP_PKT_DATAACK] = "DATAACK",
- [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PKT_CLOSE] = "CLOSE",
- [DCCP_PKT_RESET] = "RESET",
- [DCCP_PKT_SYNC] = "SYNC",
- [DCCP_PKT_SYNCACK] = "SYNCACK",
- };
-
- if (type >= DCCP_NR_PKT_TYPES)
- return "INVALID";
- else
- return dccp_packet_names[type];
-}
-
-EXPORT_SYMBOL_GPL(dccp_packet_name);
-
-void dccp_destruct_common(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_tx_ccid = NULL;
-}
-EXPORT_SYMBOL_GPL(dccp_destruct_common);
-
-static void dccp_sk_destruct(struct sock *sk)
-{
- dccp_destruct_common(sk);
- inet_sock_destruct(sk);
-}
-
-int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, "
- "please contact the netdev mailing list\n");
-
- icsk->icsk_rto = DCCP_TIMEOUT_INIT;
- icsk->icsk_syn_retries = sysctl_dccp_request_retries;
- sk->sk_state = DCCP_CLOSED;
- sk->sk_write_space = dccp_write_space;
- sk->sk_destruct = dccp_sk_destruct;
- icsk->icsk_sync_mss = dccp_sync_mss;
- dp->dccps_mss_cache = 536;
- dp->dccps_rate_last = jiffies;
- dp->dccps_role = DCCP_ROLE_UNDEFINED;
- dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
-
- dccp_init_xmit_timers(sk);
-
- INIT_LIST_HEAD(&dp->dccps_featneg);
- /* control socket doesn't need feat nego */
- if (likely(ctl_sock_initialized))
- return dccp_feat_init(sk);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_init_sock);
-
-void dccp_destroy_sock(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- __skb_queue_purge(&sk->sk_write_queue);
- if (sk->sk_send_head != NULL) {
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- /* Clean up a referenced DCCP bind bucket. */
- if (inet_csk(sk)->icsk_bind_hash != NULL)
- inet_put_port(sk);
-
- kfree(dp->dccps_service_list);
- dp->dccps_service_list = NULL;
-
- if (dp->dccps_hc_rx_ackvec != NULL) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = NULL;
-
- /* clean up feature negotiation state */
- dccp_feat_list_purge(&dp->dccps_featneg);
-}
-
-EXPORT_SYMBOL_GPL(dccp_destroy_sock);
-
-static inline int dccp_need_reset(int state)
-{
- return state != DCCP_CLOSED && state != DCCP_LISTEN &&
- state != DCCP_REQUESTING;
-}
-
-int dccp_disconnect(struct sock *sk, int flags)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- const int old_state = sk->sk_state;
-
- if (old_state != DCCP_CLOSED)
- dccp_set_state(sk, DCCP_CLOSED);
-
- /*
- * This corresponds to the ABORT function of RFC793, sec. 3.8
- * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
- */
- if (old_state == DCCP_LISTEN) {
- inet_csk_listen_stop(sk);
- } else if (dccp_need_reset(old_state)) {
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- sk->sk_err = ECONNRESET;
- } else if (old_state == DCCP_REQUESTING)
- sk->sk_err = ECONNRESET;
-
- dccp_clear_xmit_timers(sk);
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = NULL;
-
- __skb_queue_purge(&sk->sk_receive_queue);
- __skb_queue_purge(&sk->sk_write_queue);
- if (sk->sk_send_head != NULL) {
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- inet->inet_dport = 0;
-
- inet_bhash2_reset_saddr(sk);
-
- sk->sk_shutdown = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- icsk->icsk_backoff = 0;
- inet_csk_delack_init(sk);
- __sk_dst_reset(sk);
-
- WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
-
- sk_error_report(sk);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_disconnect);
-
-/*
- * Wait for a DCCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
- */
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- struct sock *sk = sock->sk;
- __poll_t mask;
- u8 shutdown;
- int state;
-
- sock_poll_wait(file, sock, wait);
-
- state = inet_sk_state_load(sk);
- if (state == DCCP_LISTEN)
- return inet_csk_listen_poll(sk);
-
- /* Socket is not locked. We are protected from async events
- by poll logic and correct handling of state changes
- made by another threads is impossible in any case.
- */
-
- mask = 0;
- if (READ_ONCE(sk->sk_err))
- mask = EPOLLERR;
- shutdown = READ_ONCE(sk->sk_shutdown);
-
- if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED)
- mask |= EPOLLHUP;
- if (shutdown & RCV_SHUTDOWN)
- mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
-
- /* Connected? */
- if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
- if (atomic_read(&sk->sk_rmem_alloc) > 0)
- mask |= EPOLLIN | EPOLLRDNORM;
-
- if (!(shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
- mask |= EPOLLOUT | EPOLLWRNORM;
- } else { /* send SIGIO later */
- sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* Race breaker. If space is freed after
- * wspace test but before the flags are set,
- * IO signal will be lost.
- */
- if (sk_stream_is_writeable(sk))
- mask |= EPOLLOUT | EPOLLWRNORM;
- }
- }
- }
- return mask;
-}
-EXPORT_SYMBOL_GPL(dccp_poll);
-
-int dccp_ioctl(struct sock *sk, int cmd, int *karg)
-{
- int rc = -ENOTCONN;
-
- lock_sock(sk);
-
- if (sk->sk_state == DCCP_LISTEN)
- goto out;
-
- switch (cmd) {
- case SIOCOUTQ: {
- *karg = sk_wmem_alloc_get(sk);
- /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and
- * always 0, comparably to UDP.
- */
-
- rc = 0;
- }
- break;
- case SIOCINQ: {
- struct sk_buff *skb;
- *karg = 0;
-
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL) {
- /*
- * We will only return the amount of this packet since
- * that is all that will be read.
- */
- *karg = skb->len;
- }
- rc = 0;
- }
- break;
- default:
- rc = -ENOIOCTLCMD;
- break;
- }
-out:
- release_sock(sk);
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(dccp_ioctl);
-
-static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
- sockptr_t optval, unsigned int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_service_list *sl = NULL;
-
- if (service == DCCP_SERVICE_INVALID_VALUE ||
- optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
- return -EINVAL;
-
- if (optlen > sizeof(service)) {
- sl = kmalloc(optlen, GFP_KERNEL);
- if (sl == NULL)
- return -ENOMEM;
-
- sl->dccpsl_nr = optlen / sizeof(u32) - 1;
- if (copy_from_sockptr_offset(sl->dccpsl_list, optval,
- sizeof(service), optlen - sizeof(service)) ||
- dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
- kfree(sl);
- return -EFAULT;
- }
- }
-
- lock_sock(sk);
- dp->dccps_service = service;
-
- kfree(dp->dccps_service_list);
-
- dp->dccps_service_list = sl;
- release_sock(sk);
- return 0;
-}
-
-static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
-{
- u8 *list, len;
- int i, rc;
-
- if (cscov < 0 || cscov > 15)
- return -EINVAL;
- /*
- * Populate a list of permissible values, in the range cscov...15. This
- * is necessary since feature negotiation of single values only works if
- * both sides incidentally choose the same value. Since the list starts
- * lowest-value first, negotiation will pick the smallest shared value.
- */
- if (cscov == 0)
- return 0;
- len = 16 - cscov;
-
- list = kmalloc(len, GFP_KERNEL);
- if (list == NULL)
- return -ENOBUFS;
-
- for (i = 0; i < len; i++)
- list[i] = cscov++;
-
- rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
-
- if (rc == 0) {
- if (rx)
- dccp_sk(sk)->dccps_pcrlen = cscov;
- else
- dccp_sk(sk)->dccps_pcslen = cscov;
- }
- kfree(list);
- return rc;
-}
-
-static int dccp_setsockopt_ccid(struct sock *sk, int type,
- sockptr_t optval, unsigned int optlen)
-{
- u8 *val;
- int rc = 0;
-
- if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
- return -EINVAL;
-
- val = memdup_sockptr(optval, optlen);
- if (IS_ERR(val))
- return PTR_ERR(val);
-
- lock_sock(sk);
- if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
- rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
-
- if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
- rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
- release_sock(sk);
-
- kfree(val);
- return rc;
-}
-
-static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- int val, err = 0;
-
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_CHANGE_L:
- case DCCP_SOCKOPT_CHANGE_R:
- DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_CCID:
- case DCCP_SOCKOPT_RX_CCID:
- case DCCP_SOCKOPT_TX_CCID:
- return dccp_setsockopt_ccid(sk, optname, optval, optlen);
- }
-
- if (optlen < (int)sizeof(int))
- return -EINVAL;
-
- if (copy_from_sockptr(&val, optval, sizeof(int)))
- return -EFAULT;
-
- if (optname == DCCP_SOCKOPT_SERVICE)
- return dccp_setsockopt_service(sk, val, optval, optlen);
-
- lock_sock(sk);
- switch (optname) {
- case DCCP_SOCKOPT_SERVER_TIMEWAIT:
- if (dp->dccps_role != DCCP_ROLE_SERVER)
- err = -EOPNOTSUPP;
- else
- dp->dccps_server_timewait = (val != 0);
- break;
- case DCCP_SOCKOPT_SEND_CSCOV:
- err = dccp_setsockopt_cscov(sk, val, false);
- break;
- case DCCP_SOCKOPT_RECV_CSCOV:
- err = dccp_setsockopt_cscov(sk, val, true);
- break;
- case DCCP_SOCKOPT_QPOLICY_ID:
- if (sk->sk_state != DCCP_CLOSED)
- err = -EISCONN;
- else if (val < 0 || val >= DCCPQ_POLICY_MAX)
- err = -EINVAL;
- else
- dp->dccps_qpolicy = val;
- break;
- case DCCP_SOCKOPT_QPOLICY_TXQLEN:
- if (val < 0)
- err = -EINVAL;
- else
- dp->dccps_tx_qlen = val;
- break;
- default:
- err = -ENOPROTOOPT;
- break;
- }
- release_sock(sk);
-
- return err;
-}
-
-int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
- unsigned int optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_setsockopt);
-
-static int dccp_getsockopt_service(struct sock *sk, int len,
- __be32 __user *optval,
- int __user *optlen)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_service_list *sl;
- int err = -ENOENT, slen = 0, total_len = sizeof(u32);
-
- lock_sock(sk);
- if ((sl = dp->dccps_service_list) != NULL) {
- slen = sl->dccpsl_nr * sizeof(u32);
- total_len += slen;
- }
-
- err = -EINVAL;
- if (total_len > len)
- goto out;
-
- err = 0;
- if (put_user(total_len, optlen) ||
- put_user(dp->dccps_service, optval) ||
- (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
- err = -EFAULT;
-out:
- release_sock(sk);
- return err;
-}
-
-static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct dccp_sock *dp;
- int val, len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- if (len < (int)sizeof(int))
- return -EINVAL;
-
- dp = dccp_sk(sk);
-
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_SERVICE:
- return dccp_getsockopt_service(sk, len,
- (__be32 __user *)optval, optlen);
- case DCCP_SOCKOPT_GET_CUR_MPS:
- val = READ_ONCE(dp->dccps_mss_cache);
- break;
- case DCCP_SOCKOPT_AVAILABLE_CCIDS:
- return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
- case DCCP_SOCKOPT_TX_CCID:
- val = ccid_get_current_tx_ccid(dp);
- if (val < 0)
- return -ENOPROTOOPT;
- break;
- case DCCP_SOCKOPT_RX_CCID:
- val = ccid_get_current_rx_ccid(dp);
- if (val < 0)
- return -ENOPROTOOPT;
- break;
- case DCCP_SOCKOPT_SERVER_TIMEWAIT:
- val = dp->dccps_server_timewait;
- break;
- case DCCP_SOCKOPT_SEND_CSCOV:
- val = dp->dccps_pcslen;
- break;
- case DCCP_SOCKOPT_RECV_CSCOV:
- val = dp->dccps_pcrlen;
- break;
- case DCCP_SOCKOPT_QPOLICY_ID:
- val = dp->dccps_qpolicy;
- break;
- case DCCP_SOCKOPT_QPOLICY_TXQLEN:
- val = dp->dccps_tx_qlen;
- break;
- case 128 ... 191:
- return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- case 192 ... 255:
- return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- default:
- return -ENOPROTOOPT;
- }
-
- len = sizeof(val);
- if (put_user(len, optlen) || copy_to_user(optval, &val, len))
- return -EFAULT;
-
- return 0;
-}
-
-int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_getsockopt);
-
-static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
-{
- struct cmsghdr *cmsg;
-
- /*
- * Assign an (opaque) qpolicy priority value to skb->priority.
- *
- * We are overloading this skb field for use with the qpolicy subystem.
- * The skb->priority is normally used for the SO_PRIORITY option, which
- * is initialised from sk_priority. Since the assignment of sk_priority
- * to skb->priority happens later (on layer 3), we overload this field
- * for use with queueing priorities as long as the skb is on layer 4.
- * The default priority value (if nothing is set) is 0.
- */
- skb->priority = 0;
-
- for_each_cmsghdr(cmsg, msg) {
- if (!CMSG_OK(msg, cmsg))
- return -EINVAL;
-
- if (cmsg->cmsg_level != SOL_DCCP)
- continue;
-
- if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
- !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
- return -EINVAL;
-
- switch (cmsg->cmsg_type) {
- case DCCP_SCM_PRIORITY:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
- return -EINVAL;
- skb->priority = *(__u32 *)CMSG_DATA(cmsg);
- break;
- default:
- return -EINVAL;
- }
- }
- return 0;
-}
-
-int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const int flags = msg->msg_flags;
- const int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
- int rc, size;
- long timeo;
-
- trace_dccp_probe(sk, len);
-
- if (len > READ_ONCE(dp->dccps_mss_cache))
- return -EMSGSIZE;
-
- lock_sock(sk);
-
- timeo = sock_sndtimeo(sk, noblock);
-
- /*
- * We have to use sk_stream_wait_connect here to set sk_write_pending,
- * so that the trick in dccp_rcv_request_sent_state_process.
- */
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
- if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
- goto out_release;
-
- size = sk->sk_prot->max_header + len;
- release_sock(sk);
- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
- lock_sock(sk);
- if (skb == NULL)
- goto out_release;
-
- if (dccp_qpolicy_full(sk)) {
- rc = -EAGAIN;
- goto out_discard;
- }
-
- if (sk->sk_state == DCCP_CLOSED) {
- rc = -ENOTCONN;
- goto out_discard;
- }
-
- /* We need to check dccps_mss_cache after socket is locked. */
- if (len > dp->dccps_mss_cache) {
- rc = -EMSGSIZE;
- goto out_discard;
- }
-
- skb_reserve(skb, sk->sk_prot->max_header);
- rc = memcpy_from_msg(skb_put(skb, len), msg, len);
- if (rc != 0)
- goto out_discard;
-
- rc = dccp_msghdr_parse(msg, skb);
- if (rc != 0)
- goto out_discard;
-
- dccp_qpolicy_push(sk, skb);
- /*
- * The xmit_timer is set if the TX CCID is rate-based and will expire
- * when congestion control permits to release further packets into the
- * network. Window-based CCIDs do not use this timer.
- */
- if (!timer_pending(&dp->dccps_xmit_timer))
- dccp_write_xmit(sk);
-out_release:
- release_sock(sk);
- return rc ? : len;
-out_discard:
- kfree_skb(skb);
- goto out_release;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sendmsg);
-
-int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
- int *addr_len)
-{
- const struct dccp_hdr *dh;
- long timeo;
-
- lock_sock(sk);
-
- if (sk->sk_state == DCCP_LISTEN) {
- len = -ENOTCONN;
- goto out;
- }
-
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- do {
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- if (skb == NULL)
- goto verify_sock_status;
-
- dh = dccp_hdr(skb);
-
- switch (dh->dccph_type) {
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- goto found_ok_skb;
-
- case DCCP_PKT_CLOSE:
- case DCCP_PKT_CLOSEREQ:
- if (!(flags & MSG_PEEK))
- dccp_finish_passive_close(sk);
- fallthrough;
- case DCCP_PKT_RESET:
- dccp_pr_debug("found fin (%s) ok!\n",
- dccp_packet_name(dh->dccph_type));
- len = 0;
- goto found_fin_ok;
- default:
- dccp_pr_debug("packet_type=%s\n",
- dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb);
- }
-verify_sock_status:
- if (sock_flag(sk, SOCK_DONE)) {
- len = 0;
- break;
- }
-
- if (sk->sk_err) {
- len = sock_error(sk);
- break;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- len = 0;
- break;
- }
-
- if (sk->sk_state == DCCP_CLOSED) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- len = -ENOTCONN;
- break;
- }
- len = 0;
- break;
- }
-
- if (!timeo) {
- len = -EAGAIN;
- break;
- }
-
- if (signal_pending(current)) {
- len = sock_intr_errno(timeo);
- break;
- }
-
- sk_wait_data(sk, &timeo, NULL);
- continue;
- found_ok_skb:
- if (len > skb->len)
- len = skb->len;
- else if (len < skb->len)
- msg->msg_flags |= MSG_TRUNC;
-
- if (skb_copy_datagram_msg(skb, 0, msg, len)) {
- /* Exception. Bailout! */
- len = -EFAULT;
- break;
- }
- if (flags & MSG_TRUNC)
- len = skb->len;
- found_fin_ok:
- if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
- break;
- } while (1);
-out:
- release_sock(sk);
- return len;
-}
-
-EXPORT_SYMBOL_GPL(dccp_recvmsg);
-
-int inet_dccp_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- unsigned char old_state;
- int err;
-
- lock_sock(sk);
-
- err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
- goto out;
-
- old_state = sk->sk_state;
- if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
- goto out;
-
- WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
- /* Really, if the socket is already in listen state
- * we can only allow the backlog to be adjusted.
- */
- if (old_state != DCCP_LISTEN) {
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_role = DCCP_ROLE_LISTEN;
-
- /* do not start to listen if feature negotiation setup fails */
- if (dccp_feat_finalise_settings(dp)) {
- err = -EPROTO;
- goto out;
- }
-
- err = inet_csk_listen_start(sk);
- if (err)
- goto out;
- }
- err = 0;
-
-out:
- release_sock(sk);
- return err;
-}
-
-EXPORT_SYMBOL_GPL(inet_dccp_listen);
-
-static void dccp_terminate_connection(struct sock *sk)
-{
- u8 next_state = DCCP_CLOSED;
-
- switch (sk->sk_state) {
- case DCCP_PASSIVE_CLOSE:
- case DCCP_PASSIVE_CLOSEREQ:
- dccp_finish_passive_close(sk);
- break;
- case DCCP_PARTOPEN:
- dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- fallthrough;
- case DCCP_OPEN:
- dccp_send_close(sk, 1);
-
- if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
- !dccp_sk(sk)->dccps_server_timewait)
- next_state = DCCP_ACTIVE_CLOSEREQ;
- else
- next_state = DCCP_CLOSING;
- fallthrough;
- default:
- dccp_set_state(sk, next_state);
- }
-}
-
-void dccp_close(struct sock *sk, long timeout)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- u32 data_was_unread = 0;
- int state;
-
- lock_sock(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (sk->sk_state == DCCP_LISTEN) {
- dccp_set_state(sk, DCCP_CLOSED);
-
- /* Special case. */
- inet_csk_listen_stop(sk);
-
- goto adjudge_to_death;
- }
-
- sk_stop_timer(sk, &dp->dccps_xmit_timer);
-
- /*
- * We need to flush the recv. buffs. We do this only on the
- * descriptor close, not protocol-sourced closes, because the
- *reader process may not have drained the data yet!
- */
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- data_was_unread += skb->len;
- __kfree_skb(skb);
- }
-
- /* If socket has been already reset kill it. */
- if (sk->sk_state == DCCP_CLOSED)
- goto adjudge_to_death;
-
- if (data_was_unread) {
- /* Unread data was tossed, send an appropriate Reset Code */
- DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- dccp_set_state(sk, DCCP_CLOSED);
- } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
- /* Check zero linger _after_ checking for unread data. */
- sk->sk_prot->disconnect(sk, 0);
- } else if (sk->sk_state != DCCP_CLOSED) {
- /*
- * Normal connection termination. May need to wait if there are
- * still packets in the TX queue that are delayed by the CCID.
- */
- dccp_flush_write_queue(sk, &timeout);
- dccp_terminate_connection(sk);
- }
-
- /*
- * Flush write queue. This may be necessary in several cases:
- * - we have been closed by the peer but still have application data;
- * - abortive termination (unread data or zero linger time),
- * - normal termination but queue could not be flushed within time limit
- */
- __skb_queue_purge(&sk->sk_write_queue);
-
- sk_stream_wait_close(sk, timeout);
-
-adjudge_to_death:
- state = sk->sk_state;
- sock_hold(sk);
- sock_orphan(sk);
-
- /*
- * It is the last release_sock in its life. It will remove backlog.
- */
- release_sock(sk);
- /*
- * Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
- local_bh_disable();
- bh_lock_sock(sk);
- WARN_ON(sock_owned_by_user(sk));
-
- this_cpu_inc(dccp_orphan_count);
-
- /* Have we already been destroyed by a softirq or backlog? */
- if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
- goto out;
-
- if (sk->sk_state == DCCP_CLOSED)
- inet_csk_destroy_sock(sk);
-
- /* Otherwise, socket is reprieved until protocol close. */
-
-out:
- bh_unlock_sock(sk);
- local_bh_enable();
- sock_put(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_close);
-
-void dccp_shutdown(struct sock *sk, int how)
-{
- dccp_pr_debug("called shutdown(%x)\n", how);
-}
-
-EXPORT_SYMBOL_GPL(dccp_shutdown);
-
-static inline int __init dccp_mib_init(void)
-{
- dccp_statistics = alloc_percpu(struct dccp_mib);
- if (!dccp_statistics)
- return -ENOMEM;
- return 0;
-}
-
-static inline void dccp_mib_exit(void)
-{
- free_percpu(dccp_statistics);
-}
-
-static int thash_entries;
-module_param(thash_entries, int, 0444);
-MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-bool dccp_debug;
-module_param(dccp_debug, bool, 0644);
-MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
-
-EXPORT_SYMBOL_GPL(dccp_debug);
-#endif
-
-static int __init dccp_init(void)
-{
- unsigned long goal;
- unsigned long nr_pages = totalram_pages();
- int ehash_order, bhash_order, i;
- int rc;
-
- BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
- sizeof_field(struct sk_buff, cb));
- rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
- if (rc)
- goto out_fail;
- rc = -ENOBUFS;
- dccp_hashinfo.bind_bucket_cachep =
- kmem_cache_create("dccp_bind_bucket",
- sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
- if (!dccp_hashinfo.bind_bucket_cachep)
- goto out_free_hashinfo2;
- dccp_hashinfo.bind2_bucket_cachep =
- kmem_cache_create("dccp_bind2_bucket",
- sizeof(struct inet_bind2_bucket), 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
- if (!dccp_hashinfo.bind2_bucket_cachep)
- goto out_free_bind_bucket_cachep;
-
- /*
- * Size and allocate the main established and bind bucket
- * hash tables.
- *
- * The methodology is similar to that of the buffer cache.
- */
- if (nr_pages >= (128 * 1024))
- goal = nr_pages >> (21 - PAGE_SHIFT);
- else
- goal = nr_pages >> (23 - PAGE_SHIFT);
-
- if (thash_entries)
- goal = (thash_entries *
- sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
- for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
- ;
- do {
- unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
- sizeof(struct inet_ehash_bucket);
-
- while (hash_size & (hash_size - 1))
- hash_size--;
- dccp_hashinfo.ehash_mask = hash_size - 1;
- dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
- } while (!dccp_hashinfo.ehash && --ehash_order > 0);
-
- if (!dccp_hashinfo.ehash) {
- DCCP_CRIT("Failed to allocate DCCP established hash table");
- goto out_free_bind2_bucket_cachep;
- }
-
- for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
- INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
-
- if (inet_ehash_locks_alloc(&dccp_hashinfo))
- goto out_free_dccp_ehash;
-
- bhash_order = ehash_order;
-
- do {
- dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
- sizeof(struct inet_bind_hashbucket);
- if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
- bhash_order > 0)
- continue;
- dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
- } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
-
- if (!dccp_hashinfo.bhash) {
- DCCP_CRIT("Failed to allocate DCCP bind hash table");
- goto out_free_dccp_locks;
- }
-
- dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
-
- if (!dccp_hashinfo.bhash2) {
- DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
- goto out_free_dccp_bhash;
- }
-
- for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
- spin_lock_init(&dccp_hashinfo.bhash[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
- spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
- }
-
- dccp_hashinfo.pernet = false;
-
- rc = dccp_mib_init();
- if (rc)
- goto out_free_dccp_bhash2;
-
- rc = dccp_ackvec_init();
- if (rc)
- goto out_free_dccp_mib;
-
- rc = dccp_sysctl_init();
- if (rc)
- goto out_ackvec_exit;
-
- rc = ccid_initialize_builtins();
- if (rc)
- goto out_sysctl_exit;
-
- dccp_timestamping_init();
-
- return 0;
-
-out_sysctl_exit:
- dccp_sysctl_exit();
-out_ackvec_exit:
- dccp_ackvec_exit();
-out_free_dccp_mib:
- dccp_mib_exit();
-out_free_dccp_bhash2:
- free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
-out_free_dccp_bhash:
- free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
-out_free_dccp_locks:
- inet_ehash_locks_free(&dccp_hashinfo);
-out_free_dccp_ehash:
- free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
-out_free_bind2_bucket_cachep:
- kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
-out_free_bind_bucket_cachep:
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
-out_free_hashinfo2:
- inet_hashinfo2_free_mod(&dccp_hashinfo);
-out_fail:
- dccp_hashinfo.bhash = NULL;
- dccp_hashinfo.bhash2 = NULL;
- dccp_hashinfo.ehash = NULL;
- dccp_hashinfo.bind_bucket_cachep = NULL;
- dccp_hashinfo.bind2_bucket_cachep = NULL;
- return rc;
-}
-
-static void __exit dccp_fini(void)
-{
- int bhash_order = get_order(dccp_hashinfo.bhash_size *
- sizeof(struct inet_bind_hashbucket));
-
- ccid_cleanup_builtins();
- dccp_mib_exit();
- free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
- free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
- free_pages((unsigned long)dccp_hashinfo.ehash,
- get_order((dccp_hashinfo.ehash_mask + 1) *
- sizeof(struct inet_ehash_bucket)));
- inet_ehash_locks_free(&dccp_hashinfo);
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- dccp_ackvec_exit();
- dccp_sysctl_exit();
- inet_hashinfo2_free_mod(&dccp_hashinfo);
-}
-
-module_init(dccp_init);
-module_exit(dccp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
deleted file mode 100644
index 5ba204ec0aca..000000000000
--- a/net/dccp/qpolicy.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/qpolicy.c
- *
- * Policy-based packet dequeueing interface for DCCP.
- *
- * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
- */
-#include "dccp.h"
-
-/*
- * Simple Dequeueing Policy:
- * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
- */
-static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
-{
- skb_queue_tail(&sk->sk_write_queue, skb);
-}
-
-static bool qpolicy_simple_full(struct sock *sk)
-{
- return dccp_sk(sk)->dccps_tx_qlen &&
- sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
-}
-
-static struct sk_buff *qpolicy_simple_top(struct sock *sk)
-{
- return skb_peek(&sk->sk_write_queue);
-}
-
-/*
- * Priority-based Dequeueing Policy:
- * If tx_qlen is different from 0 and the queue has reached its upper bound
- * of tx_qlen elements, replace older packets lowest-priority-first.
- */
-static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
-{
- struct sk_buff *skb, *best = NULL;
-
- skb_queue_walk(&sk->sk_write_queue, skb)
- if (best == NULL || skb->priority > best->priority)
- best = skb;
- return best;
-}
-
-static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
-{
- struct sk_buff *skb, *worst = NULL;
-
- skb_queue_walk(&sk->sk_write_queue, skb)
- if (worst == NULL || skb->priority < worst->priority)
- worst = skb;
- return worst;
-}
-
-static bool qpolicy_prio_full(struct sock *sk)
-{
- if (qpolicy_simple_full(sk))
- dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
- return false;
-}
-
-/**
- * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
- * @push: add a new @skb to the write queue
- * @full: indicates that no more packets will be admitted
- * @top: peeks at whatever the queueing policy defines as its `top'
- * @params: parameter passed to policy operation
- */
-struct dccp_qpolicy_operations {
- void (*push) (struct sock *sk, struct sk_buff *skb);
- bool (*full) (struct sock *sk);
- struct sk_buff* (*top) (struct sock *sk);
- __be32 params;
-};
-
-static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = {
- [DCCPQ_POLICY_SIMPLE] = {
- .push = qpolicy_simple_push,
- .full = qpolicy_simple_full,
- .top = qpolicy_simple_top,
- .params = 0,
- },
- [DCCPQ_POLICY_PRIO] = {
- .push = qpolicy_simple_push,
- .full = qpolicy_prio_full,
- .top = qpolicy_prio_best_skb,
- .params = DCCP_SCM_PRIORITY,
- },
-};
-
-/*
- * Externally visible interface
- */
-void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
-{
- qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
-}
-
-bool dccp_qpolicy_full(struct sock *sk)
-{
- return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
-}
-
-void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
-{
- if (skb != NULL) {
- skb_unlink(skb, &sk->sk_write_queue);
- kfree_skb(skb);
- }
-}
-
-struct sk_buff *dccp_qpolicy_top(struct sock *sk)
-{
- return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
-}
-
-struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
-{
- struct sk_buff *skb = dccp_qpolicy_top(sk);
-
- if (skb != NULL) {
- /* Clear any skb fields that we used internally */
- skb->priority = 0;
- skb_unlink(skb, &sk->sk_write_queue);
- }
- return skb;
-}
-
-bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
-{
- /* check if exactly one bit is set */
- if (!param || (param & (param - 1)))
- return false;
- return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
-}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
deleted file mode 100644
index b15845fd6300..000000000000
--- a/net/dccp/sysctl.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * net/dccp/sysctl.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- */
-
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include "dccp.h"
-#include "feat.h"
-
-/* Boundary values */
-static int u8_max = 0xFF;
-static unsigned long seqw_min = DCCPF_SEQ_WMIN,
- seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
-
-static struct ctl_table dccp_default_table[] = {
- {
- .procname = "seq_window",
- .data = &sysctl_dccp_sequence_window,
- .maxlen = sizeof(sysctl_dccp_sequence_window),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
- .extra2 = &seqw_max,
- },
- {
- .procname = "rx_ccid",
- .data = &sysctl_dccp_rx_ccid,
- .maxlen = sizeof(sysctl_dccp_rx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &u8_max, /* RFC 4340, 10. */
- },
- {
- .procname = "tx_ccid",
- .data = &sysctl_dccp_tx_ccid,
- .maxlen = sizeof(sysctl_dccp_tx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &u8_max, /* RFC 4340, 10. */
- },
- {
- .procname = "request_retries",
- .data = &sysctl_dccp_request_retries,
- .maxlen = sizeof(sysctl_dccp_request_retries),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = &u8_max,
- },
- {
- .procname = "retries1",
- .data = &sysctl_dccp_retries1,
- .maxlen = sizeof(sysctl_dccp_retries1),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &u8_max,
- },
- {
- .procname = "retries2",
- .data = &sysctl_dccp_retries2,
- .maxlen = sizeof(sysctl_dccp_retries2),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = &u8_max,
- },
- {
- .procname = "tx_qlen",
- .data = &sysctl_dccp_tx_qlen,
- .maxlen = sizeof(sysctl_dccp_tx_qlen),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "sync_ratelimit",
- .data = &sysctl_dccp_sync_ratelimit,
- .maxlen = sizeof(sysctl_dccp_sync_ratelimit),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
-};
-
-static struct ctl_table_header *dccp_table_header;
-
-int __init dccp_sysctl_init(void)
-{
- dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
- dccp_default_table);
-
- return dccp_table_header != NULL ? 0 : -ENOMEM;
-}
-
-void dccp_sysctl_exit(void)
-{
- if (dccp_table_header != NULL) {
- unregister_net_sysctl_table(dccp_table_header);
- dccp_table_header = NULL;
- }
-}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
deleted file mode 100644
index 232ac4ae0a73..000000000000
--- a/net/dccp/timer.c
+++ /dev/null
@@ -1,272 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dccp/timer.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-#include <linux/export.h>
-
-#include "dccp.h"
-
-/* sysctl variables governing numbers of retransmission attempts */
-int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
-
-static void dccp_write_err(struct sock *sk)
-{
- sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT;
- sk_error_report(sk);
-
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- dccp_done(sk);
- __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT);
-}
-
-/* A write timeout has occurred. Process the after effects. */
-static int dccp_write_timeout(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int retry_until;
-
- if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
- if (icsk->icsk_retransmits != 0)
- dst_negative_advice(sk);
- retry_until = icsk->icsk_syn_retries ?
- : sysctl_dccp_request_retries;
- } else {
- if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
- black hole detection. :-(
-
- It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
-
- "The one security concern raised by this memo is that ICMP black holes
- are often caused by over-zealous security administrators who block
- all ICMP messages. It is vitally important that those who design and
- deploy security systems understand the impact of strict filtering on
- upper-layer protocols. The safest web site in the world is worthless
- if most TCP implementations cannot transfer data from it. It would
- be far nicer to have all of the black holes fixed rather than fixing
- all of the TCP implementations."
-
- Golden words :-).
- */
-
- dst_negative_advice(sk);
- }
-
- retry_until = sysctl_dccp_retries2;
- /*
- * FIXME: see tcp_write_timout and tcp_out_of_resources
- */
- }
-
- if (icsk->icsk_retransmits >= retry_until) {
- /* Has it gone just too far? */
- dccp_write_err(sk);
- return 1;
- }
- return 0;
-}
-
-/*
- * The DCCP retransmit timer.
- */
-static void dccp_retransmit_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- /*
- * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
- * sent, no need to retransmit, this sock is dead.
- */
- if (dccp_write_timeout(sk))
- return;
-
- /*
- * We want to know the number of packets retransmitted, not the
- * total number of retransmissions of clones of original packets.
- */
- if (icsk->icsk_retransmits == 0)
- __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS);
-
- if (dccp_retransmit_skb(sk) != 0) {
- /*
- * Retransmission failed because of local congestion,
- * do not backoff.
- */
- if (--icsk->icsk_retransmits == 0)
- icsk->icsk_retransmits = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto,
- TCP_RESOURCE_PROBE_INTERVAL),
- DCCP_RTO_MAX);
- return;
- }
-
- icsk->icsk_backoff++;
-
- icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
- DCCP_RTO_MAX);
- if (icsk->icsk_retransmits > sysctl_dccp_retries1)
- __sk_dst_reset(sk);
-}
-
-static void dccp_write_timer(struct timer_list *t)
-{
- struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_retransmit_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
- int event = 0;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- jiffies + (HZ / 20));
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
- goto out;
-
- if (time_after(icsk_timeout(icsk), jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- icsk_timeout(icsk));
- goto out;
- }
-
- event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
-
- switch (event) {
- case ICSK_TIME_RETRANS:
- dccp_retransmit_timer(sk);
- break;
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static void dccp_keepalive_timer(struct timer_list *t)
-{
- struct sock *sk = from_timer(sk, t, sk_timer);
-
- pr_err("dccp should not use a keepalive timer !\n");
- sock_put(sk);
-}
-
-/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
-static void dccp_delack_timer(struct timer_list *t)
-{
- struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_delack_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- jiffies + TCP_DELACK_MIN);
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
- if (time_after(icsk_delack_timeout(icsk), jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- icsk_delack_timeout(icsk));
- goto out;
- }
-
- icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
-
- if (inet_csk_ack_scheduled(sk)) {
- if (!inet_csk_in_pingpong_mode(sk)) {
- /* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1,
- icsk->icsk_rto);
- } else {
- /* Delayed ACK missed: leave pingpong mode and
- * deflate ATO.
- */
- inet_csk_exit_pingpong_mode(sk);
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- }
- dccp_send_ack(sk);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/**
- * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
- * @t: pointer to the tasklet associated with this handler
- *
- * See the comments above %ccid_dequeueing_decision for supported modes.
- */
-static void dccp_write_xmitlet(struct tasklet_struct *t)
-{
- struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet);
- struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
- else
- dccp_write_xmit(sk);
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static void dccp_write_xmit_timer(struct timer_list *t)
-{
- struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer);
-
- dccp_write_xmitlet(&dp->dccps_xmitlet);
-}
-
-void dccp_init_xmit_timers(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet);
- timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0);
- inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
- &dccp_keepalive_timer);
-}
-
-static ktime_t dccp_timestamp_seed;
-/**
- * dccp_timestamp - 10s of microseconds time source
- * Returns the number of 10s of microseconds since loading DCCP. This is native
- * DCCP time difference format (RFC 4340, sec. 13).
- * Please note: This will wrap around about circa every 11.9 hours.
- */
-u32 dccp_timestamp(void)
-{
- u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
-
- do_div(delta, 10);
- return delta;
-}
-EXPORT_SYMBOL_GPL(dccp_timestamp);
-
-void __init dccp_timestamping_init(void)
-{
- dccp_timestamp_seed = ktime_get_real();
-}
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
deleted file mode 100644
index 5a43b3508c7f..000000000000
--- a/net/dccp/trace.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM dccp
-
-#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_DCCP_H
-
-#include <net/sock.h>
-#include "dccp.h"
-#include "ccids/ccid3.h"
-#include <linux/tracepoint.h>
-#include <trace/events/net_probe_common.h>
-
-TRACE_EVENT(dccp_probe,
-
- TP_PROTO(struct sock *sk, size_t size),
-
- TP_ARGS(sk, size),
-
- TP_STRUCT__entry(
- /* sockaddr_in6 is always bigger than sockaddr_in */
- __array(__u8, saddr, sizeof(struct sockaddr_in6))
- __array(__u8, daddr, sizeof(struct sockaddr_in6))
- __field(__u16, sport)
- __field(__u16, dport)
- __field(__u16, size)
- __field(__u16, tx_s)
- __field(__u32, tx_rtt)
- __field(__u32, tx_p)
- __field(__u32, tx_x_calc)
- __field(__u64, tx_x_recv)
- __field(__u64, tx_x)
- __field(__u32, tx_t_ipi)
- ),
-
- TP_fast_assign(
- const struct inet_sock *inet = inet_sk(sk);
- struct ccid3_hc_tx_sock *hc = NULL;
-
- if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
- hc = ccid3_hc_tx_sk(sk);
-
- memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
- memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
-
- TP_STORE_ADDR_PORTS(__entry, inet, sk);
-
- /* For filtering use */
- __entry->sport = ntohs(inet->inet_sport);
- __entry->dport = ntohs(inet->inet_dport);
-
- __entry->size = size;
- if (hc) {
- __entry->tx_s = hc->tx_s;
- __entry->tx_rtt = hc->tx_rtt;
- __entry->tx_p = hc->tx_p;
- __entry->tx_x_calc = hc->tx_x_calc;
- __entry->tx_x_recv = hc->tx_x_recv >> 6;
- __entry->tx_x = hc->tx_x >> 6;
- __entry->tx_t_ipi = hc->tx_t_ipi;
- } else {
- __entry->tx_s = 0;
- memset_startat(__entry, 0, tx_rtt);
- }
- ),
-
- TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
- "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
- __entry->saddr, __entry->daddr, __entry->size,
- __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
- __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
- __entry->tx_t_ipi)
-);
-
-#endif /* _TRACE_TCP_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index d6e3db300acb..02602704bdea 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -775,7 +775,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr,
req->version_cb(version_name, version_type,
req->version_cb_priv);
- if (!req->msg)
+ if (!req->msg || !*version_value)
return 0;
nest = nla_nest_start_noflag(req->msg, attr);
diff --git a/net/devlink/health.c b/net/devlink/health.c
index 57db6799722a..b3ce8ecbb7fb 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -735,7 +735,7 @@ static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
return;
}
- item->nla_type = NLA_NUL_STRING;
+ item->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING;
item->len = strlen(name) + 1;
item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
memcpy(&item->value, name, item->len);
@@ -822,32 +822,37 @@ static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
{
devlink_fmsg_err_if_binary(fmsg);
- devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_FLAG);
}
static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
{
devlink_fmsg_err_if_binary(fmsg);
- devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U8);
}
void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
{
devlink_fmsg_err_if_binary(fmsg);
- devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U32);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
{
devlink_fmsg_err_if_binary(fmsg);
- devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U64);
}
void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
{
devlink_fmsg_err_if_binary(fmsg);
- devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING);
+ devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
+ DEVLINK_VAR_ATTR_TYPE_NUL_STRING);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
@@ -857,7 +862,8 @@ void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
if (!fmsg->err && !fmsg->putting_binary)
fmsg->err = -EINVAL;
- devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
+ devlink_fmsg_put_value(fmsg, value, value_len,
+ DEVLINK_VAR_ATTR_TYPE_BINARY);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
@@ -928,43 +934,26 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
static int
-devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb)
-{
- switch (msg->nla_type) {
- case NLA_FLAG:
- case NLA_U8:
- case NLA_U32:
- case NLA_U64:
- case NLA_NUL_STRING:
- case NLA_BINARY:
- return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
- msg->nla_type);
- default:
- return -EINVAL;
- }
-}
-
-static int
devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb)
{
int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
u8 tmp;
switch (msg->nla_type) {
- case NLA_FLAG:
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
/* Always provide flag data, regardless of its value */
tmp = *(bool *)msg->value;
return nla_put_u8(skb, attrtype, tmp);
- case NLA_U8:
+ case DEVLINK_VAR_ATTR_TYPE_U8:
return nla_put_u8(skb, attrtype, *(u8 *)msg->value);
- case NLA_U32:
+ case DEVLINK_VAR_ATTR_TYPE_U32:
return nla_put_u32(skb, attrtype, *(u32 *)msg->value);
- case NLA_U64:
+ case DEVLINK_VAR_ATTR_TYPE_U64:
return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value);
- case NLA_NUL_STRING:
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
return nla_put_string(skb, attrtype, (char *)&msg->value);
- case NLA_BINARY:
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
return nla_put(skb, attrtype, msg->len, (void *)&msg->value);
default:
return -EINVAL;
@@ -998,7 +987,8 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb,
err = nla_put_flag(skb, item->attrtype);
break;
case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA:
- err = devlink_fmsg_item_fill_type(item, skb);
+ err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
+ item->nla_type);
if (err)
break;
err = devlink_fmsg_item_fill_data(item, skb);
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index f9786d51f68f..e340d955cf3b 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -10,6 +10,33 @@
#include <uapi/linux/devlink.h>
+/* Sparse enums validation callbacks */
+static int
+devlink_attr_param_type_validate(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ switch (nla_get_u8(attr)) {
+ case DEVLINK_VAR_ATTR_TYPE_U8:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U16:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U32:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U64:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
+ return 0;
+ }
+ NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value");
+ return -EINVAL;
+}
+
/* Common nested types */
const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = {
[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, },
@@ -273,7 +300,7 @@ static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VA
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
- [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate),
[DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2),
};
diff --git a/net/devlink/param.c b/net/devlink/param.c
index dcf0d1ccebba..b29abf8d3ed4 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -167,25 +167,6 @@ static int devlink_param_set(struct devlink *devlink,
}
static int
-devlink_param_type_to_nla_type(enum devlink_param_type param_type)
-{
- switch (param_type) {
- case DEVLINK_PARAM_TYPE_U8:
- return NLA_U8;
- case DEVLINK_PARAM_TYPE_U16:
- return NLA_U16;
- case DEVLINK_PARAM_TYPE_U32:
- return NLA_U32;
- case DEVLINK_PARAM_TYPE_STRING:
- return NLA_STRING;
- case DEVLINK_PARAM_TYPE_BOOL:
- return NLA_FLAG;
- default:
- return -EINVAL;
- }
-}
-
-static int
devlink_nl_param_value_fill_one(struct sk_buff *msg,
enum devlink_param_type type,
enum devlink_param_cmode cmode,
@@ -247,7 +228,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_param_gset_ctx ctx;
struct nlattr *param_values_list;
struct nlattr *param_attr;
- int nla_type;
void *hdr;
int err;
int i;
@@ -293,11 +273,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
goto param_nest_cancel;
if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
goto param_nest_cancel;
-
- nla_type = devlink_param_type_to_nla_type(param->type);
- if (nla_type < 0)
- goto param_nest_cancel;
- if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type))
goto param_nest_cancel;
param_values_list = nla_nest_start_noflag(msg,
@@ -419,25 +395,7 @@ devlink_param_type_get_from_info(struct genl_info *info,
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
return -EINVAL;
- switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
- case NLA_U8:
- *param_type = DEVLINK_PARAM_TYPE_U8;
- break;
- case NLA_U16:
- *param_type = DEVLINK_PARAM_TYPE_U16;
- break;
- case NLA_U32:
- *param_type = DEVLINK_PARAM_TYPE_U32;
- break;
- case NLA_STRING:
- *param_type = DEVLINK_PARAM_TYPE_STRING;
- break;
- case NLA_FLAG:
- *param_type = DEVLINK_PARAM_TYPE_BOOL;
- break;
- default:
- return -EINVAL;
- }
+ *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]);
return 0;
}
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 5c9d1798e830..082573ae6864 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -116,19 +116,15 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp)
bool dsa_port_supports_hwtstamp(struct dsa_port *dp)
{
+ struct kernel_hwtstamp_config config = {};
struct dsa_switch *ds = dp->ds;
- struct ifreq ifr = {};
int err;
if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set)
return false;
- /* "See through" shim implementations of the "get" method.
- * Since we can't cook up a complete ioctl request structure, this will
- * fail in copy_to_user() with -EFAULT, which hopefully is enough to
- * detect a valid implementation.
- */
- err = ds->ops->port_hwtstamp_get(ds, dp->index, &ifr);
+ /* "See through" shim implementations of the "get" method. */
+ err = ds->ops->port_hwtstamp_get(ds, dp->index, &config);
return err != -EOPNOTSUPP;
}
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 804dc7dac4f2..e9334520c54a 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -578,20 +578,6 @@ dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
struct dsa_user_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->dp->ds;
- int port = p->dp->index;
-
- /* Pass through to switch driver if it supports timestamping */
- switch (cmd) {
- case SIOCGHWTSTAMP:
- if (ds->ops->port_hwtstamp_get)
- return ds->ops->port_hwtstamp_get(ds, port, ifr);
- break;
- case SIOCSHWTSTAMP:
- if (ds->ops->port_hwtstamp_set)
- return ds->ops->port_hwtstamp_set(ds, port, ifr);
- break;
- }
return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
}
@@ -2574,6 +2560,31 @@ static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
return 0;
}
+static int dsa_user_hwtstamp_get(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_hwtstamp_get(ds, dp->index, cfg);
+}
+
+static int dsa_user_hwtstamp_set(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_hwtstamp_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack);
+}
+
static const struct net_device_ops dsa_user_netdev_ops = {
.ndo_open = dsa_user_open,
.ndo_stop = dsa_user_close,
@@ -2595,6 +2606,8 @@ static const struct net_device_ops dsa_user_netdev_ops = {
.ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid,
.ndo_change_mtu = dsa_user_change_mtu,
.ndo_fill_forward_path = dsa_user_fill_forward_path,
+ .ndo_hwtstamp_get = dsa_user_hwtstamp_get,
+ .ndo_hwtstamp_set = dsa_user_hwtstamp_set,
};
static const struct device_type dsa_type = {
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 49bea6b45bd5..eb253e0fd61b 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -921,9 +921,18 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev,
phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
if (IS_ERR(phy))
- err = PTR_ERR(phy);
- else
- err = 0;
+ return PTR_ERR(phy);
+
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phy->phyindex;
+ }
+ err = 0;
+ } else if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
}
info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
@@ -947,10 +956,20 @@ int __ethtool_get_ts_info(struct net_device *dev,
ethtool_init_tsinfo(info);
if (phy_is_default_hwtstamp(phydev) &&
- phy_has_tsinfo(phydev))
+ phy_has_tsinfo(phydev)) {
err = phy_ts_info(phydev, info);
- else if (ops->get_ts_info)
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phydev->phyindex;
+ }
+ } else if (ops->get_ts_info) {
err = ops->get_ts_info(dev, info);
+ if (!err && info->phc_index >= 0)
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
+ }
info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
SOF_TIMESTAMPING_SOFTWARE;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 8262cc10f98d..39ec920f5de7 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -978,6 +978,88 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
return 0;
}
+static bool flow_type_hashable(u32 flow_type)
+{
+ switch (flow_type) {
+ case TCP_V4_FLOW:
+ case UDP_V4_FLOW:
+ case SCTP_V4_FLOW:
+ case AH_ESP_V4_FLOW:
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V4_FLOW:
+ case ESP_V4_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV4_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V4_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V4_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V4_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V4_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V4_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V4_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
+/* When adding a new type, update the assert and, if it's hashable, add it to
+ * the flow_type_hashable switch case.
+ */
+static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT);
+
+static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh)
+{
+ /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then:
+ * 1 - no other fields besides IP src/dst and/or L4 src/dst are set
+ * 2 - If src is set, dst must also be set
+ */
+ if ((input_xfrm != RXH_XFRM_NO_CHANGE &&
+ input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) &&
+ ((rxfh & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) ||
+ (!!(rxfh & RXH_IP_SRC) ^ !!(rxfh & RXH_IP_DST)) ||
+ (!!(rxfh & RXH_L4_B_0_1) ^ !!(rxfh & RXH_L4_B_2_3))))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info = {
+ .cmd = ETHTOOL_GRXFH,
+ };
+ int err;
+ u32 i;
+
+ for (i = 0; i < __FLOW_TYPE_COUNT; i++) {
+ if (!flow_type_hashable(i))
+ continue;
+
+ info.flow_type = i;
+ err = ops->get_rxnfc(dev, &info, NULL);
+ if (err)
+ continue;
+
+ err = ethtool_check_xfrm_rxfh(input_xfrm, info.data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
u32 cmd, void __user *useraddr)
{
@@ -1012,16 +1094,9 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
if (rc)
return rc;
- /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then:
- * 1 - no other fields besides IP src/dst and/or L4 src/dst
- * 2 - If src is set, dst must also be set
- */
- if ((rxfh.input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) &&
- ((info.data & ~(RXH_IP_SRC | RXH_IP_DST |
- RXH_L4_B_0_1 | RXH_L4_B_2_3)) ||
- (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) ||
- (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3))))
- return -EINVAL;
+ rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data);
+ if (rc)
+ return rc;
}
rc = ops->set_rxnfc(dev, &info);
@@ -1413,6 +1488,10 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
rxfh.input_xfrm == RXH_XFRM_NO_CHANGE))
return -EINVAL;
+ ret = ethtool_check_flow_types(dev, rxfh.input_xfrm);
+ if (ret)
+ return ret;
+
indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]);
/* Check settings which may be global rather than per RSS-context */
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index 2816bb23c3ad..ad9b40034003 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright 2022-2023 NXP
+ * Copyright 2022-2025 NXP
+ * Copyright 2024 Furong Xu <0x1207@gmail.com>
*/
#include "common.h"
#include "netlink.h"
@@ -282,3 +283,279 @@ bool ethtool_dev_mm_supported(struct net_device *dev)
return supported;
}
EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
+
+static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv,
+ bool tx_active)
+{
+ if (mmsv->ops->configure_tx)
+ mmsv->ops->configure_tx(mmsv, tx_active);
+}
+
+static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv,
+ bool pmac_enabled)
+{
+ if (mmsv->ops->configure_pmac)
+ mmsv->ops->configure_pmac(mmsv, pmac_enabled);
+}
+
+static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv,
+ enum ethtool_mpacket mpacket)
+{
+ if (mmsv->ops->send_mpacket)
+ mmsv->ops->send_mpacket(mmsv, mpacket);
+}
+
+/**
+ * ethtool_mmsv_verify_timer - Timer for MAC Merge verification
+ * @t: timer_list struct containing private info
+ *
+ * Verify the MAC Merge capability in the local TX direction, by
+ * transmitting Verify mPackets up to 3 times. Wait until link
+ * partner responds with a Response mPacket, otherwise fail.
+ */
+static void ethtool_mmsv_verify_timer(struct timer_list *t)
+{
+ struct ethtool_mmsv *mmsv = from_timer(mmsv, t, verify_timer);
+ unsigned long flags;
+ bool rearm = false;
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ switch (mmsv->status) {
+ case ETHTOOL_MM_VERIFY_STATUS_INITIAL:
+ case ETHTOOL_MM_VERIFY_STATUS_VERIFYING:
+ if (mmsv->verify_retries != 0) {
+ ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY);
+ rearm = true;
+ } else {
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED;
+ }
+
+ mmsv->verify_retries--;
+ break;
+
+ case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED:
+ ethtool_mmsv_configure_tx(mmsv, true);
+ break;
+
+ default:
+ break;
+ }
+
+ if (rearm) {
+ mod_timer(&mmsv->verify_timer,
+ jiffies + msecs_to_jiffies(mmsv->verify_time));
+ }
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+
+static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv)
+{
+ if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled &&
+ mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED &&
+ mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) {
+ timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+ mod_timer(&mmsv->verify_timer, jiffies);
+ }
+}
+
+static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv)
+{
+ /* If verification is disabled, configure FPE right away.
+ * Otherwise let the timer code do it.
+ */
+ if (!mmsv->verify_enabled) {
+ ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled);
+ ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled);
+ } else {
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+ mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+
+ if (netif_running(mmsv->dev))
+ ethtool_mmsv_verify_timer_arm(mmsv);
+ }
+}
+
+/**
+ * ethtool_mmsv_stop() - Stop MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ *
+ * Drivers should call this method in a state where the hardware is
+ * about to lose state, like ndo_stop() or suspend(), and turning off
+ * MAC Merge features would be superfluous. Otherwise, prefer
+ * ethtool_mmsv_link_state_handle() with up=false.
+ */
+void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv)
+{
+ timer_shutdown_sync(&mmsv->verify_timer);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_stop);
+
+/**
+ * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification
+ * of link state changes
+ * @mmsv: MAC Merge Software Verification state
+ * @up: True if device carrier is up and able to pass verification packets
+ *
+ * Calling context is expected to be from a task, interrupts enabled.
+ */
+void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up)
+{
+ unsigned long flags;
+
+ ethtool_mmsv_stop(mmsv);
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ if (up && mmsv->pmac_enabled) {
+ /* VERIFY process requires pMAC enabled when NIC comes up */
+ ethtool_mmsv_configure_pmac(mmsv, true);
+
+ /* New link => maybe new partner => new verification process */
+ ethtool_mmsv_apply(mmsv);
+ } else {
+ /* Reset the reported verification state while the link is down */
+ if (mmsv->verify_enabled)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+
+ /* No link or pMAC not enabled */
+ ethtool_mmsv_configure_pmac(mmsv, false);
+ ethtool_mmsv_configure_tx(mmsv, false);
+ }
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle);
+
+/**
+ * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification
+ * of interrupt-based events
+ * @mmsv: MAC Merge Software Verification state
+ * @event: Event which took place (packet transmission or reception)
+ *
+ * Calling context expects to have interrupts disabled.
+ */
+void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv,
+ enum ethtool_mmsv_event event)
+{
+ /* This is interrupt context, just spin_lock() */
+ spin_lock(&mmsv->lock);
+
+ if (!mmsv->pmac_enabled)
+ goto unlock;
+
+ switch (event) {
+ case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET:
+ /* Link partner has sent verify mPacket */
+ ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE);
+ break;
+ case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET:
+ /* Local device has sent verify mPacket */
+ if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING;
+ break;
+ case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET:
+ /* Link partner has sent response mPacket */
+ if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED;
+ break;
+ }
+
+unlock:
+ spin_unlock(&mmsv->lock);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle);
+
+static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv)
+{
+ /* TX is active if administratively enabled, and verification either
+ * succeeded, or was administratively disabled.
+ */
+ return mmsv->tx_enabled &&
+ (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED ||
+ mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED);
+}
+
+/**
+ * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @state: see struct ethtool_mm_state
+ *
+ * Drivers are expected to call this from their ethtool_ops :: get_mm()
+ * method.
+ */
+void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv,
+ struct ethtool_mm_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+ state->verify_enabled = mmsv->verify_enabled;
+ state->pmac_enabled = mmsv->pmac_enabled;
+ state->verify_time = mmsv->verify_time;
+ state->tx_enabled = mmsv->tx_enabled;
+ state->verify_status = mmsv->status;
+ state->tx_active = ethtool_mmsv_is_tx_active(mmsv);
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm);
+
+/**
+ * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @cfg: see struct ethtool_mm_cfg
+ *
+ * Drivers are expected to call this from their ethtool_ops :: set_mm()
+ * method.
+ */
+void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg)
+{
+ unsigned long flags;
+
+ /* Wait for the verification that's currently in progress to finish */
+ ethtool_mmsv_stop(mmsv);
+
+ spin_lock_irqsave(&mmsv->lock, flags);
+
+ mmsv->verify_enabled = cfg->verify_enabled;
+ mmsv->pmac_enabled = cfg->pmac_enabled;
+ mmsv->verify_time = cfg->verify_time;
+ mmsv->tx_enabled = cfg->tx_enabled;
+
+ if (!cfg->verify_enabled)
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+
+ ethtool_mmsv_apply(mmsv);
+
+ spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm);
+
+/**
+ * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state
+ * @mmsv: MAC Merge Software Verification state
+ * @dev: Pointer to network interface
+ * @ops: Methods for implementing the generic functionality
+ *
+ * The MAC Merge Software Verification is a timer- and event-based state
+ * machine intended for network interfaces which lack a hardware-based
+ * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer
+ * is managed by the core code, whereas events are supplied by the
+ * driver explicitly calling one of the other API functions.
+ */
+void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev,
+ const struct ethtool_mmsv_ops *ops)
+{
+ mmsv->ops = ops;
+ mmsv->dev = dev;
+ mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+ mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+ mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+ timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+ spin_lock_init(&mmsv->lock);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_init);
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 977beeaaa2f9..9de828df46cd 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -357,6 +357,18 @@ struct ethnl_dump_ctx {
unsigned long pos_ifindex;
};
+/**
+ * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks
+ * @ethnl_ctx: generic ethnl context
+ * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev
+ * @pos_phyindex: iterator position for multi-msg DUMP
+ */
+struct ethnl_perphy_dump_ctx {
+ struct ethnl_dump_ctx ethnl_ctx;
+ unsigned int ifindex;
+ unsigned long pos_phyindex;
+};
+
static const struct ethnl_request_ops *
ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
[ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops,
@@ -400,6 +412,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
[ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops,
[ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops,
[ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops,
};
static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -407,6 +420,12 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
return (struct ethnl_dump_ctx *)cb->ctx;
}
+static struct ethnl_perphy_dump_ctx *
+ethnl_perphy_dump_context(struct netlink_callback *cb)
+{
+ return (struct ethnl_perphy_dump_ctx *)cb->ctx;
+}
+
/**
* ethnl_default_parse() - Parse request message
* @req_info: pointer to structure to put data into
@@ -584,18 +603,19 @@ static int ethnl_default_dumpit(struct sk_buff *skb,
{
struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
struct net_device *dev;
int ret = 0;
rcu_read_lock();
for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
- dev_hold(dev);
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
rcu_read_unlock();
ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb));
rcu_read_lock();
- dev_put(dev);
+ netdev_put(dev, &dev_tracker);
if (ret < 0 && ret != -EOPNOTSUPP) {
if (likely(skb->len))
@@ -662,6 +682,173 @@ free_req_info:
return ret;
}
+/* per-PHY ->start() handler for GET requests */
+static int ethnl_perphy_start(struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb);
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx;
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genlmsghdr *ghdr;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ ghdr = nlmsg_data(cb->nlh);
+ ops = ethnl_default_requests[ghdr->cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+ return -EOPNOTSUPP;
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ /* Unlike per-dev dump, don't ignore dev. The dump handler
+ * will notice it and dump PHYs from given dev. We only keep track of
+ * the dev's ifindex, .dumpit() will grab and release the netdev itself.
+ */
+ ret = ethnl_default_parse(req_info, &info->info, ops, false);
+ if (req_info->dev) {
+ phy_ctx->ifindex = req_info->dev->ifindex;
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ req_info->dev = NULL;
+ }
+ if (ret < 0)
+ goto free_reply_data;
+
+ ctx->ops = ops;
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+static int ethnl_perphy_dump_one_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net_device *dev = ethnl_ctx->req_info->dev;
+ struct phy_device_node *pdn;
+ int ret;
+
+ if (!dev->link_topo)
+ return 0;
+
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) {
+ ethnl_ctx->req_info->phy_index = ctx->pos_phyindex;
+
+ /* We can re-use the original dump_one as ->prepare_data in
+ * commands use ethnl_req_get_phydev(), which gets the PHY from
+ * the req_info->phy_index
+ */
+ ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info);
+ if (ret)
+ return ret;
+ }
+
+ ctx->pos_phyindex = 0;
+
+ return 0;
+}
+
+static int ethnl_perphy_dump_all_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) {
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ /* per-PHY commands use ethnl_req_get_phydev(), which needs the
+ * net_device in the req_info
+ */
+ ethnl_ctx->req_info->dev = dev;
+ ret = ethnl_perphy_dump_one_dev(skb, ctx, info);
+
+ rcu_read_lock();
+ netdev_put(dev, &dev_tracker);
+ ethnl_ctx->req_info->dev = NULL;
+
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ ret = skb->len;
+ break;
+ }
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* per-PHY ->dumpit() handler for GET requests. */
+static int ethnl_perphy_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb);
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ int ret = 0;
+
+ if (ctx->ifindex) {
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ dev = netdev_get_by_index(genl_info_net(&info->info),
+ ctx->ifindex, &dev_tracker,
+ GFP_KERNEL);
+ if (!dev)
+ return -ENODEV;
+
+ ethnl_ctx->req_info->dev = dev;
+ ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb));
+
+ if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len))
+ ret = skb->len;
+
+ netdev_put(dev, &dev_tracker);
+ } else {
+ ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb));
+ }
+
+ return ret;
+}
+
+/* per-PHY ->done() handler for GET requests */
+static int ethnl_perphy_done(struct netlink_callback *cb)
+{
+ struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb);
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+
+ kfree(ethnl_ctx->reply_data);
+ kfree(ethnl_ctx->req_info);
+
+ return 0;
+}
+
/* default ->done() handler for GET requests */
static int ethnl_default_done(struct netlink_callback *cb)
{
@@ -1200,9 +1387,9 @@ static const struct genl_ops ethtool_genl_ops[] = {
{
.cmd = ETHTOOL_MSG_PSE_GET,
.doit = ethnl_default_doit,
- .start = ethnl_default_start,
- .dumpit = ethnl_default_dumpit,
- .done = ethnl_default_done,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
.policy = ethnl_pse_get_policy,
.maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1,
},
@@ -1224,9 +1411,9 @@ static const struct genl_ops ethtool_genl_ops[] = {
{
.cmd = ETHTOOL_MSG_PLCA_GET_CFG,
.doit = ethnl_default_doit,
- .start = ethnl_default_start,
- .dumpit = ethnl_default_dumpit,
- .done = ethnl_default_done,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
.policy = ethnl_plca_get_cfg_policy,
.maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1,
},
@@ -1240,9 +1427,9 @@ static const struct genl_ops ethtool_genl_ops[] = {
{
.cmd = ETHTOOL_MSG_PLCA_GET_STATUS,
.doit = ethnl_default_doit,
- .start = ethnl_default_start,
- .dumpit = ethnl_default_dumpit,
- .done = ethnl_default_done,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
.policy = ethnl_plca_get_status_policy,
.maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1,
},
@@ -1271,10 +1458,10 @@ static const struct genl_ops ethtool_genl_ops[] = {
},
{
.cmd = ETHTOOL_MSG_PHY_GET,
- .doit = ethnl_phy_doit,
- .start = ethnl_phy_start,
- .dumpit = ethnl_phy_dumpit,
- .done = ethnl_phy_done,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
.policy = ethnl_phy_get_policy,
.maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
},
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index ec6ab5443a6f..91b953924af3 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -499,10 +499,6 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info);
int ethnl_rss_dump_start(struct netlink_callback *cb);
int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
-int ethnl_phy_start(struct netlink_callback *cb);
-int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info);
-int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
-int ethnl_phy_done(struct netlink_callback *cb);
int ethnl_tsinfo_start(struct netlink_callback *cb);
int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
int ethnl_tsinfo_done(struct netlink_callback *cb);
diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c
index 1f590e8d75ed..68372bef4b2f 100644
--- a/net/ethtool/phy.c
+++ b/net/ethtool/phy.c
@@ -12,304 +12,154 @@
#include <net/netdev_lock.h>
struct phy_req_info {
- struct ethnl_req_info base;
- struct phy_device_node *pdn;
+ struct ethnl_req_info base; /* only the generic request info remains; the pdn pointer is dropped */
};
-#define PHY_REQINFO(__req_base) \
- container_of(__req_base, struct phy_req_info, base)
+struct phy_reply_data { /* per-PHY snapshot filled by phy_prepare_data() */
+ struct ethnl_reply_data base;
+ u32 phyindex; /* phydev->phyindex of the reported PHY */
+ char *drvname; /* kstrdup() of the PHY driver name; readers treat NULL as absent */
+ char *name; /* kstrdup() of dev_name() of the PHY's MDIO device */
+ unsigned int upstream_type; /* copied from the topology node's upstream_type */
+ char *upstream_sfp_name; /* set only when the node has a parent SFP bus */
+ unsigned int upstream_index; /* phyindex of the upstream PHY; 0 is treated as not reported */
+ char *downstream_sfp_name; /* set only when the PHY itself has an SFP bus */
+};
+
+#define PHY_REPDATA(__reply_base) \
+ container_of(__reply_base, struct phy_reply_data, base) /* generic reply data -> enclosing phy_reply_data */
/* The only attribute given a policy entry is the nested request header */
const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = {
[ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
};
-/* Caller holds rtnl */
-static ssize_t
-ethnl_phy_reply_size(const struct ethnl_req_info *req_base,
- struct netlink_ext_ack *extack)
+static int phy_reply_size(const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data)
{
/* Returns the attribute space needed for one PHY reply, computed only from
 * the values captured in the reply data. Optional strings and the upstream
 * index are counted only when set.
 */
- struct phy_req_info *req_info = PHY_REQINFO(req_base);
- struct phy_device_node *pdn = req_info->pdn;
- struct phy_device *phydev = pdn->phy;
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
size_t size = 0;
- ASSERT_RTNL();
-
/* ETHTOOL_A_PHY_INDEX */
size += nla_total_size(sizeof(u32));
/* ETHTOOL_A_PHY_DRVNAME */
- if (phydev->drv)
- size += nla_total_size(strlen(phydev->drv->name) + 1);
+ if (rep_data->drvname)
+ size += nla_total_size(strlen(rep_data->drvname) + 1);
/* ETHTOOL_A_PHY_NAME (read unconditionally, so name must not be NULL here) */
- size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1);
+ size += nla_total_size(strlen(rep_data->name) + 1);
/* ETHTOOL_A_PHY_UPSTREAM_TYPE */
size += nla_total_size(sizeof(u32));
- if (phy_on_sfp(phydev)) {
- const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus);
-
- /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */
- if (upstream_sfp_name)
- size += nla_total_size(strlen(upstream_sfp_name) + 1);
+ /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */
+ if (rep_data->upstream_sfp_name)
+ size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1);
- /* ETHTOOL_A_PHY_UPSTREAM_INDEX */
+ /* ETHTOOL_A_PHY_UPSTREAM_INDEX */
+ if (rep_data->upstream_index)
size += nla_total_size(sizeof(u32));
- }
/* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */
- if (phydev->sfp_bus) {
- const char *sfp_name = sfp_get_name(phydev->sfp_bus);
-
- if (sfp_name)
- size += nla_total_size(strlen(sfp_name) + 1);
- }
+ if (rep_data->downstream_sfp_name)
+ size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1);
return size;
}
-static int
-ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb)
+/* Snapshot the PHY targeted by the request into @reply_data.
+ *
+ * Strings are duplicated so that the size and fill callbacks work from the
+ * reply data alone; phy_cleanup_data() frees them.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when no PHY, no topology or no topology
+ * node for the PHY can be found, and -ENOMEM when the mandatory PHY name
+ * cannot be duplicated. Optional names that fail to duplicate stay NULL and
+ * are simply left out of the reply.
+ */
+static int phy_prepare_data(const struct ethnl_req_info *req_info,
+ struct ethnl_reply_data *reply_data,
+ const struct genl_info *info)
{
- struct phy_req_info *req_info = PHY_REQINFO(req_base);
- struct phy_device_node *pdn = req_info->pdn;
- struct phy_device *phydev = pdn->phy;
- enum phy_upstream ptype;
+ struct phy_link_topology *topo = reply_data->dev->link_topo;
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+ struct nlattr **tb = info->attrs;
+ struct phy_device_node *pdn;
+ struct phy_device *phydev;
- ptype = pdn->upstream_type;
+ /* RTNL is held by the caller */
+ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev))
+ return -EOPNOTSUPP;
- if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) ||
- nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) ||
- nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype))
- return -EMSGSIZE;
+ /* Without a topology there is no node to look up; the removed
+ * request parser guarded against this as well.
+ */
+ if (!topo)
+ return -EOPNOTSUPP;
+
+ pdn = xa_load(&topo->phys, phydev->phyindex);
+ if (!pdn)
+ return -EOPNOTSUPP;
- if (phydev->drv &&
- nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name))
- return -EMSGSIZE;
+ rep_data->phyindex = phydev->phyindex;
+
+ /* The name is sized and emitted unconditionally, so a failed
+ * duplication has to be reported instead of being carried as NULL.
+ */
+ rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL);
+ if (!rep_data->name)
+ return -ENOMEM;
+
+ /* The driver name is optional: don't dereference a missing driver */
+ rep_data->drvname = phydev->drv ?
+ kstrdup(phydev->drv->name, GFP_KERNEL) : NULL;
+ rep_data->upstream_type = pdn->upstream_type;
- if (ptype == PHY_UPSTREAM_PHY) {
+ if (pdn->upstream_type == PHY_UPSTREAM_PHY) {
struct phy_device *upstream = pdn->upstream.phydev;
- const char *sfp_upstream_name;
-
- /* Parent index */
- if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex))
- return -EMSGSIZE;
-
- if (pdn->parent_sfp_bus) {
- sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus);
- if (sfp_upstream_name &&
- nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,
- sfp_upstream_name))
- return -EMSGSIZE;
- }
- }
-
- if (phydev->sfp_bus) {
- const char *sfp_name = sfp_get_name(phydev->sfp_bus);
-
- if (sfp_name &&
- nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,
- sfp_name))
- return -EMSGSIZE;
+ rep_data->upstream_index = upstream->phyindex;
}
- return 0;
-}
-
-static int ethnl_phy_parse_request(struct ethnl_req_info *req_base,
- struct nlattr **tb,
- struct netlink_ext_ack *extack)
-{
- struct phy_link_topology *topo = req_base->dev->link_topo;
- struct phy_req_info *req_info = PHY_REQINFO(req_base);
- struct phy_device *phydev;
+ if (pdn->parent_sfp_bus)
+ rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus),
+ GFP_KERNEL);
- phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PHY_HEADER,
- extack);
- if (!phydev)
- return 0;
-
- if (IS_ERR(phydev))
- return PTR_ERR(phydev);
-
- if (!topo)
- return 0;
-
- req_info->pdn = xa_load(&topo->phys, phydev->phyindex);
+ if (phydev->sfp_bus)
+ rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus),
+ GFP_KERNEL);
return 0;
}
-int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info)
+static int phy_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data)
{
/* Emits the PHY attributes held in the prepared reply data into @skb.
 * Index, name and upstream type are always put; the driver name, upstream
 * index and SFP names only when their value is non-NULL / non-zero.
 * Returns -EMSGSIZE as soon as an attribute does not fit, 0 otherwise.
 */
- struct phy_req_info req_info = {};
- struct nlattr **tb = info->attrs;
- struct sk_buff *rskb;
- void *reply_payload;
- int reply_len;
- int ret;
-
- ret = ethnl_parse_header_dev_get(&req_info.base,
- tb[ETHTOOL_A_PHY_HEADER],
- genl_info_net(info), info->extack,
- true);
- if (ret < 0)
- return ret;
-
- rtnl_lock();
- netdev_lock_ops(req_info.base.dev);
-
- ret = ethnl_phy_parse_request(&req_info.base, tb, info->extack);
- if (ret < 0)
- goto err_unlock;
-
- /* No PHY, return early */
- if (!req_info.pdn)
- goto err_unlock;
-
- ret = ethnl_phy_reply_size(&req_info.base, info->extack);
- if (ret < 0)
- goto err_unlock;
- reply_len = ret + ethnl_reply_header_size();
-
- rskb = ethnl_reply_init(reply_len, req_info.base.dev,
- ETHTOOL_MSG_PHY_GET_REPLY,
- ETHTOOL_A_PHY_HEADER,
- info, &reply_payload);
- if (!rskb) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- ret = ethnl_phy_fill_reply(&req_info.base, rskb);
- if (ret)
- goto err_free_msg;
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
- netdev_unlock_ops(req_info.base.dev);
- rtnl_unlock();
- ethnl_parse_header_dev_put(&req_info.base);
- genlmsg_end(rskb, reply_payload);
-
- return genlmsg_reply(rskb, info);
-
-err_free_msg:
- nlmsg_free(rskb);
-err_unlock:
- netdev_unlock_ops(req_info.base.dev);
- rtnl_unlock();
- ethnl_parse_header_dev_put(&req_info.base);
- return ret;
-}
-
-struct ethnl_phy_dump_ctx {
- struct phy_req_info *phy_req_info;
- unsigned long ifindex;
- unsigned long phy_index;
-};
-
-int ethnl_phy_start(struct netlink_callback *cb)
-{
- const struct genl_info *info = genl_info_dump(cb);
- struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
- int ret;
-
- BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
-
- ctx->phy_req_info = kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL);
- if (!ctx->phy_req_info)
- return -ENOMEM;
-
- ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base,
- info->attrs[ETHTOOL_A_PHY_HEADER],
- sock_net(cb->skb->sk), cb->extack,
- false);
- ctx->ifindex = 0;
- ctx->phy_index = 0;
-
- if (ret)
- kfree(ctx->phy_req_info);
+ if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) ||
+ nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) ||
+ nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type))
+ return -EMSGSIZE;
- return ret;
-}
+ if (rep_data->drvname &&
+ nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname))
+ return -EMSGSIZE;
-int ethnl_phy_done(struct netlink_callback *cb)
-{
- struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
+ if (rep_data->upstream_index && /* a zero index is treated as "no upstream PHY" */
+ nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX,
+ rep_data->upstream_index))
+ return -EMSGSIZE;
- if (ctx->phy_req_info->base.dev)
- ethnl_parse_header_dev_put(&ctx->phy_req_info->base);
+ if (rep_data->upstream_sfp_name &&
+ nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,
+ rep_data->upstream_sfp_name))
+ return -EMSGSIZE;
- kfree(ctx->phy_req_info);
+ if (rep_data->downstream_sfp_name &&
+ nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,
+ rep_data->downstream_sfp_name))
+ return -EMSGSIZE;
return 0;
}
-static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev,
- struct netlink_callback *cb)
+static void phy_cleanup_data(struct ethnl_reply_data *reply_data)
{
/* Frees the four strings held in the PHY reply data. kfree() accepts NULL,
 * so optional names that were never set need no check.
 */
- struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
- struct phy_req_info *pri = ctx->phy_req_info;
- struct phy_device_node *pdn;
- int ret = 0;
- void *ehdr;
-
- if (!dev->link_topo)
- return 0;
-
- xa_for_each_start(&dev->link_topo->phys, ctx->phy_index, pdn, ctx->phy_index) {
- ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_PHY_GET_REPLY);
- if (!ehdr) {
- ret = -EMSGSIZE;
- break;
- }
-
- ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_PHY_HEADER);
- if (ret < 0) {
- genlmsg_cancel(skb, ehdr);
- break;
- }
-
- pri->pdn = pdn;
- ret = ethnl_phy_fill_reply(&pri->base, skb);
- if (ret < 0) {
- genlmsg_cancel(skb, ehdr);
- break;
- }
-
- genlmsg_end(skb, ehdr);
- }
+ struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
- return ret;
+ kfree(rep_data->drvname);
+ kfree(rep_data->name);
+ kfree(rep_data->upstream_sfp_name);
+ kfree(rep_data->downstream_sfp_name);
}
-int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
- struct net *net = sock_net(skb->sk);
- struct net_device *dev;
- int ret = 0;
-
- rtnl_lock();
-
- if (ctx->phy_req_info->base.dev) {
- dev = ctx->phy_req_info->base.dev;
- netdev_lock_ops(dev);
- ret = ethnl_phy_dump_one_dev(skb, dev, cb);
- netdev_unlock_ops(dev);
- } else {
- for_each_netdev_dump(net, dev, ctx->ifindex) {
- netdev_lock_ops(dev);
- ret = ethnl_phy_dump_one_dev(skb, dev, cb);
- netdev_unlock_ops(dev);
- if (ret)
- break;
-
- ctx->phy_index = 0;
- }
- }
- rtnl_unlock();
-
- return ret;
-}
+const struct ethnl_request_ops ethnl_phy_request_ops = { /* PHY_GET wired into the generic GET request ops */
+ .request_cmd = ETHTOOL_MSG_PHY_GET,
+ .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PHY_HEADER,
+ .req_info_size = sizeof(struct phy_req_info),
+ .reply_data_size = sizeof(struct phy_reply_data),
+
+ .prepare_data = phy_prepare_data, /* snapshots the PHY into phy_reply_data */
+ .reply_size = phy_reply_size, /* sizes the reply from that snapshot */
+ .fill_reply = phy_fill_reply, /* emits the attributes from that snapshot */
+ .cleanup_data = phy_cleanup_data, /* frees the strings duplicated by prepare_data */
+};
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
index 8130b406ef10..8c654caa6805 100644
--- a/net/ethtool/tsinfo.c
+++ b/net/ethtool/tsinfo.c
@@ -160,6 +160,12 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
/* _TSINFO_HWTSTAMP_PROVIDER */
len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32));
}
+ if (ts_info->phc_source) {
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */
+ if (ts_info->phc_phyindex)
+ /* _TSINFO_HWTSTAMP_PHYINDEX */
+ len += nla_total_size(sizeof(u32));
+ }
if (req_base->flags & ETHTOOL_FLAG_STATS)
len += nla_total_size(0) + /* _TSINFO_STATS */
nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT;
@@ -259,6 +265,16 @@ static int tsinfo_fill_reply(struct sk_buff *skb,
nla_nest_end(skb, nest);
}
+ if (ts_info->phc_source) {
+ if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE,
+ ts_info->phc_source))
+ return -EMSGSIZE;
+
+ if (ts_info->phc_phyindex &&
+ nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX,
+ ts_info->phc_phyindex))
+ return -EMSGSIZE;
+ }
if (req_base->flags & ETHTOOL_FLAG_STATS &&
tsinfo_put_stats(skb, &data->stats))
return -EMSGSIZE;
@@ -346,6 +362,11 @@ static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb,
if (ret < 0)
goto err;
+ if (reply_data->ts_info.phc_index >= 0) {
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ reply_data->ts_info.phc_phyindex = phydev->phyindex;
+ }
+
ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr);
if (ret < 0)
goto err;
@@ -389,6 +410,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
if (ret < 0)
goto err;
+ if (reply_data->ts_info.phc_index >= 0)
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV;
ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data,
ehdr);
if (ret < 0)
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 1b1b700ec05e..0d1e56965af0 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -761,6 +761,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
if (res)
goto err_unregister;
+ if (protocol_version == PRP_V1) {
+ eth_hw_addr_set(slave[1], slave[0]->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]);
+ }
+
if (interlink) {
res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack);
if (res)
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index d7ae32473c41..192893c3f2ec 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -78,6 +78,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
eth_hw_addr_set(master->dev, dev->dev_addr);
call_netdevice_notifiers(NETDEV_CHANGEADDR,
master->dev);
+
+ if (hsr->prot_version == PRP_V1) {
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port) {
+ eth_hw_addr_set(port->dev, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR,
+ port->dev);
+ }
+ }
}
/* Make sure we recognize frames from ourselves in hsr_rcv() */
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 1bc47b17a296..135ec5fce019 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -155,6 +155,7 @@ struct hsr_port {
struct hsr_priv *hsr;
enum hsr_port_type type;
struct rcu_head rcu;
+ unsigned char original_macaddress[ETH_ALEN];
};
struct hsr_frame_info;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 2a802a5de2ac..b87b6a6fe070 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -196,6 +196,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
port->hsr = hsr;
port->dev = dev;
port->type = type;
+ ether_addr_copy(port->original_macaddress, dev->dev_addr);
if (type != HSR_PT_MASTER) {
res = hsr_portdev_setup(hsr, dev, port, extack);
@@ -232,6 +233,7 @@ void hsr_del_port(struct hsr_port *port)
if (!port->hsr->fwd_offloaded)
dev_set_promiscuity(port->dev, -1);
netdev_upper_dev_unlink(port->dev, master->dev);
+ eth_hw_addr_set(port->dev, port->original_macaddress);
}
kfree_rcu(port, rcu);
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 359249ab77bf..4c07a475c567 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -224,10 +224,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
dev_hold(dev);
if (info->attrs[IEEE802154_ATTR_HW_ADDR]) {
- struct sockaddr addr;
+ struct sockaddr_storage addr;
- addr.sa_family = ARPHRD_IEEE802154;
- nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR],
+ addr.ss_family = ARPHRD_IEEE802154;
+ nla_memcpy(&addr.__data, info->attrs[IEEE802154_ATTR_HW_ADDR],
IEEE802154_ADDR_LEN);
/* strangely enough, some callbacks (inetdev_event) from
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d2c97f8e9ef..12850a277251 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -425,7 +425,7 @@ config INET_DIAG
tristate "INET: socket monitoring interface"
default y
help
- Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ Support for INET (TCP, UDP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5df1f1325259..76e38092cd8a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk)
/* Routing failed... */
sk->sk_route_caps = 0;
- /*
- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
- */
+
if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 77e5705ac799..c47d3828d4f6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1792,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
struct ifaddrmsg *ifm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
return -EINVAL;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3f4e629998fa..fd1e1507a224 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -807,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_MULTIPATH:
err = lwtunnel_valid_encap_type_attr(nla_data(attr),
nla_len(attr),
- extack, false);
+ extack);
if (err < 0)
goto errout;
cfg->fc_mp = nla_data(attr);
@@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
case RTA_ENCAP_TYPE:
cfg->fc_encap_type = nla_get_u16(attr);
err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
- extack, false);
+ extack);
if (err < 0)
goto errout;
break;
@@ -948,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
if (filter->rtnl_held)
ASSERT_RTNL();
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
return -EINVAL;
}
- rtm = nlmsg_data(nlh);
if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
rtm->rtm_scope) {
NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f68bb9e34c34..d643bd1a0d9d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -365,7 +365,7 @@ static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
/* Allocates 2 * (1 << hash_bits) list heads in one array; elements are
 * sized as struct hlist_head itself, not as a pointer to one.
 * The second half is used for prefsrc
 */
- return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *),
+ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head),
GFP_KERNEL);
}
@@ -2168,34 +2168,52 @@ static bool fib_good_nh(const struct fib_nh *nh)
return !!(state & NUD_VALID);
}
-void fib_select_multipath(struct fib_result *res, int hash)
+void fib_select_multipath(struct fib_result *res, int hash,
+ const struct flowi4 *fl4)
{
/* Picks the nexthop of res->fi for @hash and stores it in res->nh_sel and
 * res->nhc. When @fl4 carries a source address, a nexthop whose nh_saddr
 * matches it is preferred; @fl4 may be NULL, which disables that matching.
 * If no nexthop is usable, @res is left untouched.
 */
struct fib_info *fi = res->fi;
struct net *net = fi->fib_net;
- bool first = false;
+ bool found = false;
+ bool use_neigh;
+ __be32 saddr;
if (unlikely(res->fi->nh)) {
nexthop_path_fib_result(res, hash);
return;
}
+ use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
+ saddr = fl4 ? fl4->saddr : 0; /* 0 makes every usable nexthop count as a source match */
+
change_nexthops(fi) {
- if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
- if (!fib_good_nh(nexthop_nh))
- continue;
- if (!first) {
- res->nh_sel = nhsel;
- res->nhc = &nexthop_nh->nh_common;
- first = true;
- }
+ int nh_upper_bound;
+
+ /* Nexthops without a carrier are assigned an upper bound of
+ * minus one when "ignore_routes_with_linkdown" is set.
+ */
+ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound);
+ if (nh_upper_bound == -1 ||
+ (use_neigh && !fib_good_nh(nexthop_nh)))
+ continue;
+
+ if (!found) { /* record a fallback; keeps being replaced until one matches saddr */
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ found = !saddr || nexthop_nh->nh_saddr == saddr;
}
- if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
+ if (hash > nh_upper_bound)
continue;
- res->nh_sel = nhsel;
- res->nhc = &nexthop_nh->nh_common;
- return;
+ if (!saddr || nexthop_nh->nh_saddr == saddr) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ return;
+ }
+
+ if (found) /* hash range reached without a source match here: keep the matching fallback */
+ return;
+
} endfor_nexthops(fi);
}
#endif
@@ -2210,7 +2228,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, fl4);
}
else
#endif
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 6701a98d9a9f..dafd68f3436a 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -199,7 +199,7 @@ static const struct net_protocol net_gre_protocol = {
static int __init gre_init(void)
{
- pr_info("GRE over IPv4 demultiplexor driver\n");
+ pr_info("GRE over IPv4 demultiplexer driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index dd5cf8914a28..20915895bdaa 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -330,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
struct inet_bind2_bucket **tb2_ret,
struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
int i, low, high, attempt_half, port, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
@@ -512,10 +512,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
bool found_port = false, check_bind_conflict = true;
bool bhash_created = false, bhash2_created = false;
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
int ret = -EADDRINUSE, port = snum, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2 = NULL;
@@ -767,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
timer_setup(&sk->sk_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
{
@@ -780,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
-EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_clear_xmit_timers_sync(struct sock *sk)
{
@@ -831,7 +829,6 @@ no_route:
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_route_req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
@@ -898,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
req->num_retrans++;
return err;
}
-EXPORT_SYMBOL(inet_rtx_syn_ack);
static struct request_sock *
reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
@@ -1026,9 +1022,10 @@ static bool reqsk_queue_unlink(struct request_sock *req)
bool found = false;
if (sk_hashed(sk)) {
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
- spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ spinlock_t *lock;
+ lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
spin_lock(lock);
found = __sk_nulls_del_node_init_rcu(sk);
spin_unlock(lock);
@@ -1058,14 +1055,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
return __inet_csk_reqsk_queue_drop(sk, req, false);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
/* Calls inet_csk_reqsk_queue_drop() for @req on @sk, then reqsk_put() on @req. */
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
inet_csk_reqsk_queue_drop(sk, req);
reqsk_put(req);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put);
static void reqsk_timer_handler(struct timer_list *t)
{
@@ -1209,7 +1205,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
inet_csk_reqsk_queue_added(sk);
return true;
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
const gfp_t priority)
@@ -1290,7 +1285,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
return newsk;
}
-EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
@@ -1322,7 +1316,7 @@ void inet_csk_destroy_sock(struct sock *sk)
EXPORT_SYMBOL(inet_csk_destroy_sock);
/* This function allows to force a closure of a socket after the call to
- * tcp/dccp_create_openreq_child().
+ * tcp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
__releases(&sk->sk_lock.slock)
@@ -1380,7 +1374,6 @@ int inet_csk_listen_start(struct sock *sk)
inet_sk_set_state(sk, TCP_CLOSE);
return err;
}
-EXPORT_SYMBOL_GPL(inet_csk_listen_start);
static void inet_child_forget(struct sock *sk, struct request_sock *req,
struct sock *child)
@@ -1475,7 +1468,6 @@ child_put:
sock_put(child);
return NULL;
}
-EXPORT_SYMBOL(inet_csk_complete_hashdance);
/*
* This routine closes sockets which have been at least partially
@@ -1590,4 +1582,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
out:
return dst;
}
-EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c2bb91d9e9ff..1d1d6ad53f4c 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -160,7 +160,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
ext & (1 << (INET_DIAG_TCLASS - 1))) {
u32 classid = 0;
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUP_NET_CLASSID
classid = sock_cgroup_classid(&sk->sk_cgrp_data);
#endif
/* Fallback to socket priority if class id isn't set.
@@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type)
switch (type) {
case TCPDIAG_GETSOCK:
return IPPROTO_TCP;
- case DCCPDIAG_GETSOCK:
- return IPPROTO_DCCP;
default:
return 0;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5bf163f756e9..77a0b52b2eab 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -23,11 +23,12 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
-#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/rps.h>
+#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
+#include <net/tcp.h>
u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -176,7 +177,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
*/
static void __inet_put_port(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
@@ -215,7 +216,7 @@ EXPORT_SYMBOL(inet_put_port);
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
- struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *table = tcp_get_hashinfo(sk);
unsigned short port = inet_sk(child)->inet_num;
struct inet_bind_hashbucket *head, *head2;
bool created_inet_bind_bucket = false;
@@ -668,7 +669,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk,
*/
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_ehash_bucket *head;
struct hlist_nulls_head *list;
spinlock_t *lock;
@@ -713,7 +714,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
}
return ok;
}
-EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+EXPORT_IPV6_MOD(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
struct inet_listen_hashbucket *ilb)
@@ -740,7 +741,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
int __inet_hash(struct sock *sk, struct sock *osk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_listen_hashbucket *ilb2;
int err = 0;
@@ -771,7 +772,7 @@ unlock:
return err;
}
-EXPORT_SYMBOL(__inet_hash);
+EXPORT_IPV6_MOD(__inet_hash);
int inet_hash(struct sock *sk)
{
@@ -782,15 +783,15 @@ int inet_hash(struct sock *sk)
return err;
}
-EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
if (sk_unhashed(sk))
return;
+ sock_rps_delete_flow(sk);
if (sk->sk_state == TCP_LISTEN) {
struct inet_listen_hashbucket *ilb2;
@@ -823,7 +824,7 @@ void inet_unhash(struct sock *sk)
spin_unlock_bh(lock);
}
}
-EXPORT_SYMBOL_GPL(inet_unhash);
+EXPORT_IPV6_MOD(inet_unhash);
static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
const struct net *net, unsigned short port,
@@ -874,7 +875,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
@@ -902,7 +903,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family)
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2, *new_tb2;
int l3mdev = inet_sk_bound_l3mdev(sk);
@@ -982,14 +983,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
-EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
+EXPORT_IPV6_MOD(inet_bhash2_update_saddr);
void inet_bhash2_reset_saddr(struct sock *sk)
{
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
-EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
+EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers')
@@ -1214,7 +1215,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
__inet_check_established);
}
-EXPORT_SYMBOL_GPL(inet_hash_connect);
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
@@ -1265,7 +1265,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
init_hashinfo_lhash2(h);
return 0;
}
-EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
@@ -1305,7 +1304,6 @@ set_mask:
hashinfo->ehash_locks_mask = nblocks - 1;
return 0;
}
-EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
unsigned int ehash_entries)
@@ -1341,7 +1339,6 @@ free_hashinfo:
err:
return NULL;
}
-EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
@@ -1352,4 +1349,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
vfree(hashinfo->ehash);
kfree(hashinfo);
}
-EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index aded4bf1bc16..67efe9501581 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
spin_unlock(lock);
local_bh_enable();
}
-EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);
static void tw_timer_handler(struct timer_list *t)
{
@@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
return tw;
}
-EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
@@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
-EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
void inet_twsk_purge(struct inet_hashinfo *hashinfo)
@@ -365,4 +362,3 @@ restart:
rcu_read_unlock();
}
}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 26d15f907551..f5b9004d6938 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1066,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net)
return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
-static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipgre_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
- .exit_batch_rtnl = ipgre_exit_batch_rtnl,
+ .exit_rtnl = ipgre_exit_rtnl,
.id = &ipgre_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1752,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net)
return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
-static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipgre_tap_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill);
}
static struct pernet_operations ipgre_tap_net_ops = {
.init = ipgre_tap_init_net,
- .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl,
+ .exit_rtnl = ipgre_tap_exit_rtnl,
.id = &gre_tap_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1772,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net)
&erspan_link_ops, "erspan0");
}
-static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
+static void __net_exit erspan_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill);
}
static struct pernet_operations erspan_net_ops = {
.init = erspan_init_net,
- .exit_batch_rtnl = erspan_exit_batch_rtnl,
+ .exit_rtnl = erspan_exit_rtnl,
.id = &erspan_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6e18d7ec5062..a2705d454fd6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk,
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 1024f961ec9a..678b8f96e3e9 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -243,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strscpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
if (strlen(ops->kind) > (IFNAMSIZ - 3))
goto failed;
- strcpy(name, ops->kind);
+ strscpy(name, ops->kind);
strcat(name, "%d");
}
@@ -1174,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
-static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
- struct list_head *head,
- struct rtnl_link_ops *ops)
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+ struct rtnl_link_ops *ops,
+ struct list_head *head)
{
+ struct ip_tunnel_net *itn = net_generic(net, id);
struct net_device *dev, *aux;
int h;
+ ASSERT_RTNL_NET(net);
+
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == ops)
unregister_netdevice_queue(dev, head);
@@ -1198,21 +1201,7 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
unregister_netdevice_queue(t->dev, head);
}
}
-
-void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
- struct rtnl_link_ops *ops,
- struct list_head *dev_to_kill)
-{
- struct ip_tunnel_net *itn;
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list) {
- itn = net_generic(net, id);
- ip_tunnel_destroy(net, itn, dev_to_kill, ops);
- }
-}
-EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
int ip_tunnel_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 159b4473290e..686e4f3d83aa 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net)
return 0;
}
-static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit vti_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}
static struct pernet_operations vti_net_ops = {
.init = vti_init_net,
- .exit_batch_rtnl = vti_exit_batch_rtnl,
+ .exit_rtnl = vti_exit_rtnl,
.id = &vti_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index bab0bf90c908..3e03af073a1c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -604,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net)
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
-static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipip_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
- .exit_batch_rtnl = ipip_exit_batch_rtnl,
+ .exit_rtnl = ipip_exit_rtnl,
.id = &ipip_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 85dc208f32e9..2ff2f79c7351 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2501,7 +2501,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
return -EINVAL;
}
@@ -2510,7 +2511,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_ipv4_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
(rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
@@ -2826,7 +2826,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
{
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
return -EINVAL;
}
@@ -2836,7 +2837,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3d101613f27f..23c8deff8095 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -270,7 +270,7 @@ ipt_do_table(void *priv,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 25e1e8eb18dd..ed08fb78cfa8 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
struct iphdr *iph;
local_bh_disable();
- if (this_cpu_read(nf_skb_duplicated))
+ if (current->in_nf_duplicate)
goto out;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 9082ca17e845..7e7c49535e3f 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
else
addr = iph->saddr;
- *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) {
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ return;
+ }
+
+ *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);
}
EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
@@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_proto = pkt->tprot,
.flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
- .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),
};
const struct net_device *oif;
const struct net_device *found;
@@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
else
oif = NULL;
+ fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif);
+
iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
if (!iph) {
regs->verdict.code = NFT_BREAK;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 467151517023..4397e89d3123 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void)
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fdb_list);
+ spin_lock_init(&nh->lock);
}
return nh;
}
@@ -1555,12 +1556,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
if (nh->is_group) {
struct nh_group *nhg;
- nhg = rtnl_dereference(nh->nh_grp);
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
if (nhg->has_v4)
goto no_v4_nh;
is_fdb_nh = nhg->fdb_nh;
} else {
- nhi = rtnl_dereference(nh->nh_info);
+ nhi = rcu_dereference_rtnl(nh->nh_info);
if (nhi->family == AF_INET)
goto no_v4_nh;
is_fdb_nh = nhi->fdb_nh;
@@ -2118,7 +2119,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
- struct fib6_info *f6i, *tmp;
+ struct fib6_info *f6i;
bool do_flush = false;
struct fib_info *fi;
@@ -2129,13 +2130,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
if (do_flush)
fib_flush(net);
- /* ip6_del_rt removes the entry from this list hence the _safe */
- list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+ spin_lock_bh(&nh->lock);
+
+ nh->dead = true;
+
+ while (!list_empty(&nh->f6i_list)) {
+ f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);
+
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
+
+ spin_unlock_bh(&nh->lock);
ipv6_stub->ip6_del_rt(net, f6i,
!READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
+
+ spin_lock_bh(&nh->lock);
}
+
+ spin_unlock_bh(&nh->lock);
}
static void __remove_nexthop(struct net *net, struct nexthop *nh,
@@ -3168,8 +3180,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
}
cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
- err = lwtunnel_valid_encap_type(cfg->nh_encap_type,
- extack, false);
+ err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
if (err < 0)
goto out;
@@ -4040,14 +4051,11 @@ out:
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
-static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
+static void __net_exit nexthop_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- flush_all_nexthops(net);
+ ASSERT_RTNL_NET(net);
+ flush_all_nexthops(net);
}
static void __net_exit nexthop_net_exit(struct net *net)
@@ -4072,7 +4080,7 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
.exit = nexthop_net_exit,
- .exit_batch_rtnl = nexthop_net_exit_batch_rtnl,
+ .exit_rtnl = nexthop_net_exit_rtnl,
};
static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 10cbeb76c274..ea2f01584379 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -191,6 +191,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
+ SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 753704f75b2c..fccb05fb3a79 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#ifndef CONFIG_PREEMPT_RT
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+#else
+#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
+#endif
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -2037,8 +2041,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
hash_keys.addrs.v4addrs.dst = fl4->daddr;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
hash_keys.basic.ip_proto = fl4->flowi4_proto;
- if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
- hash_keys.ports.src = fl4->fl4_sport;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
+ }
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl4->fl4_dport;
@@ -2093,7 +2101,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
- hash_keys.ports.src = fl4->fl4_sport;
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
hash_keys.ports.dst = fl4->fl4_dport;
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
@@ -2154,7 +2165,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, NULL);
IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif
@@ -2699,8 +2710,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
if (fl4->saddr) {
if (ipv4_is_multicast(fl4->saddr) ||
- ipv4_is_lbcast(fl4->saddr) ||
- ipv4_is_zeronet(fl4->saddr)) {
+ ipv4_is_lbcast(fl4->saddr)) {
rth = ERR_PTR(-EINVAL);
goto out;
}
@@ -3206,7 +3216,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack,
"ipv4: Invalid header for route get request");
return -EINVAL;
@@ -3216,7 +3227,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_ipv4_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
(rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
rtm->rtm_table || rtm->rtm_protocol ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6edc441b3702..f64f8276a73c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
+ struct net_devmem_dmabuf_binding *binding = NULL;
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
@@ -1066,11 +1067,20 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
+ int sockc_err = 0;
int zc = 0;
long timeo;
flags = msg->msg_flags;
+ sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+ if (msg->msg_controllen) {
+ sockc_err = sock_cmsg_send(sk, msg, &sockc);
+ /* Don't return error until MSG_FASTOPEN has been processed;
+ * that may succeed even if the cmsg is invalid.
+ */
+ }
+
if ((flags & MSG_ZEROCOPY) && size) {
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
@@ -1078,7 +1088,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
skb = tcp_write_queue_tail(sk);
- uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+ !sockc_err && sockc.dmabuf_id);
if (!uarg) {
err = -ENOBUFS;
goto out_err;
@@ -1087,12 +1098,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
else
uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ if (!sockc_err && sockc.dmabuf_id) {
+ binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ binding = NULL;
+ goto out_err;
+ }
+ }
}
} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
if (sk->sk_route_caps & NETIF_F_SG)
zc = MSG_SPLICE_PAGES;
}
+ if (!sockc_err && sockc.dmabuf_id &&
+ (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
if (unlikely(flags & MSG_FASTOPEN ||
inet_test_bit(DEFER_CONNECT, sk)) &&
!tp->repair) {
@@ -1131,13 +1157,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)};
- if (msg->msg_controllen) {
- err = sock_cmsg_send(sk, msg, &sockc);
- if (unlikely(err)) {
- err = -EINVAL;
- goto out_err;
- }
+ if (sockc_err) {
+ err = sockc_err;
+ goto out_err;
}
/* This should be in poll */
@@ -1160,6 +1182,8 @@ restart:
if (skb)
copy = size_goal - skb->len;
+ trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
@@ -1256,7 +1280,8 @@ new_segment:
goto wait_for_space;
}
- err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+ binding);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
goto new_segment;
@@ -1337,6 +1362,8 @@ out_nopush:
/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
if (uarg && !msg->msg_ubuf)
net_zcopy_put(uarg);
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
return copied + copied_syn;
do_error:
@@ -1354,6 +1381,9 @@ out_err:
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
@@ -3407,6 +3437,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rack.reo_wnd_persist = 0;
tp->rack.dsack_seen = 0;
tp->syn_data_acked = 0;
+ tp->syn_fastopen_child = 0;
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
@@ -4162,6 +4193,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
if (tp->tcp_usec_ts)
info->tcpi_options |= TCPI_OPT_USEC_TS;
+ if (tp->syn_fastopen_child)
+ info->tcpi_options |= TCPI_OPT_TFO_CHILD;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
@@ -5194,7 +5227,7 @@ void __init tcp_init(void)
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
- max_rshare = min(6UL*1024*1024, limit);
+ max_rshare = min(32UL*1024*1024, limit);
init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 1a6b1bc54245..9b83d639b5ac 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -401,6 +401,7 @@ fastopen:
}
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
+ tcp_sk(child)->syn_fastopen_child = 1;
return child;
}
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a35018e2d0ba..8ec92dec321a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt_us;
- long m = sample;
+ u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+ long m = sample << 3;
- if (new_sample != 0) {
+ if (old_sample == 0 || m < old_sample) {
+ new_sample = m;
+ } else {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* else with timestamps disabled convergence takes too
* long.
*/
- if (!win_dep) {
- m -= (new_sample >> 3);
- new_sample += m;
- } else {
- m <<= 3;
- if (m < new_sample)
- new_sample = m;
- }
- } else {
- /* No previous measure. */
- new_sample = m << 3;
+ if (win_dep)
+ return;
+ /* Do not use this sample if receive queue is not empty. */
+ if (tp->rcv_nxt != tp->copied_seq)
+ return;
+ new_sample = old_sample - (old_sample >> 3) + sample;
}
tp->rcv_rtt_est.rtt_us = new_sample;
@@ -712,7 +709,7 @@ new_measure:
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
{
u32 delta, delta_us;
@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
if (!delta)
- delta = 1;
+ delta = min_delta;
delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
return delta_us;
}
@@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
- s32 delta = tcp_rtt_tsopt_us(tp);
+ s32 delta = tcp_rtt_tsopt_us(tp, 0);
- if (delta >= 0)
+ if (delta > 0)
tcp_rcv_rtt_update(tp, delta, 0);
}
}
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int rcvwin, rcvbuf, cap;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return;
+
+ /* slow start: allow the sender to double its rate. */
+ rcvwin = tp->rcvq_space.space << 1;
+
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+ rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ /* Make the window clamp follow along. */
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
+ }
+}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
@@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 copied;
- int time;
+ int time, inq, copied;
trace_tcp_rcv_space_adjust(sk);
@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)
/* Number of bytes copied to user in last RTT */
copied = tp->copied_seq - tp->rcvq_space.seq;
+ /* Number of bytes in receive queue. */
+ inq = tp->rcv_nxt - tp->copied_seq;
+ copied -= inq;
if (copied <= tp->rcvq_space.space)
goto new_measure;
- /* A bit of theory :
- * copied = bytes received in previous RTT, our base window
- * To cope with packet losses, we need a 2x factor
- * To cope with slow start, and sender growing its cwin by 100 %
- * every RTT, we need a 4x factor, because the ACK we are sending
- * now is for the next RTT, not the current one :
- * <prev RTT . ><current RTT .. ><next RTT .... >
- */
-
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- u64 rcvwin, grow;
- int rcvbuf;
-
- /* minimal window to cope with packet losses, assuming
- * steady state. Add some cushion because of small variations.
- */
- rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
+ trace_tcp_rcvbuf_grow(sk, time);
- /* Accommodate for sender rate increase (eg. slow start) */
- grow = rcvwin * (copied - tp->rcvq_space.space);
- do_div(grow, tp->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
- if (rcvbuf > sk->sk_rcvbuf) {
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
- /* Make the window clamp follow along. */
- WRITE_ONCE(tp->window_clamp,
- tcp_win_from_space(sk, rcvbuf));
- }
- }
tp->rcvq_space.space = copied;
+ tcp_rcvbuf_grow(sk);
+
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tp->tcp_mstamp;
@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
- seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+ seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
@@ -5173,6 +5168,7 @@ end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
+ tcp_rcvbuf_grow(sk);
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
if (req) {
tcp_rcv_synrecv_state_fastopen(sk);
} else {
@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8cce0d5489da..6a14f9e6fef6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2417,7 +2417,8 @@ do_time_wait:
goto csum_error;
}
- tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn);
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
switch (tw_status) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(net,
@@ -3494,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of 16 TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+ /* Default TSQ limit of 4 MB */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index fb9349be36b8..43d7852ce07e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
- const struct tcphdr *th, u32 *tw_isn)
+ const struct tcphdr *th, u32 *tw_isn,
+ enum skb_drop_reason *drop_reason)
{
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
@@ -245,8 +246,10 @@ kill:
return TCP_TW_SYN;
}
- if (paws_reject)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+ if (paws_reject) {
+ *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
+ }
if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 13295a59d22e..3ac8d2d17e1f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2619,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
- if (sk->sk_pacing_status == SK_PACING_NONE)
- limit = min_t(unsigned long, limit,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+ limit = min_t(unsigned long, limit,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2742cc7602bb..dde52b8050b8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -93,6 +93,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
+#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -119,6 +120,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
+#include <net/rps.h>
struct udp_table udp_table __read_mostly;
@@ -1942,8 +1944,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
error = -EAGAIN;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb) {
if (!(flags & MSG_PEEK))
udp_skb_destructor(sk, skb);
@@ -1964,8 +1966,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb && !(flags & MSG_PEEK))
udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
@@ -2199,6 +2201,7 @@ void udp_lib_unhash(struct sock *sk)
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
+ sock_rps_delete_flow(sk);
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -2897,20 +2900,40 @@ void udp_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (udp_test_bit(ENCAP_ENABLED, sk))
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
static_branch_dec(&udp_encap_needed_key);
+ udp_tunnel_cleanup_gro(sk);
+ }
}
}
+typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
struct sock *sk)
{
#ifdef CONFIG_XFRM
+ udp_gro_receive_t new_gro_receive;
+
if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
- if (family == AF_INET)
- WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv);
- else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
- WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv);
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv;
+ else
+ new_gro_receive = xfrm4_gro_udp_encap_rcv;
+
+ if (udp_sk(sk)->gro_receive != new_gro_receive) {
+ /*
+ * With IPV6_ADDRFORM the gro callback could change
+ * after being set, unregister the old one, if valid.
+ */
+ if (udp_sk(sk)->gro_receive)
+ udp_tunnel_update_gro_rcv(sk, false);
+
+ WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive);
+ udp_tunnel_update_gro_rcv(sk, true);
+ }
}
#endif
}
@@ -2960,6 +2983,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_ENCAP:
+ sockopt_lock_sock(sk);
switch (val) {
case 0:
#ifdef CONFIG_XFRM
@@ -2983,6 +3007,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
err = -ENOPROTOOPT;
break;
}
+ sockopt_release_sock(sk);
break;
case UDP_NO_CHECK6_TX:
@@ -3000,13 +3025,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_GRO:
-
+ sockopt_lock_sock(sk);
/* when enabling GRO, accept the related GSO packet type */
if (valbool)
udp_tunnel_encap_enable(sk);
udp_assign_bit(GRO_ENABLED, sk, valbool);
udp_assign_bit(ACCEPT_L4, sk, valbool);
set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
+ sockopt_release_sock(sk);
break;
/*
@@ -3390,34 +3416,55 @@ struct bpf_iter__udp {
int bucket __aligned(8);
};
+union bpf_udp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
struct bpf_udp_iter_state {
struct udp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
- int offset;
- struct sock **batch;
- bool st_bucket_done;
+ union bpf_udp_iter_batch_item *batch;
};
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz);
+ unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+ union bpf_udp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct sock *sk = NULL;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ udp_portaddr_for_each_entry_from(sk)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ goto done;
+ }
+done:
+ return sk;
+}
+
static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
+ unsigned int find_cookie, end_cookie;
struct net *net = seq_file_net(seq);
- int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
- bool resized = false;
+ int resume_bucket;
+ int resizes = 0;
struct sock *sk;
+ int err = 0;
resume_bucket = state->bucket;
- resume_offset = iter->offset;
/* The current batch is done, so advance the bucket. */
- if (iter->st_bucket_done)
+ if (iter->cur_sk == iter->end_sk)
state->bucket++;
udptable = udp_get_table_seq(seq, net);
@@ -3430,62 +3477,89 @@ again:
* before releasing the bucket lock. This allows BPF programs that are
* called in seq_show to acquire the bucket lock if needed.
*/
+ find_cookie = iter->cur_sk;
+ end_cookie = iter->end_sk;
iter->cur_sk = 0;
iter->end_sk = 0;
- iter->st_bucket_done = false;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
- continue;
+ goto next_bucket;
- iter->offset = 0;
spin_lock_bh(&hslot2->lock);
- udp_portaddr_for_each_entry(sk, &hslot2->head) {
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+ /* Resume from the first (in iteration order) unseen socket from
+ * the last batch that still exists in resume_bucket. Most of
+ * the time this will just be where the last iteration left off
+ * in resume_bucket unless that socket disappeared between
+ * reads.
+ */
+ if (state->bucket == resume_bucket)
+ sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
if (seq_sk_match(seq, sk)) {
- /* Resume from the last iterated socket at the
- * offset in the bucket before iterator was stopped.
- */
- if (state->bucket == resume_bucket &&
- iter->offset < resume_offset) {
- ++iter->offset;
- continue;
- }
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
}
batch_sks++;
}
}
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1].sk;
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
+next_bucket:
+ resizes = 0;
}
- /* All done: no batch made. */
- if (!iter->end_sk)
- return NULL;
-
- if (iter->end_sk == batch_sks) {
- /* Batching is done for the current bucket; return the first
- * socket to be iterated from the batch.
- */
- iter->st_bucket_done = true;
- goto done;
- }
- if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
- resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- goto again;
- }
-done:
- return iter->batch[0];
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0].sk : NULL;
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3496,16 +3570,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* Whenever seq_next() is called, the iter->cur_sk is
* done with seq_show(), so unref the iter->cur_sk.
*/
- if (iter->cur_sk < iter->end_sk) {
- sock_put(iter->batch[iter->cur_sk++]);
- ++iter->offset;
- }
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++].sk);
/* After updating iter->cur_sk, check if there are more sockets
* available in the current bucket batch.
*/
if (iter->cur_sk < iter->end_sk)
- sk = iter->batch[iter->cur_sk];
+ sk = iter->batch[iter->cur_sk].sk;
else
/* Prepare a new batch. */
sk = bpf_iter_udp_batch(seq);
@@ -3569,8 +3641,19 @@ unlock:
static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
- while (iter->cur_sk < iter->end_sk)
- sock_put(iter->batch[iter->cur_sk++]);
+ union bpf_udp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_put(item->sk);
+ item->cookie = cookie;
+ }
}
static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
@@ -3586,10 +3669,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
}
- if (iter->cur_sk < iter->end_sk) {
+ if (iter->cur_sk < iter->end_sk)
bpf_iter_udp_put_batch(iter);
- iter->st_bucket_done = false;
- }
}
static const struct seq_operations bpf_iter_udp_seq_ops = {
@@ -3810,6 +3891,15 @@ fallback:
static int __net_init udp_pernet_init(struct net *net)
{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ int i;
+
+ /* No tunnel is configured */
+ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
+ INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
+ RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
+ }
+#endif
udp_sysctl_init(net);
udp_set_table(net);
@@ -3831,16 +3921,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz)
+ unsigned int new_batch_sz, gfp_t flags)
{
- struct sock **new_batch;
+ union bpf_udp_iter_batch_item *new_batch;
new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
- GFP_USER | __GFP_NOWARN);
+ flags | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
- bpf_iter_udp_put_batch(iter);
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
@@ -3859,10 +3952,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
if (ret)
return ret;
- ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
if (ret)
bpf_iter_fini_seq_net(priv_data);
+ iter->state.bucket = -1;
+
return ret;
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 9a8142ccbabe..9c775f8aa438 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -12,6 +12,169 @@
#include <net/udp.h>
#include <net/protocol.h>
#include <net/inet_common.h>
+#include <net/udp_tunnel.h>
+
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+
+/*
+ * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL
+ * values for the udp tunnel static call.
+ */
+static struct sk_buff *dummy_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+struct udp_tunnel_type_entry {
+ udp_tunnel_gro_rcv_t gro_receive;
+ refcount_t count;
+};
+
+#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \
+ IS_ENABLED(CONFIG_VXLAN) * 2 + \
+ IS_ENABLED(CONFIG_NET_FOU) * 2 + \
+ IS_ENABLED(CONFIG_XFRM) * 2)
+
+DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);
+static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call);
+static struct mutex udp_tunnel_gro_type_lock;
+static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES];
+static unsigned int udp_tunnel_gro_type_nr;
+static DEFINE_SPINLOCK(udp_tunnel_gro_lock);
+
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ struct udp_sock *tup, *up = udp_sk(sk);
+ struct udp_tunnel_gro *udp_tunnel_gro;
+
+ spin_lock(&udp_tunnel_gro_lock);
+ udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6];
+ if (add)
+ hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list);
+ else if (up->tunnel_list.pprev)
+ hlist_del_init(&up->tunnel_list);
+
+ if (udp_tunnel_gro->list.first &&
+ !udp_tunnel_gro->list.first->next) {
+ tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock,
+ tunnel_list);
+
+ rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup);
+ } else {
+ RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL);
+ }
+
+ spin_unlock(&udp_tunnel_gro_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup);
+
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add)
+{
+ struct udp_tunnel_type_entry *cur = NULL;
+ struct udp_sock *up = udp_sk(sk);
+ int i, old_gro_type_nr;
+
+ if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive)
+ return;
+
+ mutex_lock(&udp_tunnel_gro_type_lock);
+
+ /* Check if the static call is permanently disabled. */
+ if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES)
+ goto out;
+
+ for (i = 0; i < udp_tunnel_gro_type_nr; i++)
+ if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive)
+ cur = &udp_tunnel_gro_types[i];
+
+ old_gro_type_nr = udp_tunnel_gro_type_nr;
+ if (add) {
+ /*
+ * Update the matching entry, if found, or add a new one
+ * if needed
+ */
+ if (cur) {
+ refcount_inc(&cur->count);
+ goto out;
+ }
+
+ if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) {
+ pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n");
+ /* Ensure static call will never be enabled */
+ udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1;
+ } else {
+ cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++];
+ refcount_set(&cur->count, 1);
+ cur->gro_receive = up->gro_receive;
+ }
+ } else {
+ /*
+ * The stack cleanups only successfully added tunnel, the
+ * lookup on removal should never fail.
+ */
+ if (WARN_ON_ONCE(!cur))
+ goto out;
+
+ if (!refcount_dec_and_test(&cur->count))
+ goto out;
+
+ /* Avoid gaps, so that the enable tunnel has always id 0 */
+ *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr];
+ }
+
+ if (udp_tunnel_gro_type_nr == 1) {
+ static_call_update(udp_tunnel_gro_rcv,
+ udp_tunnel_gro_types[0].gro_receive);
+ static_branch_enable(&udp_tunnel_static_call);
+ } else if (old_gro_type_nr == 1) {
+ static_branch_disable(&udp_tunnel_static_call);
+ static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv);
+ }
+
+out:
+ mutex_unlock(&udp_tunnel_gro_type_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv);
+
+static void udp_tunnel_gro_init(void)
+{
+ mutex_init(&udp_tunnel_gro_type_lock);
+}
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ if (static_branch_likely(&udp_tunnel_static_call)) {
+ if (unlikely(gro_recursion_inc_test(skb))) {
+ NAPI_GRO_CB(skb)->flush |= 1;
+ return NULL;
+ }
+ return static_call(udp_tunnel_gro_rcv)(sk, head, skb);
+ }
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#else
+
+static void udp_tunnel_gro_init(void) {}
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#endif
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
@@ -681,7 +844,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+ pp = udp_tunnel_gro_rcv(sk, head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
@@ -694,8 +857,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
{
const struct iphdr *iph = skb_gro_network_header(skb);
struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
int iif, sdif;
+ sk = udp_tunnel_sk(net, false);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
inet_get_iif_sdif(skb, &iif, &sdif);
return __udp4_lib_lookup(net, iph->saddr, sport,
@@ -826,5 +994,7 @@ int __init udpv4_offload_init(void)
.gro_complete = udp4_gro_complete,
},
};
+
+ udp_tunnel_gro_init();
return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
}
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 619a53eb672d..2326548997d3 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -58,6 +58,15 @@ error:
}
EXPORT_SYMBOL(udp_sock_create4);
+static bool sk_saddr_any(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#else
+ return !sk->sk_rcv_saddr;
+#endif
+}
+
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
struct udp_tunnel_sock_cfg *cfg)
{
@@ -80,6 +89,12 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->gro_complete = cfg->gro_complete;
udp_tunnel_encap_enable(sk);
+
+ udp_tunnel_update_gro_rcv(sk, true);
+
+ if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) &&
+ sk->sk_kern_sock)
+ udp_tunnel_update_gro_lookup(net, sk, true);
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c6b22170dc49..43b19adfbf88 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5349,12 +5349,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
struct ifaddrmsg *ifm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
return -EINVAL;
@@ -5487,7 +5487,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
struct ifaddrmsg *ifm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request");
return -EINVAL;
}
@@ -5496,7 +5497,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
ifa_ipv6_policy, extack);
- ifm = nlmsg_data(nlh);
if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
return -EINVAL;
@@ -6115,7 +6115,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
{
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
return -EINVAL;
}
@@ -6125,7 +6126,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index ab054f329e12..fb63ffbcfc64 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -473,12 +473,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
{
struct ifaddrlblmsg *ifal;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) {
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
return -EINVAL;
}
- ifal = nlmsg_data(nlh);
if (ifal->__ifal_reserved || ifal->ifal_prefixlen ||
ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
@@ -543,7 +543,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb,
struct ifaddrlblmsg *ifal;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) {
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request");
return -EINVAL;
}
@@ -552,7 +553,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb,
IFAL_MAX, ifal_policy, extack);
- ifal = nlmsg_data(nlh);
if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request");
return -EINVAL;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f60ec8b0f8ea..acaff1296783 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -715,6 +715,7 @@ const struct proto_ops inet6_stream_ops = {
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
+EXPORT_SYMBOL_GPL(inet6_stream_ops);
const struct proto_ops inet6_dgram_ops = {
.family = PF_INET6,
@@ -881,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
}
return false;
}
-EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index dbcf556a35bb..8f500eaf33cf 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
return dst;
}
-EXPORT_SYMBOL(inet6_csk_route_req);
static inline
struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
@@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
dst = inet6_csk_route_socket(sk, &fl6);
return IS_ERR(dst) ? NULL : dst;
}
-EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
index 09065187378e..40df8bdfaacd 100644
--- a/net/ipv6/ioam6_iptunnel.c
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -38,6 +38,7 @@ struct ioam6_lwt_freq {
};
struct ioam6_lwt {
+ struct dst_entry null_dst;
struct dst_cache cache;
struct ioam6_lwt_freq freq;
atomic_t pkt_cnt;
@@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla,
if (err)
goto free_lwt;
+ /* This "fake" dst_entry will be stored in a dst_cache, which will call
+ * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
+ * will never be called. For that, its initial refcount is 1 and +1 when
+ * it is stored in the cache. Then, +1/-1 each time we read the cache
+ * and release it. Long story short, we're fine.
+ */
+ dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT);
+
atomic_set(&ilwt->pkt_cnt, 0);
ilwt->freq.k = freq_k;
ilwt->freq.n = freq_n;
@@ -336,7 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL;
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
struct ioam6_lwt *ilwt;
int err = -EINVAL;
u32 pkt_cnt;
@@ -344,7 +354,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
goto drop;
- ilwt = ioam6_lwt_state(dst->lwtstate);
+ ilwt = ioam6_lwt_state(orig_dst->lwtstate);
/* Check for insertion frequency (i.e., "k over n" insertions) */
pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
@@ -352,9 +362,20 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
goto out;
local_bh_disable();
- cache_dst = dst_cache_get(&ilwt->cache);
+ dst = dst_cache_get(&ilwt->cache);
local_bh_enable();
+ /* This is how we notify that the destination does not change after
+ * transformation and that we need to use orig_dst instead of the cache
+ */
+ if (dst == &ilwt->null_dst) {
+ dst_release(dst);
+
+ dst = orig_dst;
+ /* keep refcount balance: dst_release() is called at the end */
+ dst_hold(dst);
+ }
+
switch (ilwt->mode) {
case IOAM6_IPTUNNEL_MODE_INLINE:
do_inline:
@@ -362,7 +383,7 @@ do_inline:
if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
goto out;
- err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst);
+ err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst);
if (unlikely(err))
goto drop;
@@ -372,7 +393,7 @@ do_encap:
/* Encapsulation (ip6ip6) */
err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
ilwt->has_tunsrc, &ilwt->tunsrc,
- &ilwt->tundst, cache_dst);
+ &ilwt->tundst, dst);
if (unlikely(err))
goto drop;
@@ -390,7 +411,7 @@ do_encap:
goto drop;
}
- if (unlikely(!cache_dst)) {
+ if (unlikely(!dst)) {
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct flowi6 fl6;
@@ -401,20 +422,27 @@ do_encap:
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = hdr->nexthdr;
- cache_dst = ip6_route_output(net, NULL, &fl6);
- if (cache_dst->error) {
- err = cache_dst->error;
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
goto drop;
}
- /* cache only if we don't create a dst reference loop */
- if (dst->lwtstate != cache_dst->lwtstate) {
- local_bh_disable();
- dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr);
- local_bh_enable();
- }
-
- err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev));
+ /* If the destination is the same after transformation (which is
+ * a valid use case for IOAM), then we don't want to add it to
+ * the cache in order to avoid a reference loop. Instead, we add
+ * our fake dst_entry to the cache as a way to detect this case.
+ * Otherwise, we add the resolved destination to the cache.
+ */
+ local_bh_disable();
+ if (orig_dst->lwtstate == dst->lwtstate)
+ dst_cache_set_ip6(&ilwt->cache,
+ &ilwt->null_dst, &fl6.saddr);
+ else
+ dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
if (unlikely(err))
goto drop;
}
@@ -422,22 +450,26 @@ do_encap:
/* avoid lwtunnel_output() reentry loop when destination is the same
* after transformation (e.g., with the inline mode)
*/
- if (dst->lwtstate != cache_dst->lwtstate) {
+ if (orig_dst->lwtstate != dst->lwtstate) {
skb_dst_drop(skb);
- skb_dst_set(skb, cache_dst);
+ skb_dst_set(skb, dst);
return dst_output(net, sk, skb);
}
out:
- dst_release(cache_dst);
- return dst->lwtstate->orig_output(net, sk, skb);
+ dst_release(dst);
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
drop:
- dst_release(cache_dst);
+ dst_release(dst);
kfree_skb(skb);
return err;
}
static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
+ /* Since the refcount of per-cpu dst_entry caches will never be 0 (see
+ * why above) when our "fake" dst_entry is used, it is not necessary to
+ * remove them before calling dst_cache_destroy()
+ */
dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bf727149fdec..7094d7708686 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -249,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
+ struct fib6_table *tb, *new_tb;
if (id == 0)
id = RT6_TABLE_MAIN;
+
tb = fib6_get_table(net, id);
if (tb)
return tb;
- tb = fib6_alloc_table(net, id);
- if (tb)
- fib6_link_table(net, tb);
+ new_tb = fib6_alloc_table(net, id);
+ if (!new_tb)
+ return NULL;
+
+ spin_lock_bh(&net->ipv6.fib_table_hash_lock);
+
+ tb = fib6_get_table(net, id);
+ if (unlikely(tb)) {
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
+ kfree(new_tb);
+ return tb;
+ }
+
+ fib6_link_table(net, new_tb);
+
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
- return tb;
+ return new_tb;
}
EXPORT_SYMBOL_GPL(fib6_new_table);
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
struct hlist_head *head;
- unsigned int h;
+ struct fib6_table *tb;
- if (id == 0)
+ if (!id)
id = RT6_TABLE_MAIN;
- h = id & (FIB6_TABLE_HASHSZ - 1);
- rcu_read_lock();
- head = &net->ipv6.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
- if (tb->tb6_id == id) {
- rcu_read_unlock();
+
+ head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
+
+ /* See comment in fib6_link_table(). RCU is not required,
+ * but rcu_dereference_raw() is used to avoid data-race.
+ */
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true)
+ if (tb->tb6_id == id)
return tb;
- }
- }
- rcu_read_unlock();
return NULL;
}
@@ -1015,8 +1027,9 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
.table = table
};
- nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
- &arg);
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg);
+ rcu_read_unlock();
} else {
struct fib6_nh *fib6_nh;
@@ -1034,8 +1047,14 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
rt6_flush_exceptions(rt);
fib6_drop_pcpu_from(rt, table);
- if (rt->nh && !list_empty(&rt->nh_list))
- list_del_init(&rt->nh_list);
+ if (rt->nh) {
+ spin_lock(&rt->nh->lock);
+
+ if (!list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
+
+ spin_unlock(&rt->nh->lock);
+ }
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@@ -1069,8 +1088,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
*/
static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
- struct nl_info *info,
- struct netlink_ext_ack *extack)
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -1203,7 +1222,9 @@ next_iter:
fib6_nsiblings++;
}
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+ rcu_read_lock();
rt6_multipath_rebalance(temp_sibling);
+ rcu_read_unlock();
}
/*
@@ -1246,7 +1267,9 @@ add:
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
list_del_rcu(&rt->fib6_siblings);
+ rcu_read_lock();
rt6_multipath_rebalance(next_sibling);
+ rcu_read_unlock();
return err;
}
}
@@ -1294,10 +1317,9 @@ add:
}
nsiblings = iter->fib6_nsiblings;
iter->fib6_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
+ list_add(&iter->purge_link, purge_list);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
- fib6_info_release(iter);
if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
@@ -1310,10 +1332,9 @@ add:
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->fib6_next;
iter->fib6_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
+ list_add(&iter->purge_link, purge_list);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
- fib6_info_release(iter);
nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
@@ -1329,6 +1350,28 @@ add:
return 0;
}
+static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
+{
+ int err;
+
+ spin_lock(&rt->nh->lock);
+
+ if (rt->nh->dead) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
+ if (!err)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
+ }
+
+ spin_unlock(&rt->nh->lock);
+
+ return err;
+}
+
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
@@ -1383,6 +1426,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
struct nl_info *info, struct netlink_ext_ack *extack)
{
struct fib6_table *table = rt->fib6_table;
+ LIST_HEAD(purge_list);
struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
struct fib6_node *pn = NULL;
@@ -1485,10 +1529,19 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, extack);
+ if (rt->nh)
+ err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
+ else
+ err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
if (!err) {
- if (rt->nh)
- list_add(&rt->nh_list, &rt->nh->f6i_list);
+ struct fib6_info *iter, *next;
+
+ list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
+ list_del(&iter->purge_link);
+ fib6_purge_rt(iter, fn, info->nl_net);
+ fib6_info_release(iter);
+ }
+
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
if (rt->fib6_flags & RTF_EXPIRES)
@@ -2423,6 +2476,8 @@ static int __net_init fib6_net_init(struct net *net)
if (!net->ipv6.fib_table_hash)
goto out_rt6_stats;
+ spin_lock_init(&net->ipv6.fib_table_hash_lock);
+
net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
GFP_KERNEL);
if (!net->ipv6.fib6_main_tbl)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 957ca98fa70f..2dc9dcffe2ca 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1570,7 +1570,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = {
.flags = INET6_PROTO_FINAL,
};
-static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
+static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head)
{
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
struct net_device *dev, *aux;
@@ -1587,16 +1587,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
struct ip6_tnl *t;
- t = rtnl_dereference(ign->tunnels[prio][h]);
+ t = rtnl_net_dereference(net, ign->tunnels[prio][h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
- unregister_netdevice_queue(t->dev,
- head);
- t = rtnl_dereference(t->next);
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
}
@@ -1640,19 +1640,9 @@ err_alloc_dev:
return err;
}
-static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
-{
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- ip6gre_destroy_tunnels(net, dev_to_kill);
-}
-
static struct pernet_operations ip6gre_net_ops = {
.init = ip6gre_init_net,
- .exit_batch_rtnl = ip6gre_exit_batch_rtnl,
+ .exit_rtnl = ip6gre_exit_rtnl_net,
.id = &ip6gre_net_id,
.size = sizeof(struct ip6gre_net),
};
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 581bc6289081..7bd29a9ff0db 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -259,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
}
/*
- * xmit an sk_buff (used by TCP, SCTP and DCCP)
+ * xmit an sk_buff (used by TCP and SCTP)
* Note : socket lock is not held for SYNACK packets, but might be modified
* by calls to skb_set_owner_w() and ipv6_local_error(),
* which are using proper atomic operations or spinlocks.
@@ -1524,7 +1524,8 @@ emsgsize:
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a04dd1bb4b19..894d3158a6f0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2210,7 +2210,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
.priority = 1,
};
-static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
+static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list)
{
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct net_device *dev, *aux;
@@ -2222,25 +2222,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head
unregister_netdevice_queue(dev, list);
for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
- t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, list);
- t = rtnl_dereference(t->next);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
- t = rtnl_dereference(ip6n->tnls_wc[0]);
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, list);
- t = rtnl_dereference(t->next);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
@@ -2287,19 +2289,9 @@ err_alloc_dev:
return err;
}
-static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
-{
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- ip6_tnl_destroy_tunnels(net, dev_to_kill);
-}
-
static struct pernet_operations ip6_tnl_net_ops = {
.init = ip6_tnl_init_net,
- .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl,
+ .exit_rtnl = ip6_tnl_exit_rtnl_net,
.id = &ip6_tnl_net_id,
.size = sizeof(struct ip6_tnl_net),
};
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 09ec4b0ad7dc..40464a88bca6 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1112,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
.get_link_net = ip6_tnl_get_link_net,
};
-static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
- struct list_head *list)
+static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list)
{
- int h;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
struct ip6_tnl *t;
+ int h;
for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
- t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
while (t) {
unregister_netdevice_queue(t->dev, list);
- t = rtnl_dereference(t->next);
+ t = rtnl_net_dereference(net, t->next);
}
}
- t = rtnl_dereference(ip6n->tnls_wc[0]);
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
if (t)
unregister_netdevice_queue(t->dev, list);
}
@@ -1170,22 +1170,9 @@ err_alloc_dev:
return err;
}
-static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
-{
- struct vti6_net *ip6n;
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list) {
- ip6n = net_generic(net, vti6_net_id);
- vti6_destroy_tunnels(ip6n, dev_to_kill);
- }
-}
-
static struct pernet_operations vti6_net_ops = {
.init = vti6_init_net,
- .exit_batch_rtnl = vti6_exit_batch_rtnl,
+ .exit_rtnl = vti6_exit_rtnl_net,
.id = &vti6_net_id,
.size = sizeof(struct vti6_net),
};
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 581ce055bf52..4541836ee3da 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -164,20 +164,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
- if (first_len - hlen > mtu ||
- skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ if (first_len - hlen > mtu)
goto blackhole;
- if (skb_cloned(skb))
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
goto slow_path;
skb_walk_frags(skb, frag2) {
- if (frag2->len > mtu ||
- skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ if (frag2->len > mtu)
goto blackhole;
/* Partially cloned skb? */
- if (skb_shared(frag2))
+ if (skb_shared(frag2) ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
goto slow_path;
}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d5602950ae7..d585ac3c1113 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 0c39c77fe8a8..b903c62c00c9 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in6_addr *gw, int oif)
{
local_bh_disable();
- if (this_cpu_read(nf_skb_duplicated))
+ if (current->in_nf_duplicate)
goto out;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->hop_limit;
}
if (nf_dup_ipv6_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip6_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 7fd9d7b21cd4..421036a3605b 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
fl6->flowi6_mark = pkt->skb->mark;
fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
+ fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);
return lookup_flags;
}
@@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
else if (priv->flags & NFTA_FIB_F_OIF)
dev = nft_out(pkt);
- fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev);
-
nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
@@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
{
const struct nft_fib *priv = nft_expr_priv(expr);
int noff = skb_network_offset(pkt->skb);
+ const struct net_device *found = NULL;
const struct net_device *oif = NULL;
u32 *dest = &regs->data[priv->dreg];
struct ipv6hdr *iph, _iph;
@@ -165,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
.flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
- .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),
};
struct rt6_info *rt;
int lookup_flags;
@@ -203,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
goto put_rt_err;
- if (oif && oif != rt->rt6i_idev->dev &&
- l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex)
- goto put_rt_err;
+ if (!oif) {
+ found = rt->rt6i_idev->dev;
+ } else {
+ if (oif == rt->rt6i_idev->dev ||
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex)
+ found = oif;
+ }
- nft_fib_store_result(dest, priv, rt->rt6i_idev->dev);
+ nft_fib_store_result(dest, priv, found);
put_rt_err:
ip6_rt_put(rt);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 96f1621e2381..0143262094b0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,11 +1820,13 @@ static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
void rt6_flush_exceptions(struct fib6_info *f6i)
{
- if (f6i->nh)
- nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
- f6i);
- else
+ if (f6i->nh) {
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
+ rcu_read_unlock();
+ } else {
fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+ }
}
/* Find cached rt in the hash table inside passed in rt
@@ -2492,8 +2494,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
hash_keys.basic.ip_proto = fl6->flowi6_proto;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
- if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
- hash_keys.ports.src = fl6->fl6_sport;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
+ }
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl6->fl6_dport;
@@ -2547,7 +2553,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
- hash_keys.ports.src = fl6->fl6_sport;
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
hash_keys.ports.dst = fl6->fl6_dport;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
@@ -3729,61 +3738,14 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
- gfp_t gfp_flags,
- struct netlink_ext_ack *extack)
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
struct net *net = cfg->fc_nlinfo.nl_net;
- struct fib6_info *rt = NULL;
- struct nexthop *nh = NULL;
struct fib6_table *table;
- struct fib6_nh *fib6_nh;
- int err = -EINVAL;
- int addr_type;
-
- /* RTF_PCPU is an internal flag; can not be set by userspace */
- if (cfg->fc_flags & RTF_PCPU) {
- NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
- goto out;
- }
-
- /* RTF_CACHE is an internal flag; can not be set by userspace */
- if (cfg->fc_flags & RTF_CACHE) {
- NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
- goto out;
- }
-
- if (cfg->fc_type > RTN_MAX) {
- NL_SET_ERR_MSG(extack, "Invalid route type");
- goto out;
- }
-
- if (cfg->fc_dst_len > 128) {
- NL_SET_ERR_MSG(extack, "Invalid prefix length");
- goto out;
- }
- if (cfg->fc_src_len > 128) {
- NL_SET_ERR_MSG(extack, "Invalid source address length");
- goto out;
- }
-#ifndef CONFIG_IPV6_SUBTREES
- if (cfg->fc_src_len) {
- NL_SET_ERR_MSG(extack,
- "Specifying source address requires IPV6_SUBTREES to be enabled");
- goto out;
- }
-#endif
- if (cfg->fc_nh_id) {
- nh = nexthop_find_by_id(net, cfg->fc_nh_id);
- if (!nh) {
- NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
- goto out;
- }
- err = fib6_check_nexthop(nh, cfg, extack);
- if (err)
- goto out;
- }
+ struct fib6_info *rt;
+ int err;
- err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
!(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
table = fib6_get_table(net, cfg->fc_table);
@@ -3794,22 +3756,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
} else {
table = fib6_new_table(net, cfg->fc_table);
}
+ if (!table) {
+ err = -ENOBUFS;
+ goto err;
+ }
- if (!table)
- goto out;
-
- err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags, !nh);
- if (!rt)
- goto out;
+ rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id);
+ if (!rt) {
+ err = -ENOMEM;
+ goto err;
+ }
rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
extack);
if (IS_ERR(rt->fib6_metrics)) {
err = PTR_ERR(rt->fib6_metrics);
- /* Do not leave garbage there. */
- rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
- goto out_free;
+ goto free;
}
if (cfg->fc_flags & RTF_ADDRCONF)
@@ -3817,12 +3779,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_EXPIRES)
fib6_set_expires(rt, jiffies +
- clock_t_to_jiffies(cfg->fc_expires));
+ clock_t_to_jiffies(cfg->fc_expires));
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
- rt->fib6_protocol = cfg->fc_protocol;
+ rt->fib6_protocol = cfg->fc_protocol;
rt->fib6_table = table;
rt->fib6_metric = cfg->fc_metric;
rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
@@ -3835,23 +3797,54 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
- if (nh) {
- if (rt->fib6_src.plen) {
- NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ return rt;
+free:
+ kfree(rt);
+err:
+ return ERR_PTR(err);
+}
+
+static int ip6_route_info_create_nh(struct fib6_info *rt,
+ struct fib6_config *cfg,
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct fib6_nh *fib6_nh;
+ int err;
+
+ if (cfg->fc_nh_id) {
+ struct nexthop *nh;
+
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
goto out_free;
}
+
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out_free;
+
if (!nexthop_get(nh)) {
NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
err = -ENOENT;
goto out_free;
}
+
rt->nh = nh;
fib6_nh = nexthop_fib6_nh(rt->nh);
+
+ rcu_read_unlock();
} else {
+ int addr_type;
+
err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
if (err)
- goto out;
+ goto out_release;
fib6_nh = rt->fib6_nh;
@@ -3870,21 +3863,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
err = -EINVAL;
- goto out;
+ goto out_release;
}
rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
rt->fib6_prefsrc.plen = 128;
- } else
- rt->fib6_prefsrc.plen = 0;
+ }
- return rt;
-out:
+ return 0;
+out_release:
fib6_info_release(rt);
- return ERR_PTR(err);
+ return err;
out_free:
+ rcu_read_unlock();
ip_fib_metrics_put(rt->fib6_metrics);
kfree(rt);
- return ERR_PTR(err);
+ return err;
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
@@ -3897,6 +3890,10 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
if (IS_ERR(rt))
return PTR_ERR(rt);
+ err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack);
+ if (err)
+ return err;
+
err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
fib6_info_release(rt);
@@ -4125,9 +4122,9 @@ static int ip6_route_del(struct fib6_config *cfg,
if (rt->nh) {
if (!fib6_info_hold_safe(rt))
continue;
- rcu_read_unlock();
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ break;
}
if (cfg->fc_nh_id)
continue;
@@ -4142,13 +4139,13 @@ static int ip6_route_del(struct fib6_config *cfg,
continue;
if (!fib6_info_hold_safe(rt))
continue;
- rcu_read_unlock();
/* if gateway was specified only delete the one hop */
if (cfg->fc_flags & RTF_GATEWAY)
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
-
- return __ip6_del_rt_siblings(rt, cfg);
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ else
+ err = __ip6_del_rt_siblings(rt, cfg);
+ break;
}
}
rcu_read_unlock();
@@ -4482,6 +4479,53 @@ void rt6_purge_dflt_routers(struct net *net)
rcu_read_unlock();
}
+static int fib6_config_validate(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ /* RTF_PCPU is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_PCPU) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
+ goto errout;
+ }
+
+ /* RTF_CACHE is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_CACHE) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
+ goto errout;
+ }
+
+ if (cfg->fc_type > RTN_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ goto errout;
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (cfg->fc_src_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid source address length");
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id && cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto errout;
+ }
+#else
+ if (cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack,
+ "Specifying source address requires IPV6_SUBTREES to be enabled");
+ goto errout;
+ }
+#endif
+ return 0;
+errout:
+ return -EINVAL;
+}
+
static void rtmsg_to_fib6_config(struct net *net,
struct in6_rtmsg *rtmsg,
struct fib6_config *cfg)
@@ -4517,9 +4561,12 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
rtmsg_to_fib6_config(net, rtmsg, &cfg);
- rtnl_lock();
switch (cmd) {
case SIOCADDRT:
+ err = fib6_config_validate(&cfg, NULL);
+ if (err)
+ break;
+
/* Only do the default setting of fc_metric in route adding */
if (cfg.fc_metric == 0)
cfg.fc_metric = IP6_RT_PRIO_USER;
@@ -4529,7 +4576,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
err = ip6_route_del(&cfg, NULL);
break;
}
- rtnl_unlock();
+
return err;
}
@@ -4619,6 +4666,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
.fc_ignore_dev_down = true,
};
struct fib6_info *f6i;
+ int err;
if (anycast) {
cfg.fc_type = RTN_ANYCAST;
@@ -4629,14 +4677,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
}
f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
- if (!IS_ERR(f6i)) {
- f6i->dst_nocount = true;
+ if (IS_ERR(f6i))
+ return f6i;
- if (!anycast &&
- (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
- READ_ONCE(idev->cnf.disable_policy)))
- f6i->dst_nopolicy = true;
- }
+ err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
+ READ_ONCE(idev->cnf.disable_policy)))
+ f6i->dst_nopolicy = true;
return f6i;
}
@@ -5051,12 +5104,60 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_FLOWLABEL] = { .type = NLA_BE32 },
};
+static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack,
+ bool newroute)
+{
+ struct rtnexthop *rtnh;
+ int remaining;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ if (!rtnh_ok(rtnh, remaining)) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
+ do {
+ bool has_gateway = cfg->fc_flags & RTF_GATEWAY;
+ int attrlen = rtnh_attrlen(rtnh);
+
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs;
+
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ if (nla_len(nla) < sizeof(cfg->fc_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid IPv6 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ has_gateway = true;
+ }
+ }
+
+ if (newroute && (cfg->fc_nh_id || !has_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ return -EINVAL;
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } while (rtnh_ok(rtnh, remaining));
+
+ return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack);
+}
+
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rtmsg *rtm;
+ bool newroute = nlh->nlmsg_type == RTM_NEWROUTE;
struct nlattr *tb[RTA_MAX+1];
+ struct rtmsg *rtm;
unsigned int pref;
int err;
@@ -5165,9 +5266,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
- err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
- cfg->fc_mp_len,
- extack, true);
+ err = rtm_to_fib6_multipath_config(cfg, extack, newroute);
if (err < 0)
goto errout;
}
@@ -5186,8 +5285,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_ENCAP_TYPE]) {
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
- err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
- extack, true);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
if (err < 0)
goto errout;
}
@@ -5209,29 +5307,28 @@ errout:
struct rt6_nh {
struct fib6_info *fib6_info;
struct fib6_config r_cfg;
- struct list_head next;
+ struct list_head list;
};
-static int ip6_route_info_append(struct net *net,
- struct list_head *rt6_nh_list,
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
struct fib6_info *rt,
struct fib6_config *r_cfg)
{
struct rt6_nh *nh;
- int err = -EEXIST;
- list_for_each_entry(nh, rt6_nh_list, next) {
+ list_for_each_entry(nh, rt6_nh_list, list) {
/* check if fib6_info already exists */
if (rt6_duplicate_nexthop(nh->fib6_info, rt))
- return err;
+ return -EEXIST;
}
nh = kzalloc(sizeof(*nh), GFP_KERNEL);
if (!nh)
return -ENOMEM;
+
nh->fib6_info = rt;
memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
- list_add_tail(&nh->next, rt6_nh_list);
+ list_add_tail(&nh->list, rt6_nh_list);
return 0;
}
@@ -5287,37 +5384,26 @@ out:
return should_notify;
}
-static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
- struct netlink_ext_ack *extack)
-{
- if (nla_len(nla) < sizeof(*gw)) {
- NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
- return -EINVAL;
- }
-
- *gw = nla_get_in6_addr(nla);
-
- return 0;
-}
-
static int ip6_route_multipath_add(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
+ struct rt6_nh *nh, *nh_safe;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
- struct fib6_info *rt;
+ LIST_HEAD(rt6_nh_list);
struct rt6_nh *err_nh;
- struct rt6_nh *nh, *nh_safe;
+ struct fib6_info *rt;
__u16 nlflags;
int remaining;
int attrlen;
- int err = 1;
+ int replace;
int nhn = 0;
- int replace = (cfg->fc_nlinfo.nlh &&
- (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
- LIST_HEAD(rt6_nh_list);
+ int err;
+
+ replace = (cfg->fc_nlinfo.nlh &&
+ (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
@@ -5340,18 +5426,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
- extack);
- if (err)
- goto cleanup;
-
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
- r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
- /* RTA_ENCAP_TYPE length checked in
- * lwtunnel_valid_encap_type_attr
- */
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla)
r_cfg.fc_encap_type = nla_get_u16(nla);
@@ -5364,18 +5443,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rt = NULL;
goto cleanup;
}
- if (!rt6_qualify_for_ecmp(rt)) {
- err = -EINVAL;
- NL_SET_ERR_MSG(extack,
- "Device only routes can not be added for IPv6 using the multipath API.");
- fib6_info_release(rt);
+
+ err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack);
+ if (err) {
+ rt = NULL;
goto cleanup;
}
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
- err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
- rt, &r_cfg);
+ err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
fib6_info_release(rt);
goto cleanup;
@@ -5384,12 +5461,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = rtnh_next(rtnh, &remaining);
}
- if (list_empty(&rt6_nh_list)) {
- NL_SET_ERR_MSG(extack,
- "Invalid nexthop configuration - no valid nexthops");
- return -EINVAL;
- }
-
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
* the full route when done
@@ -5402,7 +5473,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
info->skip_notify_kernel = 1;
err_nh = NULL;
- list_for_each_entry(nh, &rt6_nh_list, next) {
+ list_for_each_entry(nh, &rt6_nh_list, list) {
err = __ip6_ins_rt(nh->fib6_info, info, extack);
if (err) {
@@ -5470,16 +5541,16 @@ add_errout:
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
/* Delete routes that were already added */
- list_for_each_entry(nh, &rt6_nh_list, next) {
+ list_for_each_entry(nh, &rt6_nh_list, list) {
if (err_nh == nh)
break;
ip6_route_del(&nh->r_cfg, extack);
}
cleanup:
- list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
+ list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) {
fib6_info_release(nh->fib6_info);
- list_del(&nh->next);
+ list_del(&nh->list);
kfree(nh);
}
@@ -5511,21 +5582,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
- extack);
- if (err) {
- last_err = err;
- goto next_rtnh;
- }
-
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
}
+
err = ip6_route_del(&r_cfg, extack);
if (err)
last_err = err;
-next_rtnh:
rtnh = rtnh_next(rtnh, &remaining);
}
@@ -5542,15 +5607,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (cfg.fc_nh_id &&
- !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
- NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
- return -EINVAL;
+ if (cfg.fc_nh_id) {
+ rcu_read_lock();
+ err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id);
+ rcu_read_unlock();
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
}
- if (cfg.fc_mp)
+ if (cfg.fc_mp) {
return ip6_route_multipath_del(&cfg, extack);
- else {
+ } else {
cfg.fc_delete_all_nh = 1;
return ip6_route_del(&cfg, extack);
}
@@ -5566,6 +5636,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ err = fib6_config_validate(&cfg, extack);
+ if (err)
+ return err;
+
if (cfg.fc_metric == 0)
cfg.fc_metric = IP6_RT_PRIO_USER;
@@ -6030,7 +6104,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG_MOD(extack,
"Invalid header for get route request");
return -EINVAL;
@@ -6040,7 +6115,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_ipv6_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
(rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
@@ -6255,6 +6329,8 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ rcu_read_lock();
+
skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -6267,10 +6343,14 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
kfree_skb(skb);
goto errout;
}
+
+ rcu_read_unlock();
+
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
info->nlh, GFP_ATOMIC);
return;
errout:
+ rcu_read_unlock();
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
@@ -6760,9 +6840,9 @@ static void bpf_iter_unregister(void)
static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = {
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
- .doit = inet6_rtm_newroute},
+ .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
- .doit = inet6_rtm_delroute},
+ .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
.doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index bbf5b84a70fc..f78ecb6ad838 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -40,7 +40,14 @@
#include <net/seg6_hmac.h>
#include <linux/random.h>
-static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
+struct hmac_storage {
+ local_lock_t bh_lock;
+ char hmac_ring[SEG6_HMAC_RING_SIZE];
+};
+
+static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
{
@@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
*/
local_bh_disable();
- ring = this_cpu_ptr(hmac_ring);
+ local_lock_nested_bh(&hmac_storage.bh_lock);
+ ring = this_cpu_ptr(hmac_storage.hmac_ring);
off = ring;
/* source address */
@@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
SEG6_HMAC_MAX_DIGESTSIZE);
+ local_unlock_nested_bh(&hmac_storage.bh_lock);
local_bh_enable();
if (dgsize < 0)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 9a0f32acb750..a72dbca9e8fc 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1804,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = {
};
#endif
-static void __net_exit sit_destroy_tunnels(struct net *net,
- struct list_head *head)
+static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head)
{
struct sit_net *sitn = net_generic(net, sit_net_id);
struct net_device *dev, *aux;
@@ -1820,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) {
struct ip_tunnel *t;
- t = rtnl_dereference(sitn->tunnels[prio][h]);
+ t = rtnl_net_dereference(net, sitn->tunnels[prio][h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
- unregister_netdevice_queue(t->dev,
- head);
- t = rtnl_dereference(t->next);
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
}
@@ -1881,19 +1880,9 @@ err_alloc_dev:
return err;
}
-static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
-{
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- sit_destroy_tunnels(net, dev_to_kill);
-}
-
static struct pernet_operations sit_net_ops = {
.init = sit_init_net,
- .exit_batch_rtnl = sit_exit_batch_rtnl,
+ .exit_rtnl = sit_exit_rtnl_net,
.id = &sit_net_id,
.size = sizeof(struct sit_net),
};
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b03c223eda4f..e8e68a142649 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
+ fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
@@ -1970,7 +1972,8 @@ do_time_wait:
goto csum_error;
}
- tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn);
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
switch (tw_status) {
case TCP_TW_SYN:
{
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 024458ef163c..7317f8e053f1 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -46,6 +46,7 @@
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
#include <net/ip6_tunnel.h>
+#include <net/udp_tunnel.h>
#include <net/xfrm.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
@@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk)
if (udp_test_bit(ENCAP_ENABLED, sk)) {
static_branch_dec(&udpv6_encap_needed_key);
udp_encap_disable();
+ udp_tunnel_cleanup_gro(sk);
}
}
}
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 404212dfc99a..d8445ac1b2e4 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
int iif, sdif;
+ sk = udp_tunnel_sk(net, true);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
inet6_get_iif_sdif(skb, &iif, &sdif);
return __udp6_lib_lookup(net, &iph->saddr, sport,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c56bb4f451e6..efc2a91f4c48 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2630,7 +2630,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
}
return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i,
- kma ? &k : NULL, net, NULL, 0, NULL);
+ kma ? &k : NULL, net, NULL, 0, NULL, NULL);
out:
return err;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 9f683f838431..d9d88f2f2831 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -146,8 +146,8 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata,
struct ieee80211_bss_conf *link_conf)
{
struct ieee80211_sub_if_data *tx_sdata;
+ struct ieee80211_bss_conf *old;
- sdata->vif.mbssid_tx_vif = NULL;
link_conf->bssid_index = 0;
link_conf->nontransmitted = false;
link_conf->ema_ap = false;
@@ -156,14 +156,26 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev)
return -EINVAL;
+ old = sdata_dereference(link_conf->tx_bss_conf, sdata);
+ if (old)
+ return -EALREADY;
+
tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev);
if (!tx_sdata)
return -EINVAL;
if (tx_sdata == sdata) {
- sdata->vif.mbssid_tx_vif = &sdata->vif;
+ rcu_assign_pointer(link_conf->tx_bss_conf, link_conf);
} else {
- sdata->vif.mbssid_tx_vif = &tx_sdata->vif;
+ struct ieee80211_bss_conf *tx_bss_conf;
+
+ tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id],
+ sdata);
+ if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf)
+ return -EINVAL;
+
+ rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf);
+
link_conf->nontransmitted = true;
link_conf->bssid_index = params->index;
}
@@ -1278,9 +1290,9 @@ static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata)
sdata->vif.type != NL80211_IFTYPE_P2P_GO)
return num;
- if (!sdata->vif.valid_links)
- return num;
-
+ /* non-MLO mode of operation also uses link_id 0 in sdata so it is
+ * safe to directly proceed with the below loop
+ */
for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
link = sdata_dereference(sdata->link[link_id], sdata);
if (!link)
@@ -1409,6 +1421,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
(IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ);
+ link_conf->eht_disable_mcs15 =
+ u8_get_bits(params->eht_oper->params,
+ IEEE80211_EHT_OPER_MCS15_DISABLE);
} else {
link_conf->eht_su_beamformer = false;
link_conf->eht_su_beamformee = false;
@@ -1669,7 +1684,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
kfree(link_conf->ftmr_params);
link_conf->ftmr_params = NULL;
- sdata->vif.mbssid_tx_vif = NULL;
link_conf->bssid_index = 0;
link_conf->nontransmitted = false;
link_conf->ema_ap = false;
@@ -1683,6 +1697,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
ieee80211_free_key_list(local, &keys);
}
+ ieee80211_stop_mbssid(sdata);
+ RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL);
+
link_conf->enable_beacon = false;
sdata->beacon_rate_set = false;
sdata->vif.cfg.ssid_len = 0;
@@ -2066,6 +2083,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
if (params->listen_interval >= 0)
sta->listen_interval = params->listen_interval;
+ if (params->eml_cap_present)
+ sta->sta.eml_cap = params->eml_cap;
+
ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY,
&params->link_sta_params);
if (ret)
@@ -2904,7 +2924,7 @@ static int ieee80211_scan(struct wiphy *wiphy,
* the frames sent while scanning on other channel will be
* lost)
*/
- if (sdata->deflink.u.ap.beacon &&
+ if (ieee80211_num_beaconing_links(sdata) &&
(!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
!(req->flags & NL80211_SCAN_FLAG_AP)))
return -EOPNOTSUPP;
@@ -3700,6 +3720,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss_conf *tx_bss_conf;
struct ieee80211_link_data *link_data;
if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
@@ -3713,25 +3734,24 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id)
return;
}
- /* TODO: MBSSID with MLO changes */
- if (vif->mbssid_tx_vif == vif) {
+ tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf);
+ if (tx_bss_conf == link_data->conf) {
/* Trigger ieee80211_csa_finish() on the non-transmitting
* interfaces when channel switch is received on
* transmitting interface
*/
- struct ieee80211_sub_if_data *iter;
-
- list_for_each_entry_rcu(iter, &local->interfaces, list) {
- if (!ieee80211_sdata_running(iter))
- continue;
+ struct ieee80211_link_data *iter;
- if (iter == sdata || iter->vif.mbssid_tx_vif != vif)
+ for_each_sdata_link(local, iter) {
+ if (iter->sdata == sdata ||
+ rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf)
continue;
- wiphy_work_queue(iter->local->hw.wiphy,
- &iter->deflink.csa.finalize_work);
+ wiphy_work_queue(iter->sdata->local->hw.wiphy,
+ &iter->csa.finalize_work);
}
}
+
wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work);
rcu_read_unlock();
@@ -4833,17 +4853,19 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link,
ieee80211_link_info_change_notify(sdata, link, changed);
- if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) {
- struct ieee80211_sub_if_data *child;
+ if (!link->conf->nontransmitted &&
+ rcu_access_pointer(link->conf->tx_bss_conf)) {
+ struct ieee80211_link_data *tmp;
- list_for_each_entry(child, &sdata->local->interfaces, list) {
- if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) {
- child->vif.bss_conf.he_bss_color.color = color;
- child->vif.bss_conf.he_bss_color.enabled = enable;
- ieee80211_link_info_change_notify(child,
- &child->deflink,
- BSS_CHANGED_HE_BSS_COLOR);
- }
+ for_each_sdata_link(sdata->local, tmp) {
+ if (tmp->sdata == sdata ||
+ rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf)
+ continue;
+
+ tmp->conf->he_bss_color.color = color;
+ tmp->conf->he_bss_color.enabled = enable;
+ ieee80211_link_info_change_notify(tmp->sdata, tmp,
+ BSS_CHANGED_HE_BSS_COLOR);
}
}
}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index c3bfac58151f..3aaf5abf1acc 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -2131,6 +2131,9 @@ void ieee80211_link_release_channel(struct ieee80211_link_data *link)
{
struct ieee80211_sub_if_data *sdata = link->sdata;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ return;
+
lockdep_assert_wiphy(sdata->local->hw.wiphy);
if (rcu_access_pointer(link->conf->chanctx_conf))
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index a8948f4d983e..49061bd4151b 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -152,12 +152,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
p += scnprintf(p,
bufsz + buf - p,
- "target %uus interval %uus ecn %s\n",
- codel_time_to_us(sta->cparams.target),
- codel_time_to_us(sta->cparams.interval),
- sta->cparams.ecn ? "yes" : "no");
- p += scnprintf(p,
- bufsz + buf - p,
"tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 4246d168374f..a6e7b7ba6a01 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -48,7 +48,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt;
u8 *pos;
struct ieee80211_supported_band *sband;
- u32 rate_flags, rates = 0, rates_added = 0;
+ u32 rates = 0, rates_added = 0;
struct beacon_data *presp;
int frame_len;
@@ -90,14 +90,11 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
pos += ifibss->ssid_len;
sband = local->hw.wiphy->bands[chandef->chan->band];
- rate_flags = ieee80211_chandef_rate_flags(chandef);
rates_n = 0;
if (have_higher_than_11mbit)
*have_higher_than_11mbit = false;
for (i = 0; i < sband->n_bitrates; i++) {
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- continue;
if (sband->bitrates[i].bitrate > 110 &&
have_higher_than_11mbit)
*have_higher_than_11mbit = true;
@@ -395,7 +392,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
const struct cfg80211_bss_ies *ies;
enum nl80211_channel_type chan_type;
u64 tsf;
- u32 rate_flags;
lockdep_assert_wiphy(sdata->local->hw.wiphy);
@@ -429,7 +425,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
}
sband = sdata->local->hw.wiphy->bands[cbss->channel->band];
- rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef);
basic_rates = 0;
@@ -439,9 +434,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
for (j = 0; j < sband->n_bitrates; j++) {
int brate;
- if ((rate_flags & sband->bitrates[j].flags)
- != rate_flags)
- continue;
brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5);
if (brate == rate) {
@@ -1717,12 +1709,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
struct cfg80211_ibss_params *params)
{
u64 changed = 0;
- u32 rate_flags;
- struct ieee80211_supported_band *sband;
enum ieee80211_chanctx_mode chanmode;
struct ieee80211_local *local = sdata->local;
int radar_detect_width = 0;
- int i;
int ret;
lockdep_assert_wiphy(local->hw.wiphy);
@@ -1765,12 +1754,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
sdata->u.ibss.last_scan_completed = jiffies;
/* fix basic_rates if channel does not support these rates */
- rate_flags = ieee80211_chandef_rate_flags(&params->chandef);
- sband = local->hw.wiphy->bands[params->chandef.chan->band];
- for (i = 0; i < sband->n_bitrates; i++) {
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- sdata->u.ibss.basic_rates &= ~BIT(i);
- }
memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate,
sizeof(params->mcast_rate));
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index fb05f3cd37ec..30809f0b35f7 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1226,6 +1226,15 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
if ((_link = wiphy_dereference((_local)->hw.wiphy, \
___sdata->link[___link_id])))
+#define for_each_link_data(sdata, __link) \
+ struct ieee80211_sub_if_data *__sdata = sdata; \
+ for (int __link_id = 0; \
+ __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \
+ if ((!(__sdata)->vif.valid_links || \
+ (__sdata)->vif.valid_links & BIT(__link_id)) && \
+ ((__link) = sdata_dereference((__sdata)->link[__link_id], \
+ (__sdata))))
+
static inline int
ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems,
struct cfg80211_rnr_elems *rnr_elems,
@@ -2078,6 +2087,9 @@ static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata
ieee80211_vif_set_links(sdata, 0, 0);
}
+void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata);
+void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata);
+
/* tx handling */
void ieee80211_clear_tx_pending(struct ieee80211_local *local);
void ieee80211_tx_pending(struct tasklet_struct *t);
@@ -2613,7 +2625,7 @@ void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
/* element building in SKBs */
int ieee80211_put_srates_elem(struct sk_buff *skb,
const struct ieee80211_supported_band *sband,
- u32 basic_rates, u32 rate_flags, u32 masked_rates,
+ u32 basic_rates, u32 masked_rates,
u8 element_id);
int ieee80211_put_he_cap(struct sk_buff *skb,
struct ieee80211_sub_if_data *sdata,
@@ -2795,6 +2807,8 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata,
void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt, size_t len);
+void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata);
+
#if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_MAC80211_KUNIT
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 969b3e2c496a..7c27f3cd841c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -485,6 +485,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
case NL80211_IFTYPE_MONITOR:
list_del_rcu(&sdata->u.mntr.list);
break;
+ case NL80211_IFTYPE_AP_VLAN:
+ ieee80211_apvlan_link_clear(sdata);
+ break;
default:
break;
}
@@ -729,30 +732,59 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
ieee80211_add_virtual_monitor(local);
}
-static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata)
+void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata;
- struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif;
+ struct ieee80211_sub_if_data *tx_sdata;
+ struct ieee80211_bss_conf *link_conf, *tx_bss_conf;
+ struct ieee80211_link_data *tx_link, *link;
+ unsigned int link_id;
- if (!tx_vif)
- return;
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ /* Check if any of the links of current sdata is an MBSSID. */
+ for_each_vif_active_link(&sdata->vif, link_conf, link_id) {
+ tx_bss_conf = sdata_dereference(link_conf->tx_bss_conf, sdata);
+ if (!tx_bss_conf)
+ continue;
+
+ tx_sdata = vif_to_sdata(tx_bss_conf->vif);
+ RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL);
- tx_sdata = vif_to_sdata(tx_vif);
- sdata->vif.mbssid_tx_vif = NULL;
+ /* If we are not tx sdata, reset tx sdata's tx_bss_conf to avoid recursion
+ * while closing tx sdata at the end of outer loop below.
+ */
+ if (sdata != tx_sdata) {
+ tx_link = sdata_dereference(tx_sdata->link[tx_bss_conf->link_id],
+ tx_sdata);
+ if (!tx_link)
+ continue;
- list_for_each_entry_safe(non_tx_sdata, tmp_sdata,
- &tx_sdata->local->interfaces, list) {
- if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata &&
- non_tx_sdata->vif.mbssid_tx_vif == tx_vif &&
- ieee80211_sdata_running(non_tx_sdata)) {
- non_tx_sdata->vif.mbssid_tx_vif = NULL;
- dev_close(non_tx_sdata->wdev.netdev);
+ RCU_INIT_POINTER(tx_link->conf->tx_bss_conf, NULL);
}
- }
- if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) {
- tx_sdata->vif.mbssid_tx_vif = NULL;
- dev_close(tx_sdata->wdev.netdev);
+ /* loop through sdatas to find if any of their links
+ * belong to same MBSSID set as the one getting deleted.
+ */
+ for_each_sdata_link(tx_sdata->local, link) {
+ struct ieee80211_sub_if_data *link_sdata = link->sdata;
+
+ if (link_sdata == sdata || link_sdata == tx_sdata ||
+ rcu_access_pointer(link->conf->tx_bss_conf) != tx_bss_conf)
+ continue;
+
+ RCU_INIT_POINTER(link->conf->tx_bss_conf, NULL);
+
+ /* Remove all links of matching MLD until dynamic link
+ * removal can be supported.
+ */
+ cfg80211_stop_iface(link_sdata->wdev.wiphy, &link_sdata->wdev,
+ GFP_KERNEL);
+ }
+
+ /* If we are not tx sdata, remove links of tx sdata and proceed */
+ if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata))
+ cfg80211_stop_iface(tx_sdata->wdev.wiphy,
+ &tx_sdata->wdev, GFP_KERNEL);
}
}
@@ -760,21 +792,25 @@ static int ieee80211_stop(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- /* close dependent VLAN and MBSSID interfaces before locking wiphy */
+ /* close dependent VLAN interfaces before locking wiphy */
if (sdata->vif.type == NL80211_IFTYPE_AP) {
struct ieee80211_sub_if_data *vlan, *tmpsdata;
list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
u.vlan.list)
dev_close(vlan->dev);
-
- ieee80211_stop_mbssid(sdata);
}
guard(wiphy)(sdata->local->hw.wiphy);
wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work);
+ /* Close the dependent MBSSID interfaces with wiphy lock as we may be
+ * terminating its partner links too in case of MLD.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ ieee80211_stop_mbssid(sdata);
+
ieee80211_do_stop(sdata, true);
return 0;
@@ -1268,6 +1304,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
sdata->crypto_tx_tailroom_needed_cnt +=
master->crypto_tx_tailroom_needed_cnt;
+ ieee80211_apvlan_link_setup(sdata);
+
break;
}
case NL80211_IFTYPE_AP:
@@ -1324,7 +1362,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
case NL80211_IFTYPE_AP_VLAN:
/* no need to tell driver, but set carrier and chanctx */
if (sdata->bss->active) {
- ieee80211_link_vlan_copy_chanctx(&sdata->deflink);
+ struct ieee80211_link_data *link;
+
+ for_each_link_data(sdata, link) {
+ ieee80211_link_vlan_copy_chanctx(link);
+ }
+
netif_carrier_on(dev);
ieee80211_set_vif_encap_ops(sdata);
} else {
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 58a76bcd6ae6..d40c2bd3b50b 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -12,6 +12,71 @@
#include "key.h"
#include "debugfs_netdev.h"
+static void ieee80211_update_apvlan_links(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_sub_if_data *vlan;
+ struct ieee80211_link_data *link;
+ u16 ap_bss_links = sdata->vif.valid_links;
+ u16 new_links, vlan_links;
+ unsigned long add;
+
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+ int link_id;
+
+ if (!vlan)
+ continue;
+
+ /* No support for 4addr with MLO yet */
+ if (vlan->wdev.use_4addr)
+ return;
+
+ vlan_links = vlan->vif.valid_links;
+
+ new_links = ap_bss_links;
+
+ add = new_links & ~vlan_links;
+ if (!add)
+ continue;
+
+ ieee80211_vif_set_links(vlan, add, 0);
+
+ for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
+ link = sdata_dereference(vlan->link[link_id], vlan);
+ ieee80211_link_vlan_copy_chanctx(link);
+ }
+ }
+}
+
+void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_sub_if_data *ap_bss = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ u16 new_links = ap_bss->vif.valid_links;
+ unsigned long add;
+ int link_id;
+
+ if (!ap_bss->vif.valid_links)
+ return;
+
+ add = new_links;
+ for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) {
+ sdata->wdev.valid_links |= BIT(link_id);
+ ether_addr_copy(sdata->wdev.links[link_id].addr,
+ ap_bss->wdev.links[link_id].addr);
+ }
+
+ ieee80211_vif_set_links(sdata, new_links, 0);
+}
+
+void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata)
+{
+ if (!sdata->wdev.valid_links)
+ return;
+
+ sdata->wdev.valid_links = 0;
+ ieee80211_vif_clear_links(sdata);
+}
+
void ieee80211_link_setup(struct ieee80211_link_data *link)
{
if (link->sdata->vif.type == NL80211_IFTYPE_STATION)
@@ -31,6 +96,17 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf);
rcu_assign_pointer(sdata->link[link_id], link);
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ struct ieee80211_sub_if_data *ap_bss;
+ struct ieee80211_bss_conf *ap_bss_conf;
+
+ ap_bss = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ ap_bss_conf = sdata_dereference(ap_bss->vif.link_conf[link_id],
+ ap_bss);
+ memcpy(link_conf, ap_bss_conf, sizeof(*link_conf));
+ }
+
link->sdata = sdata;
link->link_id = link_id;
link->conf = link_conf;
@@ -54,6 +130,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
if (!deflink) {
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
ether_addr_copy(link_conf->addr,
sdata->wdev.links[link_id].addr);
link_conf->bssid = link_conf->addr;
@@ -177,6 +254,7 @@ static void ieee80211_set_vif_links_bitmaps(struct ieee80211_sub_if_data *sdata,
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
/* in an AP all links are always active */
sdata->vif.active_links = valid_links;
@@ -278,12 +356,16 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata,
ieee80211_set_vif_links_bitmaps(sdata, new_links, dormant_links);
/* tell the driver */
- ret = drv_change_vif_links(sdata->local, sdata,
- old_links & old_active,
- new_links & sdata->vif.active_links,
- old);
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
+ ret = drv_change_vif_links(sdata->local, sdata,
+ old_links & old_active,
+ new_links & sdata->vif.active_links,
+ old);
if (!new_links)
ieee80211_debugfs_recreate_netdev(sdata, false);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ ieee80211_update_apvlan_links(sdata);
}
if (ret) {
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 7257f5610af5..5cc56d578048 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/unaligned.h>
+#include <net/sock.h>
#include "ieee80211_i.h"
#include "mesh.h"
#include "wme.h"
@@ -776,7 +777,7 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata,
if (ethertype < ETH_P_802_3_MIN)
return false;
- if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)
+ if (sk_requests_wifi_status(skb->sk))
return false;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -956,13 +957,10 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
u8 *pos;
struct ieee80211_sub_if_data *sdata;
int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon);
- u32 rate_flags;
sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh);
sband = ieee80211_get_sband(sdata);
- rate_flags =
- ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
ie_len_he_cap = ieee80211_ie_len_he_cap(sdata);
ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata);
@@ -1091,7 +1089,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
if (ieee80211_put_srates_elem(skb, sband,
sdata->vif.bss_conf.basic_rates,
- rate_flags, 0, WLAN_EID_SUPP_RATES) ||
+ 0, WLAN_EID_SUPP_RATES) ||
mesh_add_ds_params_ie(sdata, skb))
goto out_free;
@@ -1104,7 +1102,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
if (ieee80211_put_srates_elem(skb, sband,
sdata->vif.bss_conf.basic_rates,
- rate_flags, 0, WLAN_EID_EXT_SUPP_RATES) ||
+ 0, WLAN_EID_EXT_SUPP_RATES) ||
mesh_add_rsn_ie(sdata, skb) ||
mesh_add_ht_cap_ie(sdata, skb) ||
mesh_add_ht_oper_ie(sdata, skb) ||
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index c94a9c7ca960..91444301a84a 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -636,7 +636,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
mesh_path_add_gate(mpath);
}
rcu_read_unlock();
- } else {
+ } else if (ifmsh->mshcfg.dot11MeshForwarding) {
rcu_read_lock();
mpath = mesh_path_lookup(sdata, target_addr);
if (mpath) {
@@ -654,6 +654,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
}
}
rcu_read_unlock();
+ } else {
+ forward = false;
}
if (reply) {
@@ -671,7 +673,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
}
}
- if (forward && ifmsh->mshcfg.dot11MeshForwarding) {
+ if (forward) {
u32 preq_id;
u8 hopcount;
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 9f9cb5af0a97..0319674be832 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -22,7 +22,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath);
static u32 mesh_table_hash(const void *addr, u32 len, u32 seed)
{
/* Use last four bytes of hw addr as hash index */
- return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed);
+ return jhash_1word(get_unaligned((u32 *)((u8 *)addr + 2)), seed);
}
static const struct rhashtable_params mesh_rht_params = {
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 96e0a861886a..9c6a2b342170 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -264,7 +264,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
if (action != WLAN_SP_MESH_PEERING_CLOSE) {
struct ieee80211_supported_band *sband;
- u32 rate_flags, basic_rates;
+ u32 basic_rates;
sband = ieee80211_get_sband(sdata);
if (!sband) {
@@ -280,16 +280,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
put_unaligned_le16(sta->sta.aid, pos);
}
- rate_flags =
- ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
basic_rates = sdata->vif.bss_conf.basic_rates;
if (ieee80211_put_srates_elem(skb, sband, basic_rates,
- rate_flags, 0,
- WLAN_EID_SUPP_RATES) ||
+ 0, WLAN_EID_SUPP_RATES) ||
ieee80211_put_srates_elem(skb, sband, basic_rates,
- rate_flags, 0,
- WLAN_EID_EXT_SUPP_RATES) ||
+ 0, WLAN_EID_EXT_SUPP_RATES) ||
mesh_add_rsn_ie(sdata, skb) ||
mesh_add_meshid_ie(sdata, skb) ||
mesh_add_meshconf_ie(sdata, skb))
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 35eaf0812c5b..b84150dbfe8c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1525,9 +1525,9 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local,
rates = ~0;
}
- ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates,
+ ieee80211_put_srates_elem(skb, sband, 0, ~rates,
WLAN_EID_SUPP_RATES);
- ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates,
+ ieee80211_put_srates_elem(skb, sband, 0, ~rates,
WLAN_EID_EXT_SUPP_RATES);
}
diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c
index 6da39c864f45..96584b39215e 100644
--- a/net/mac80211/parse.c
+++ b/net/mac80211/parse.c
@@ -1101,7 +1101,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width,
const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates)
{
- u32 rate_flags = ieee80211_chanwidth_rate_flags(width);
struct ieee80211_rate *br;
int brate, rate, i, j, count = 0;
@@ -1112,8 +1111,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width,
for (j = 0; j < sband->n_bitrates; j++) {
br = &sband->bitrates[j];
- if ((rate_flags & br->flags) != rate_flags)
- continue;
brate = DIV_ROUND_UP(br->bitrate, 5);
if (brate == rate) {
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 0d056db9f81e..3cb2ad6d0b28 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -368,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw,
struct ieee80211_tx_info *info,
u32 rate_mask)
{
+ u32 rate_flags = 0;
int i;
- u32 rate_flags =
- ieee80211_chandef_rate_flags(&hw->conf.chandef);
if (sband->band == NL80211_BAND_S1GHZ) {
info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS;
@@ -778,14 +777,9 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata,
u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN],
u16 vht_mask[NL80211_VHT_NSS_MAX])
{
- u32 i, flags;
+ u32 i;
*mask = sdata->rc_rateidx_mask[sband->band];
- flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
- for (i = 0; i < sband->n_bitrates; i++) {
- if ((flags & sband->bitrates[i].flags) != flags)
- *mask &= ~BIT(i);
- }
if (*mask == (1 << sband->n_bitrates) - 1 &&
!sdata->rc_has_mcs_mask[sband->band] &&
@@ -990,8 +984,6 @@ int rate_control_set_rates(struct ieee80211_hw *hw,
if (sta->uploaded)
drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta);
- ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta));
-
return 0;
}
EXPORT_SYMBOL(rate_control_set_rates);
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 706cbc99f718..f66910013218 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -1873,16 +1873,13 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
static void
minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband,
- const s16 *bitrates, int n_rates, u32 rate_flags)
+ const s16 *bitrates, int n_rates)
{
int i, j;
for (i = 0; i < sband->n_bitrates; i++) {
struct ieee80211_rate *rate = &sband->bitrates[i];
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- continue;
-
for (j = 0; j < n_rates; j++) {
if (rate->bitrate != bitrates[j])
continue;
@@ -1898,7 +1895,6 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp)
{
static const s16 bitrates[4] = { 10, 20, 55, 110 };
struct ieee80211_supported_band *sband;
- u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates));
sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
@@ -1908,8 +1904,7 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp)
BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates));
minstrel_ht_fill_rate_array(mp->cck_rates, sband,
minstrel_cck_bitrates,
- ARRAY_SIZE(minstrel_cck_bitrates),
- rate_flags);
+ ARRAY_SIZE(minstrel_cck_bitrates));
}
static void
@@ -1917,7 +1912,6 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band)
{
static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 };
struct ieee80211_supported_band *sband;
- u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band]));
sband = mp->hw->wiphy->bands[band];
@@ -1927,8 +1921,7 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band)
BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates));
minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband,
minstrel_ofdm_bitrates,
- ARRAY_SIZE(minstrel_ofdm_bitrates),
- rate_flags);
+ ARRAY_SIZE(minstrel_ofdm_bitrates));
}
static void *
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index cb7079071885..7b8da40a912d 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -240,6 +240,9 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel,
u32 scan_flags, const u8 *da)
{
+ struct ieee80211_link_data *link_sdata;
+ u8 link_id;
+
if (!sdata)
return false;
@@ -251,7 +254,20 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata,
if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR)
return true;
- return ether_addr_equal(da, sdata->vif.addr);
+
+ if (ether_addr_equal(da, sdata->vif.addr))
+ return true;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ link_sdata = rcu_dereference(sdata->link[link_id]);
+ if (!link_sdata)
+ continue;
+
+ if (ether_addr_equal(da, link_sdata->conf->addr))
+ return true;
+ }
+
+ return false;
}
void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb)
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index c6015cd00372..7422888d3640 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -147,14 +147,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
u32 control_freq, center_freq1, center_freq2;
enum nl80211_chan_width chan_width;
- struct {
- struct ieee80211_he_operation _oper;
- struct ieee80211_he_6ghz_oper _6ghz_oper;
- } __packed he;
- struct {
- struct ieee80211_eht_operation _oper;
- struct ieee80211_eht_operation_info _oper_info;
- } __packed eht;
+ DEFINE_RAW_FLEX(struct ieee80211_he_operation, he, optional,
+ sizeof(struct ieee80211_he_6ghz_oper));
+ struct ieee80211_he_6ghz_oper *_6ghz_oper =
+ (struct ieee80211_he_6ghz_oper *)he->optional;
+ DEFINE_RAW_FLEX(struct ieee80211_eht_operation, eht, optional,
+ sizeof(struct ieee80211_eht_operation_info));
+ struct ieee80211_eht_operation_info *_oper_info =
+ (struct ieee80211_eht_operation_info *)eht->optional;
const struct ieee80211_eht_operation *eht_oper;
if (conn->mode < IEEE80211_CONN_MODE_HE) {
@@ -167,38 +167,38 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata,
center_freq2 = chandef->center_freq2;
chan_width = chandef->width;
- he._oper.he_oper_params =
+ he->he_oper_params =
le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO);
- he._6ghz_oper.primary =
+ _6ghz_oper->primary =
ieee80211_frequency_to_channel(control_freq);
- he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1);
- he._6ghz_oper.ccfs1 = center_freq2 ?
+ _6ghz_oper->ccfs0 = ieee80211_frequency_to_channel(center_freq1);
+ _6ghz_oper->ccfs1 = center_freq2 ?
ieee80211_frequency_to_channel(center_freq2) : 0;
switch (chan_width) {
case NL80211_CHAN_WIDTH_320:
- he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0;
- he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -16 : 16;
- he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ;
+ _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0;
+ _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -16 : 16;
+ _6ghz_oper->control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ;
break;
case NL80211_CHAN_WIDTH_160:
- he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0;
- he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8;
+ _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0;
+ _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -8 : 8;
fallthrough;
case NL80211_CHAN_WIDTH_80P80:
- he._6ghz_oper.control =
+ _6ghz_oper->control =
IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ;
break;
case NL80211_CHAN_WIDTH_80:
- he._6ghz_oper.control =
+ _6ghz_oper->control =
IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ;
break;
case NL80211_CHAN_WIDTH_40:
- he._6ghz_oper.control =
+ _6ghz_oper->control =
IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ;
break;
default:
- he._6ghz_oper.control =
+ _6ghz_oper->control =
IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ;
break;
}
@@ -206,15 +206,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata,
if (conn->mode < IEEE80211_CONN_MODE_EHT) {
eht_oper = NULL;
} else {
- eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT;
- eht._oper_info.control = he._6ghz_oper.control;
- eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0;
- eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1;
- eht_oper = &eht._oper;
+ eht->params = IEEE80211_EHT_OPER_INFO_PRESENT;
+ _oper_info->control = _6ghz_oper->control;
+ _oper_info->ccfs0 = _6ghz_oper->ccfs0;
+ _oper_info->ccfs1 = _6ghz_oper->ccfs1;
+ eht_oper = eht;
}
- if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper,
- eht_oper, chandef))
+ if (!ieee80211_chandef_he_6ghz_oper(local, he, eht_oper, chandef))
chandef->chan = NULL;
}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 248e1f63bf73..84b18be1f0b1 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -18,7 +18,6 @@
#include <linux/timer.h>
#include <linux/rtnetlink.h>
-#include <net/codel.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
@@ -701,12 +700,6 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata,
}
}
- sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD;
- sta->cparams.target = MS2TIME(20);
- sta->cparams.interval = MS2TIME(100);
- sta->cparams.ecn = true;
- sta->cparams.ce_threshold_selector = 0;
- sta->cparams.ce_threshold_mask = 0;
sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
@@ -2905,27 +2898,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta)
return sta->deflink.status_stats.last_ack;
}
-static void sta_update_codel_params(struct sta_info *sta, u32 thr)
-{
- if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) {
- sta->cparams.target = MS2TIME(50);
- sta->cparams.interval = MS2TIME(300);
- sta->cparams.ecn = false;
- } else {
- sta->cparams.target = MS2TIME(20);
- sta->cparams.interval = MS2TIME(100);
- sta->cparams.ecn = true;
- }
-}
-
-void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
- u32 thr)
-{
- struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-
- sta_update_codel_params(sta, thr);
-}
-
int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 07b7ec39a52f..7a95d8d34fca 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -466,14 +466,6 @@ struct ieee80211_fragment_cache {
unsigned int next;
};
-/*
- * The bandwidth threshold below which the per-station CoDel parameters will be
- * scaled to be more lenient (to prevent starvation of slow stations). This
- * value will be scaled by the number of active stations when it is being
- * applied.
- */
-#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */
-
/**
* struct link_sta_info - Link STA information
* All link specific sta info are stored here for reference. This can be
@@ -626,7 +618,6 @@ struct link_sta_info {
* @sta: station information we share with the driver
* @sta_state: duplicates information about station state (for debug)
* @rcu_head: RCU head used for freeing this station struct
- * @cparams: CoDel parameters for this station.
* @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
* @amsdu_mesh_control: track the mesh A-MSDU format used by the peer:
*
@@ -717,8 +708,6 @@ struct sta_info {
struct dentry *debugfs_dir;
#endif
- struct codel_params cparams;
-
u8 reserved_tid;
s8 amsdu_mesh_control;
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 2f92e7c7f203..94714f8ffd22 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -382,8 +382,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link,
if (WARN_ON_ONCE(!sband))
return;
- ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_SUPP_RATES);
- ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_EXT_SUPP_RATES);
+ ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_SUPP_RATES);
+ ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_EXT_SUPP_RATES);
ieee80211_tdls_add_supp_channels(sdata, skb);
/* add any custom IEs that go before Extended Capabilities */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 20179db88c4a..d8d4f3d7d7f2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -26,6 +26,7 @@
#include <net/codel_impl.h>
#include <linux/unaligned.h>
#include <net/fq_impl.h>
+#include <net/sock.h>
#include <net/gso.h>
#include "ieee80211_i.h"
@@ -49,19 +50,11 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
struct ieee80211_supported_band *sband;
struct ieee80211_hdr *hdr;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
- struct ieee80211_chanctx_conf *chanctx_conf;
- u32 rate_flags = 0;
/* assume HW handles this */
if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
return 0;
- rcu_read_lock();
- chanctx_conf = rcu_dereference(tx->sdata->vif.bss_conf.chanctx_conf);
- if (chanctx_conf)
- rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
- rcu_read_unlock();
-
/* uh huh? */
if (WARN_ON_ONCE(tx->rate.idx < 0))
return 0;
@@ -138,9 +131,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
if (r->bitrate > txrate->bitrate)
break;
- if ((rate_flags & r->flags) != rate_flags)
- continue;
-
if (tx->sdata->vif.bss_conf.basic_rates & BIT(i))
rate = r->bitrate;
@@ -1402,16 +1392,9 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq,
local = container_of(fq, struct ieee80211_local, fq);
txqi = container_of(tin, struct txq_info, tin);
+ cparams = &local->cparams;
cstats = &txqi->cstats;
- if (txqi->txq.sta) {
- struct sta_info *sta = container_of(txqi->txq.sta,
- struct sta_info, sta);
- cparams = &sta->cparams;
- } else {
- cparams = &local->cparams;
- }
-
if (flow == &tin->default_flow)
cvars = &txqi->def_cvars;
else
@@ -2876,8 +2859,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
}
if (unlikely(!multicast &&
- ((skb->sk &&
- skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) ||
+ (sk_requests_wifi_status(skb->sk) ||
ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS)))
info_id = ieee80211_store_ack_skb(local, skb, &info_flags,
cookie);
@@ -3774,7 +3756,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
return false;
/* don't handle TX status request here either */
- if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)
+ if (sk_requests_wifi_status(skb->sk))
return false;
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
@@ -4526,8 +4508,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
IEEE80211_TX_CTRL_MLO_LINK_UNSPEC,
NULL);
} else if (ieee80211_vif_is_mld(&sdata->vif) &&
- sdata->vif.type == NL80211_IFTYPE_AP &&
- !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) {
+ ((sdata->vif.type == NL80211_IFTYPE_AP &&
+ !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) ||
+ (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ !sdata->wdev.use_4addr))) {
ieee80211_mlo_multicast_tx(dev, skb);
} else {
normal:
@@ -4664,8 +4648,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info));
}
- if (unlikely(skb->sk &&
- skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) {
+ if (unlikely(sk_requests_wifi_status(skb->sk))) {
info->status_data = ieee80211_store_ack_skb(local, skb,
&info->flags, NULL);
if (info->status_data)
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index dec6e16b8c7d..27d414efa3fd 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1204,7 +1204,6 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb,
struct ieee80211_supported_band *sband;
int i, err;
size_t noffset;
- u32 rate_flags;
bool have_80mhz = false;
*offset = 0;
@@ -1213,13 +1212,11 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb,
if (WARN_ON_ONCE(!sband))
return 0;
- rate_flags = ieee80211_chandef_rate_flags(chandef);
-
/* For direct scan add S1G IE and consider its override bits */
if (band == NL80211_BAND_S1GHZ)
return ieee80211_put_s1g_cap(skb, &sband->s1g_cap);
- err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags,
+ err = ieee80211_put_srates_elem(skb, sband, 0,
~rate_mask, WLAN_EID_SUPP_RATES);
if (err)
return err;
@@ -1241,7 +1238,7 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb,
*offset = noffset;
}
- err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags,
+ err = ieee80211_put_srates_elem(skb, sband, 0,
~rate_mask, WLAN_EID_EXT_SUPP_RATES);
if (err)
return err;
@@ -1522,16 +1519,13 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_supported_band *sband;
size_t num_rates;
- u32 supp_rates, rate_flags;
+ u32 supp_rates;
int i, j;
sband = sdata->local->hw.wiphy->bands[band];
if (WARN_ON(!sband))
return 1;
- rate_flags =
- ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
-
num_rates = sband->n_bitrates;
supp_rates = 0;
for (i = 0; i < elems->supp_rates_len +
@@ -1551,12 +1545,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
continue;
for (j = 0; j < num_rates; j++) {
- int brate;
- if ((rate_flags & sband->bitrates[j].flags)
- != rate_flags)
- continue;
-
- brate = sband->bitrates[j].bitrate;
+ int brate = sband->bitrates[j].bitrate;
if (brate == own_rate) {
supp_rates |= BIT(j);
@@ -3223,15 +3212,13 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
int ieee80211_put_srates_elem(struct sk_buff *skb,
const struct ieee80211_supported_band *sband,
- u32 basic_rates, u32 rate_flags, u32 masked_rates,
+ u32 basic_rates, u32 masked_rates,
u8 element_id)
{
u8 i, rates, skip;
rates = 0;
for (i = 0; i < sband->n_bitrates; i++) {
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- continue;
if (masked_rates & BIT(i))
continue;
rates++;
@@ -3257,8 +3244,6 @@ int ieee80211_put_srates_elem(struct sk_buff *skb,
int rate;
u8 basic;
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- continue;
if (masked_rates & BIT(i))
continue;
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 7c0dcf3df319..4d404edd7446 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -120,8 +120,8 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb)
int ifindex = 0, rc;
/* Filter by ifindex if a header is provided */
- if (cb->nlh->nlmsg_len >= nlmsg_msg_size(sizeof(*hdr))) {
- hdr = nlmsg_data(cb->nlh);
+ hdr = nlmsg_payload(cb->nlh, sizeof(*hdr));
+ if (hdr) {
ifindex = hdr->ifa_index;
} else {
if (cb->strict_check) {
diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c
index 590f642413e4..05b899f22d90 100644
--- a/net/mctp/neigh.c
+++ b/net/mctp/neigh.c
@@ -250,7 +250,10 @@ static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb)
int idx;
} *cbctx = (void *)cb->ctx;
- ndmsg = nlmsg_data(cb->nlh);
+ ndmsg = nlmsg_payload(cb->nlh, sizeof(*ndmsg));
+ if (!ndmsg)
+ return -EINVAL;
+
req_ifindex = ndmsg->ndm_ifindex;
idx = 0;
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 1f63b32d76d6..d536c97144e9 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2095,12 +2095,12 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
struct rtmsg *rtm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
return -EINVAL;
}
- rtm = nlmsg_data(nlh);
if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type ||
rtm->rtm_flags) {
@@ -2288,7 +2288,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG_MOD(extack,
"Invalid header for get route request");
return -EINVAL;
@@ -2298,7 +2299,6 @@ static int mpls_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_mpls_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) ||
rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table ||
rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) {
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 19eb9292bd60..0c24545f0e8d 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -28,6 +28,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED),
SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR),
SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR),
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 128282982843..250c6b77977e 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -23,6 +23,7 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */
MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */
MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */
MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 31747f974941..1306d4dc287b 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -151,10 +151,13 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
+ bool ret;
entry = mptcp_pm_del_add_timer(msk, addr, false);
+ ret = entry;
kfree(entry);
- return entry;
+
+ return ret;
}
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 44f7ab463d75..0749733ea897 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -46,7 +46,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm
static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
-DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
@@ -3142,9 +3144,9 @@ static int mptcp_disconnect(struct sock *sk, int flags)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
- unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+ struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk);
- return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+ return &msk6->np;
}
static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk)
@@ -3527,8 +3529,10 @@ bool mptcp_finish_join(struct sock *ssk)
return true;
}
- if (!mptcp_pm_allow_new_subflow(msk))
+ if (!mptcp_pm_allow_new_subflow(msk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED);
goto err_prohibited;
+ }
/* If we can't acquire msk socket lock here, let the release callback
* handle it
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d409586b5977..3dd11dd3ba16 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -479,6 +479,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
struct mptcp_delegated_action {
struct napi_struct napi;
+ local_lock_t bh_lock;
struct list_head head;
};
@@ -670,9 +671,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow,
if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node)))
return;
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
delegated = this_cpu_ptr(&mptcp_delegated_actions);
schedule = list_empty(&delegated->head);
list_add_tail(&subflow->delegated_node, &delegated->head);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
sock_hold(mptcp_subflow_tcp_sock(subflow));
if (schedule)
napi_schedule(&delegated->napi);
@@ -684,11 +687,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
{
struct mptcp_subflow_context *ret;
- if (list_empty(&delegated->head))
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
+ if (list_empty(&delegated->head)) {
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return NULL;
+ }
ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
list_del_init(&ret->delegated_node);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return ret;
}
@@ -744,6 +751,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_validate_scheduler(struct mptcp_sched_ops *sched);
int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
void mptcp_sched_init(void);
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index c16c6fbd4ba2..1e59072d478c 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -16,8 +16,7 @@
static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);
-static int mptcp_sched_default_get_send(struct mptcp_sock *msk,
- struct mptcp_sched_data *data)
+static int mptcp_sched_default_get_send(struct mptcp_sock *msk)
{
struct sock *ssk;
@@ -29,8 +28,7 @@ static int mptcp_sched_default_get_send(struct mptcp_sock *msk,
return 0;
}
-static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk,
- struct mptcp_sched_data *data)
+static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk)
{
struct sock *ssk;
@@ -84,10 +82,23 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen)
rcu_read_unlock();
}
-int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+int mptcp_validate_scheduler(struct mptcp_sched_ops *sched)
{
- if (!sched->get_send)
+ if (!sched->get_send) {
+ pr_err("%s does not implement required ops\n", sched->name);
return -EINVAL;
+ }
+
+ return 0;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ int ret;
+
+ ret = mptcp_validate_scheduler(sched);
+ if (ret)
+ return ret;
spin_lock(&mptcp_sched_list_lock);
if (mptcp_sched_find(sched->name)) {
@@ -157,7 +168,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
int mptcp_sched_get_send(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- struct mptcp_sched_data *data = NULL;
msk_owned_by_me(msk);
@@ -178,14 +188,13 @@ int mptcp_sched_get_send(struct mptcp_sock *msk)
}
if (msk->sched == &mptcp_sched_default || !msk->sched)
- return mptcp_sched_default_get_send(msk, data);
- return msk->sched->get_send(msk, data);
+ return mptcp_sched_default_get_send(msk);
+ return msk->sched->get_send(msk);
}
int mptcp_sched_get_retrans(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- struct mptcp_sched_data *data = NULL;
msk_owned_by_me(msk);
@@ -199,8 +208,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk)
}
if (msk->sched == &mptcp_sched_default || !msk->sched)
- return mptcp_sched_default_get_retrans(msk, data);
+ return mptcp_sched_default_get_retrans(msk);
if (msk->sched->get_retrans)
- return msk->sched->get_retrans(msk, data);
- return msk->sched->get_send(msk, data);
+ return msk->sched->get_retrans(msk);
+ return msk->sched->get_send(msk);
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 24c2de1891bd..15613d691bfe 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -247,6 +247,7 @@ again:
if (unlikely(req->syncookie)) {
if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
return -EPERM;
}
@@ -745,15 +746,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op
EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);
/* validate hmac received in third ACK */
-static bool subflow_hmac_valid(const struct request_sock *req,
+static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req,
const struct mptcp_options_received *mp_opt)
{
- const struct mptcp_subflow_request_sock *subflow_req;
+ struct mptcp_sock *msk = subflow_req->msk;
u8 hmac[SHA256_DIGEST_SIZE];
- struct mptcp_sock *msk;
-
- subflow_req = mptcp_subflow_rsk(req);
- msk = subflow_req->msk;
subflow_generate_hmac(READ_ONCE(msk->remote_key),
READ_ONCE(msk->local_key),
@@ -899,13 +896,14 @@ create_child:
goto dispose_child;
}
- if (!subflow_hmac_valid(req, &mp_opt)) {
+ if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
}
if (!mptcp_can_accept_new_subflow(owner)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED);
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
goto dispose_child;
}
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 4e0842df5234..e76c6de0c784 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter {
};
struct ncsi_channel_stats {
- u32 hnc_cnt_hi; /* Counter cleared */
- u32 hnc_cnt_lo; /* Counter cleared */
- u32 hnc_rx_bytes; /* Rx bytes */
- u32 hnc_tx_bytes; /* Tx bytes */
- u32 hnc_rx_uc_pkts; /* Rx UC packets */
- u32 hnc_rx_mc_pkts; /* Rx MC packets */
- u32 hnc_rx_bc_pkts; /* Rx BC packets */
- u32 hnc_tx_uc_pkts; /* Tx UC packets */
- u32 hnc_tx_mc_pkts; /* Tx MC packets */
- u32 hnc_tx_bc_pkts; /* Tx BC packets */
+ u64 hnc_cnt; /* Counter cleared */
+ u64 hnc_rx_bytes; /* Rx bytes */
+ u64 hnc_tx_bytes; /* Tx bytes */
+ u64 hnc_rx_uc_pkts; /* Rx UC packets */
+ u64 hnc_rx_mc_pkts; /* Rx MC packets */
+ u64 hnc_rx_bc_pkts; /* Rx BC packets */
+ u64 hnc_tx_uc_pkts; /* Tx UC packets */
+ u64 hnc_tx_mc_pkts; /* Tx MC packets */
+ u64 hnc_tx_bc_pkts; /* Tx BC packets */
u32 hnc_fcs_err; /* FCS errors */
u32 hnc_align_err; /* Alignment errors */
u32 hnc_false_carrier; /* False carrier detection */
@@ -181,7 +180,7 @@ struct ncsi_channel_stats {
u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */
u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */
u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */
- u32 hnc_rx_valid_bytes; /* Rx valid bytes */
+ u64 hnc_rx_valid_bytes; /* Rx valid bytes */
u32 hnc_rx_runt_pkts; /* Rx error runt packets */
u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */
u32 ncsi_rx_cmds; /* Rx NCSI commands */
@@ -323,7 +322,7 @@ struct ncsi_dev_priv {
#define NCSI_DEV_RESHUFFLE 4
#define NCSI_DEV_RESET 8 /* Reset state of NC */
unsigned int gma_flag; /* OEM GMA flag */
- struct sockaddr pending_mac; /* MAC address received from GMA */
+ struct sockaddr_storage pending_mac; /* MAC address received from GMA */
spinlock_t lock; /* Protect the NCSI device */
unsigned int package_probe_id;/* Current ID during probe */
unsigned int package_num; /* Number of packages */
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index f2f3b5c1b941..24edb2737972 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt {
/* Get Controller Packet Statistics */
struct ncsi_rsp_gcps_pkt {
struct ncsi_rsp_pkt_hdr rsp; /* Response header */
- __be32 cnt_hi; /* Counter cleared */
- __be32 cnt_lo; /* Counter cleared */
- __be32 rx_bytes; /* Rx bytes */
- __be32 tx_bytes; /* Tx bytes */
- __be32 rx_uc_pkts; /* Rx UC packets */
- __be32 rx_mc_pkts; /* Rx MC packets */
- __be32 rx_bc_pkts; /* Rx BC packets */
- __be32 tx_uc_pkts; /* Tx UC packets */
- __be32 tx_mc_pkts; /* Tx MC packets */
- __be32 tx_bc_pkts; /* Tx BC packets */
+ __be64 cnt; /* Counter cleared */
+ __be64 rx_bytes; /* Rx bytes */
+ __be64 tx_bytes; /* Tx bytes */
+ __be64 rx_uc_pkts; /* Rx UC packets */
+ __be64 rx_mc_pkts; /* Rx MC packets */
+ __be64 rx_bc_pkts; /* Rx BC packets */
+ __be64 tx_uc_pkts; /* Tx UC packets */
+ __be64 tx_mc_pkts; /* Tx MC packets */
+ __be64 tx_bc_pkts; /* Tx BC packets */
__be32 fcs_err; /* FCS errors */
__be32 align_err; /* Alignment errors */
__be32 false_carrier; /* False carrier detection */
@@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt {
__be32 tx_1023_frames; /* Tx 512-1023 bytes frames */
__be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */
__be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */
- __be32 rx_valid_bytes; /* Rx valid bytes */
+ __be64 rx_valid_bytes; /* Rx valid bytes */
__be32 rx_runt_pkts; /* Rx error runt packets */
__be32 rx_jabber_pkts; /* Rx error jabber packets */
__be32 checksum; /* Checksum */
-};
+} __packed __aligned(4);
/* Get NCSI Statistics */
struct ncsi_rsp_gns_pkt {
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 4a8ce2949fae..472cc68ad86f 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -628,7 +628,7 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id)
{
struct ncsi_dev_priv *ndp = nr->ndp;
- struct sockaddr *saddr = &ndp->pending_mac;
+ struct sockaddr_storage *saddr = &ndp->pending_mac;
struct net_device *ndev = ndp->ndev.dev;
struct ncsi_rsp_oem_pkt *rsp;
u32 mac_addr_off = 0;
@@ -644,11 +644,11 @@ static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id)
else if (mfr_id == NCSI_OEM_MFR_INTEL_ID)
mac_addr_off = INTEL_MAC_ADDR_OFFSET;
- saddr->sa_family = ndev->type;
- memcpy(saddr->sa_data, &rsp->data[mac_addr_off], ETH_ALEN);
+ saddr->ss_family = ndev->type;
+ memcpy(saddr->__data, &rsp->data[mac_addr_off], ETH_ALEN);
if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID)
- eth_addr_inc((u8 *)saddr->sa_data);
- if (!is_valid_ether_addr((const u8 *)saddr->sa_data))
+ eth_addr_inc(saddr->__data);
+ if (!is_valid_ether_addr(saddr->__data))
return -ENXIO;
/* Set the flag for GMA command which should only be called once */
@@ -926,16 +926,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr)
/* Update HNC's statistics */
ncs = &nc->stats;
- ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi);
- ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo);
- ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes);
- ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes);
- ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts);
- ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts);
- ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts);
- ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts);
- ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts);
- ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts);
+ ncs->hnc_cnt = be64_to_cpu(rsp->cnt);
+ ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes);
+ ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes);
+ ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts);
+ ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts);
+ ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts);
+ ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts);
+ ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts);
+ ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts);
ncs->hnc_fcs_err = ntohl(rsp->fcs_err);
ncs->hnc_align_err = ntohl(rsp->align_err);
ncs->hnc_false_carrier = ntohl(rsp->false_carrier);
@@ -964,7 +963,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr)
ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames);
ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames);
ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames);
- ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes);
+ ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes);
ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts);
ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts);
@@ -1089,7 +1088,7 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr)
{
struct ncsi_dev_priv *ndp = nr->ndp;
- struct sockaddr *saddr = &ndp->pending_mac;
+ struct sockaddr_storage *saddr = &ndp->pending_mac;
struct net_device *ndev = ndp->ndev.dev;
struct ncsi_rsp_gmcma_pkt *rsp;
int i;
@@ -1106,15 +1105,15 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr)
rsp->addresses[i][4], rsp->addresses[i][5]);
}
- saddr->sa_family = ndev->type;
+ saddr->ss_family = ndev->type;
for (i = 0; i < rsp->address_count; i++) {
if (!is_valid_ether_addr(rsp->addresses[i])) {
netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n",
rsp->addresses[i]);
continue;
}
- memcpy(saddr->sa_data, rsp->addresses[i], ETH_ALEN);
- netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->sa_data);
+ memcpy(saddr->__data, rsp->addresses[i], ETH_ALEN);
+ netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->__data);
break;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 047ba81865ed..2560416218d0 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -212,7 +212,7 @@ config NF_CT_PROTO_SCTP
bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default y
- select CRC32
+ select NET_CRC32C
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
@@ -475,7 +475,7 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
- select CRC32
+ select NET_CRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
@@ -1180,7 +1180,7 @@ config NETFILTER_XT_MATCH_CGROUP
tristate '"control group" match support'
depends on NETFILTER_ADVANCED
depends on CGROUPS
- select CGROUP_NET_CLASSID
+ select SOCK_CGROUP_DATA
help
Socket/process control group matching allows you to match locally
generated packets based on which net_cls control group processes
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b9f551f02c81..11a702065bab 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -31,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
-DEFINE_PER_CPU(bool, nf_skb_duplicated);
-EXPORT_SYMBOL_GPL(nf_skb_duplicated);
-
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 8c5b1fe12d07..c203252e856d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -105,7 +105,7 @@ config IP_VS_PROTO_AH
config IP_VS_PROTO_SCTP
bool "SCTP load balancing support"
- select CRC32
+ select NET_CRC32C
help
This option enables support for load balancing SCTP transport
protocol. Say Y if unsure.
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..201d3c4ec623 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct)
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);
+static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
+{
+ return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
+}
+
static void
clean_from_lists(struct nf_conn *ct)
{
@@ -531,10 +536,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
p = tmpl;
tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
- if (tmpl != p) {
- tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ if (tmpl != p)
tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
- }
} else {
tmpl = kzalloc(sizeof(*tmpl), flags);
if (!tmpl)
@@ -2712,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
.attach = nf_conntrack_attach,
.set_closing = nf_conntrack_set_closing,
.confirm = __nf_conntrack_confirm,
+ .get_id = nf_conntrack_get_id,
};
void nf_conntrack_init_end(void)
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..6c4cff10357d 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -98,69 +98,87 @@ struct ct_iter_state {
struct seq_net_private p;
struct hlist_nulls_head *hash;
unsigned int htable_size;
+ unsigned int skip_elems;
unsigned int bucket;
u_int64_t time_now;
};
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net,
+ struct ct_iter_state *st)
{
- struct ct_iter_state *st = seq->private;
+ struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int i;
- for (st->bucket = 0;
- st->bucket < st->htable_size;
- st->bucket++) {
- n = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- if (!is_a_nulls(n))
- return n;
- }
- return NULL;
-}
+ for (i = st->bucket; i < st->htable_size; i++) {
+ unsigned int skip = 0;
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
- struct hlist_nulls_node *head)
-{
- struct ct_iter_state *st = seq->private;
+restart:
+ hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct hlist_nulls_node *tmp = n;
- head = rcu_dereference(hlist_nulls_next_rcu(head));
- while (is_a_nulls(head)) {
- if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= st->htable_size)
- return NULL;
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
+ if (++skip <= st->skip_elems)
+ continue;
+
+ /* h should be returned, skip to nulls marker. */
+ while (!is_a_nulls(tmp))
+ tmp = rcu_dereference(hlist_nulls_next_rcu(tmp));
+
+ /* check if h is still linked to hash[i] */
+ if (get_nulls_value(tmp) != i) {
+ skip = 0;
+ goto restart;
+ }
+
+ st->skip_elems = skip;
+ st->bucket = i;
+ return h;
}
- head = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- }
- return head;
-}
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_nulls_node *head = ct_get_first(seq);
+ skip = 0;
+ if (get_nulls_value(n) != i)
+ goto restart;
+
+ st->skip_elems = 0;
+ }
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
+ st->bucket = i;
+ return NULL;
}
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct ct_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
st->time_now = ktime_get_real_ns();
rcu_read_lock();
nf_conntrack_get_ht(&st->hash, &st->htable_size);
- return ct_get_idx(seq, *pos);
+
+ if (*pos == 0) {
+ st->skip_elems = 0;
+ st->bucket = 0;
+ } else if (st->skip_elems) {
+ /* resume from last dumped entry */
+ st->skip_elems--;
+ }
+
+ return ct_get_next(net, st);
}
static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct ct_iter_state *st = s->private;
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- return ct_get_next(s, v);
+ return ct_get_next(net, st);
}
static void ct_seq_stop(struct seq_file *s, void *v)
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index a8e2425e43b0..fab8b9011098 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -15,12 +15,26 @@
#define NF_RECURSION_LIMIT 2
-static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+#ifndef CONFIG_PREEMPT_RT
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+}
+#else
+
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return &current->net_xmit.nf_dup_skb_recursion;
+}
+
+#endif
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
- if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
+
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
goto err;
if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
@@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb_clear_tstamp(skb);
- __this_cpu_inc(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
- __this_cpu_dec(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)--;
return;
err:
kfree_skb(skb);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a133e1c175ce..24c71ecb2179 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -300,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
static int nft_netdev_register_hooks(struct net *net,
struct list_head *hook_list)
{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, j;
j = 0;
list_for_each_entry(hook, hook_list, list) {
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0)
- goto err_register;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nf_register_net_hook(net, ops);
+ if (err < 0)
+ goto err_register;
- j++;
+ j++;
+ }
}
return 0;
err_register:
list_for_each_entry(hook, hook_list, list) {
- if (j-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (j-- <= 0)
+ break;
- nf_unregister_net_hook(net, &hook->ops);
+ nf_unregister_net_hook(net, ops);
+ }
}
return err;
}
+static void nft_netdev_hook_free_ops(struct nft_hook *hook)
+{
+ struct nf_hook_ops *ops, *next;
+
+ list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
+ list_del(&ops->list);
+ kfree(ops);
+ }
+}
+
+static void nft_netdev_hook_free(struct nft_hook *hook)
+{
+ nft_netdev_hook_free_ops(hook);
+ kfree(hook);
+}
+
+static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
+{
+ struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu);
+
+ nft_netdev_hook_free(hook);
+}
+
+static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
+{
+ call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
+}
+
static void nft_netdev_unregister_hooks(struct net *net,
struct list_head *hook_list,
bool release_netdev)
{
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
if (release_netdev) {
list_del(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
}
@@ -2253,7 +2288,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
module_put(basechain->type->owner);
@@ -2274,19 +2309,20 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
const struct nlattr *attr)
{
+ struct nf_hook_ops *ops;
struct net_device *dev;
struct nft_hook *hook;
int err;
hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
- if (!hook) {
- err = -ENOMEM;
- goto err_hook_alloc;
- }
+ if (!hook)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&hook->ops_list);
err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
if (err < 0)
- goto err_hook_dev;
+ goto err_hook_free;
hook->ifnamelen = nla_len(attr);
@@ -2294,18 +2330,22 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
* indirectly serializing all the other holders of the commit_mutex with
* the rtnl_mutex.
*/
- dev = __dev_get_by_name(net, hook->ifname);
- if (!dev) {
- err = -ENOENT;
- goto err_hook_dev;
- }
- hook->ops.dev = dev;
+ for_each_netdev(net, dev) {
+ if (strncmp(dev->name, hook->ifname, hook->ifnamelen))
+ continue;
+ ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
+ err = -ENOMEM;
+ goto err_hook_free;
+ }
+ ops->dev = dev;
+ list_add_tail(&ops->list, &hook->ops_list);
+ }
return hook;
-err_hook_dev:
- kfree(hook);
-err_hook_alloc:
+err_hook_free:
+ nft_netdev_hook_free(hook);
return ERR_PTR(err);
}
@@ -2315,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
- if (!strcmp(hook->ifname, this->ifname))
+ if (!strncmp(hook->ifname, this->ifname,
+ min(hook->ifnamelen, this->ifnamelen)))
return hook;
}
@@ -2345,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
}
if (nft_hook_list_find(hook_list, hook)) {
NL_SET_BAD_ATTR(extack, tmp);
- kfree(hook);
+ nft_netdev_hook_free(hook);
err = -EEXIST;
goto err_hook;
}
@@ -2363,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
err_hook:
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
}
return err;
}
@@ -2506,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
list_for_each_entry_safe(h, next, &hook->list, list) {
list_del(&h->list);
- kfree(h);
+ nft_netdev_hook_free(h);
}
module_put(hook->type->owner);
}
@@ -2559,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
struct nft_chain_hook *hook, u32 flags)
{
struct nft_chain *chain;
+ struct nf_hook_ops *ops;
struct nft_hook *h;
basechain->type = hook->type;
@@ -2567,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
- list_for_each_entry(h, &basechain->hook_list, list)
- nft_basechain_hook_init(&h->ops, family, hook, chain);
+ list_for_each_entry(h, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nft_basechain_hook_init(ops, family, hook, chain);
+ }
}
nft_basechain_hook_init(&basechain->ops, family, hook, chain);
@@ -2787,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(h, next, &hook.list, list) {
- h->ops.pf = basechain->ops.pf;
- h->ops.hooknum = basechain->ops.hooknum;
- h->ops.priority = basechain->ops.priority;
- h->ops.priv = basechain->ops.priv;
- h->ops.hook = basechain->ops.hook;
+ list_for_each_entry(ops, &h->ops_list, list) {
+ ops->pf = basechain->ops.pf;
+ ops->hooknum = basechain->ops.hooknum;
+ ops->priority = basechain->ops.priority;
+ ops->priv = basechain->ops.priv;
+ ops->hook = basechain->ops.hook;
+ }
if (nft_hook_list_find(&basechain->hook_list, h)) {
list_del(&h->list);
- kfree(h);
+ nft_netdev_hook_free(h);
}
}
} else {
@@ -2913,10 +2959,12 @@ err_trans:
err_hooks:
if (nla[NFTA_CHAIN_HOOK]) {
list_for_each_entry_safe(h, next, &hook.list, list) {
- if (unregister)
- nf_unregister_net_hook(ctx->net, &h->ops);
+ if (unregister) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nf_unregister_net_hook(ctx->net, ops);
+ }
list_del(&h->list);
- kfree_rcu(h, rcu);
+ nft_netdev_hook_free_rcu(h);
}
module_put(hook.type->owner);
}
@@ -4569,6 +4617,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
[NFTA_SET_EXPR] = { .type = NLA_NESTED },
[NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
+ [NFTA_SET_TYPE] = { .type = NLA_REJECT },
+ [NFTA_SET_COUNT] = { .type = NLA_REJECT },
};
static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
@@ -4763,6 +4813,27 @@ static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
return size;
}
+static noinline_for_stack int
+nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set)
+{
+ unsigned int nelems;
+ char str[40];
+ int ret;
+
+ ret = snprintf(str, sizeof(str), "%ps", set->ops);
+
+ /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped
+ * to userspace purely for informational/debug purposes.
+ */
+ DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str));
+
+ if (nla_put_string(skb, NFTA_SET_TYPE, str))
+ return -EMSGSIZE;
+
+ nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems));
+ return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems));
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -4843,6 +4914,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_nest_end(skb, nest);
+ if (nf_tables_fill_set_info(skb, set))
+ goto nla_put_failure;
+
if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
@@ -8759,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
struct netlink_ext_ack *extack, bool add)
{
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int hooknum, priority;
int err;
@@ -8813,11 +8888,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
}
list_for_each_entry(hook, &flowtable_hook->list, list) {
- hook->ops.pf = NFPROTO_NETDEV;
- hook->ops.hooknum = flowtable_hook->num;
- hook->ops.priority = flowtable_hook->priority;
- hook->ops.priv = &flowtable->data;
- hook->ops.hook = flowtable->data.type->hook;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable_hook->num;
+ ops->priority = flowtable_hook->priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ }
}
return err;
@@ -8859,12 +8936,12 @@ nft_flowtable_type_get(struct net *net, u8 family)
}
/* Only called from error and netdev event paths. */
-static void nft_unregister_flowtable_hook(struct net *net,
- struct nft_flowtable *flowtable,
- struct nft_hook *hook)
+static void nft_unregister_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
{
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ nf_unregister_net_hook(net, ops);
+ flowtable->data.type->setup(&flowtable->data, ops->dev,
FLOW_BLOCK_UNBIND);
}
@@ -8874,14 +8951,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net,
bool release_netdev)
{
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_UNBIND);
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(net, flowtable, ops);
if (release_netdev) {
list_del(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
}
@@ -8893,6 +8970,26 @@ static void nft_unregister_flowtable_net_hooks(struct net *net,
__nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
}
+static int nft_register_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
+{
+ int err;
+
+ err = flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_BIND);
+ if (err < 0)
+ return err;
+
+ err = nf_register_net_hook(net, ops);
+ if (!err)
+ return 0;
+
+ flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_UNBIND);
+ return err;
+}
+
static int nft_register_flowtable_net_hooks(struct net *net,
struct nft_table *table,
struct list_head *hook_list,
@@ -8900,6 +8997,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
{
struct nft_hook *hook, *next;
struct nft_flowtable *ft;
+ struct nf_hook_ops *ops;
int err, i = 0;
list_for_each_entry(hook, hook_list, list) {
@@ -8913,33 +9011,27 @@ static int nft_register_flowtable_net_hooks(struct net *net,
}
}
- err = flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_BIND);
- if (err < 0)
- goto err_unregister_net_hooks;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nft_register_flowtable_ops(net, flowtable, ops);
+ if (err < 0)
+ goto err_unregister_net_hooks;
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0) {
- flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_UNBIND);
- goto err_unregister_net_hooks;
+ i++;
}
-
- i++;
}
return 0;
err_unregister_net_hooks:
list_for_each_entry_safe(hook, next, hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- nft_unregister_flowtable_hook(net, flowtable, hook);
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
@@ -8951,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list)
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
@@ -8962,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
struct nft_trans *trans;
bool unregister = false;
u32 flags;
@@ -8975,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
if (nft_hook_list_find(&flowtable->hook_list, hook)) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
}
}
@@ -9019,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
err_flowtable_update_hook:
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
- if (unregister)
- nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
+ if (unregister) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(ctx->net,
+ flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
@@ -9168,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook
list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
list_del(&this->list);
- kfree(this);
+ nft_netdev_hook_free(this);
}
}
@@ -9531,7 +9627,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
flowtable->data.type->free(&flowtable->data);
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
kfree(flowtable->name);
module_put(flowtable->data.type->owner);
@@ -9564,46 +9660,190 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void nft_flowtable_event(unsigned long event, struct net_device *dev,
- struct nft_flowtable *flowtable)
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops);
+
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_rcu(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
+
+static void
+nf_tables_device_notify(const struct nft_table *table, int attr,
+ const char *name, const struct nft_hook *hook,
+ const struct net_device *dev, int event)
+{
+ struct net *net = dev_net(dev);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ u16 flags = 0;
+
+ if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ goto err;
+
+ event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV;
+ event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
+ nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family,
+ NFNETLINK_V0, nft_base_seq(net));
+ if (!nlh)
+ goto err;
+
+ if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) ||
+ nla_put_string(skb, attr, name) ||
+ nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) ||
+ nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
+ goto err;
+
+ nlmsg_end(skb, nlh);
+ nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV,
+ nlmsg_report(nlh), GFP_KERNEL);
+ return;
+err:
+ if (skb)
+ kfree_skb(skb);
+ nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS);
+}
+
+void
+nf_tables_chain_device_notify(const struct nft_chain *chain,
+ const struct nft_hook *hook,
+ const struct net_device *dev, int event)
+{
+ nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN,
+ chain->name, hook, dev, event);
+}
+
+static void
+nf_tables_flowtable_device_notify(const struct nft_flowtable *ft,
+ const struct nft_hook *hook,
+ const struct net_device *dev, int event)
{
+ nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE,
+ ft->name, hook, dev, event);
+}
+
+static int nft_flowtable_event(unsigned long event, struct net_device *dev,
+ struct nft_flowtable *flowtable, bool changename)
+{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &flowtable->hook_list, list) {
- if (hook->ops.dev != dev)
- continue;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
- /* flow_offload_netdev_event() cleans up entries for us. */
- nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ /* flow_offload_netdev_event() cleans up entries for us. */
+ nft_unregister_flowtable_ops(dev_net(dev),
+ flowtable, ops);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable->hooknum;
+ ops->priority = flowtable->data.priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->dev = dev;
+ if (nft_register_flowtable_ops(dev_net(dev),
+ flowtable, ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
+ nf_tables_flowtable_device_notify(flowtable, hook, dev, event);
break;
}
+ return 0;
+}
+
+static int __nf_tables_flowtable_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nftables_pernet *nft_net = nft_pernet(dev_net(dev));
+ struct nft_flowtable *flowtable;
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ list_for_each_entry(flowtable, &table->flowtables, list) {
+ if (nft_flowtable_event(event, dev,
+ flowtable, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_flowtable_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct nft_flowtable *flowtable;
struct nftables_pernet *nft_net;
- struct nft_table *table;
+ int ret = NOTIFY_DONE;
struct net *net;
- if (event != NETDEV_UNREGISTER)
- return 0;
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
net = dev_net(dev);
nft_net = nft_pernet(net);
mutex_lock(&nft_net->commit_mutex);
- list_for_each_entry(table, &nft_net->tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- nft_flowtable_event(event, dev, flowtable);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_flowtable_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
+out_unlock:
mutex_unlock(&nft_net->commit_mutex);
-
- return NOTIFY_DONE;
+ return ret;
}
static struct notifier_block nf_tables_flowtable_notifier = {
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 64675f1c7f29..fd30e205de84 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
bool nft_chain_offload_support(const struct nft_base_chain *basechain)
{
+ struct nf_hook_ops *ops;
struct net_device *dev;
struct nft_hook *hook;
@@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain)
return false;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.pf != NFPROTO_NETDEV ||
- hook->ops.hooknum != NF_NETDEV_INGRESS)
- return false;
-
- dev = hook->ops.dev;
- if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists())
- return false;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->pf != NFPROTO_NETDEV ||
+ ops->hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = ops->dev;
+ if (!dev->netdev_ops->ndo_setup_tc &&
+ !flow_indr_dev_exists())
+ return false;
+ }
}
return true;
@@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain,
const struct net_device *this_dev,
enum flow_block_command cmd)
{
- struct net_device *dev;
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, i = 0;
list_for_each_entry(hook, &basechain->hook_list, list) {
- dev = hook->ops.dev;
- if (this_dev && this_dev != dev)
- continue;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (this_dev && this_dev != ops->dev)
+ continue;
- err = nft_chain_offload_cmd(basechain, dev, cmd);
- if (err < 0 && cmd == FLOW_BLOCK_BIND) {
- if (!this_dev)
- goto err_flow_block;
+ err = nft_chain_offload_cmd(basechain, ops->dev, cmd);
+ if (err < 0 && cmd == FLOW_BLOCK_BIND) {
+ if (!this_dev)
+ goto err_flow_block;
- return err;
+ return err;
+ }
+ i++;
}
- i++;
}
return 0;
err_flow_block:
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- dev = hook->ops.dev;
- nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
+ nft_chain_offload_cmd(basechain, ops->dev,
+ FLOW_BLOCK_UNBIND);
+ }
}
return err;
}
@@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n
found = NULL;
basechain = nft_base_chain(chain);
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev != dev)
+ if (!nft_hook_find_ops(hook, dev))
continue;
found = hook;
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 580c55268f65..ae3fe87195ab 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -15,6 +15,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
return 0;
}
+static int nf_trace_fill_ct_info(struct sk_buff *nlskb,
+ const struct sk_buff *skb)
+{ /* Append conntrack attributes for skb to the trace message nlskb; returns 0 on success or when nothing is emitted, -1 if any nla_put_*() call fails. */
+ const struct nf_ct_hook *ct_hook;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ u32 state;
+
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook) /* nf_ct_hook is NULL: emit no ct attributes */
+ return 0;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) {
+ if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */
+ return 0;
+
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ } else {
+ state = NF_CT_STATE_BIT(ctinfo);
+ }
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) /* state is the only ct attribute emitted for untracked packets */
+ return -1;
+
+ if (ct) { /* direction, id and status are only emitted when a conntrack entry is attached */
+ u32 id = ct_hook->get_id(&ct->ct_general);
+ u32 status = READ_ONCE(ct->status);
+ u8 dir = CTINFO2DIR(ctinfo);
+
+ if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir))
+ return -1;
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) /* id is passed through as-is, without htonl() */
+ return -1;
+
+ if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) /* status attribute is omitted when zero */
+ return -1;
+ }
+
+ return 0;
+}
+
static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
@@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
- nla_total_size(sizeof(u32)) + /* id */
+ nla_total_size(sizeof(u32)) + /* ct id */
+ nla_total_size(sizeof(u8)) + /* ct direction */
+ nla_total_size(sizeof(u32)) + /* ct state */
+ nla_total_size(sizeof(u32)) + /* ct status */
+ nla_total_size(sizeof(u32)) + /* trace id */
nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
@@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
if (nf_trace_fill_pkt_info(skb, pkt))
goto nla_put_failure;
+
+ if (nf_trace_fill_ct_info(skb, pkt->skb))
+ goto nla_put_failure;
+
info->packet_dumped = true;
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e598a2a252b0..ac77fc21632d 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES,
[NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT,
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
+ [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES,
};
static struct nfnl_net *nfnl_pernet(struct net *net)
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 19a553550c76..846d48ba8965 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -318,38 +318,68 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
},
};
-static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_base_chain *basechain)
+static int nft_netdev_event(unsigned long event, struct net_device *dev,
+ struct nft_base_chain *basechain, bool changename)
{
+ struct nft_table *table = basechain->chain.table;
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev != dev)
- continue;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), ops);
- if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT))
- nf_unregister_net_hook(dev_net(dev), &hook->ops);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ ops = kmemdup(&basechain->ops,
+ sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->dev = dev;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nf_register_net_hook(dev_net(dev), ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
+ nf_tables_chain_device_notify(&basechain->chain,
+ hook, dev, event);
break;
}
+ return 0;
}
-static int nf_tables_netdev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
+static int __nf_tables_netdev_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_base_chain *basechain;
struct nftables_pernet *nft_net;
struct nft_chain *chain;
struct nft_table *table;
- if (event != NETDEV_UNREGISTER)
- return NOTIFY_DONE;
-
nft_net = nft_pernet(dev_net(dev));
- mutex_lock(&nft_net->commit_mutex);
list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV &&
table->family != NFPROTO_INET)
@@ -364,12 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this,
basechain->ops.hooknum != NF_INET_INGRESS)
continue;
- nft_netdev_event(event, dev, basechain);
+ if (nft_netdev_event(event, dev, basechain, changename))
+ return 1;
}
}
- mutex_unlock(&nft_net->commit_mutex);
+ return 0;
+}
+
+static int nf_tables_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
+ int ret = NOTIFY_DONE;
- return NOTIFY_DONE;
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(dev_net(dev));
+ mutex_lock(&nft_net->commit_mutex);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
+ }
+ __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_netdev_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
+ }
+out_unlock:
+ mutex_unlock(&nft_net->commit_mutex);
+ return ret;
}
static struct notifier_block nf_tables_netdev_notifier = {
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 221d50223018..225ff293cd50 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev,
bool found = false;
list_for_each_entry_rcu(hook, &ft->hook_list, list) {
- if (hook->ops.dev != dev)
+ if (!nft_hook_find_ops_rcu(hook, dev))
continue;
found = true;
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 817ab978d24a..c4569d4b9228 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -23,7 +23,14 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
-static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+struct nft_inner_tun_ctx_locked { /* element type of the per-CPU nft_pcpu_tun_ctx variable */
+ struct nft_inner_tun_ctx ctx; /* saved tunnel context; callers compare ->cookie before reusing or overwriting it */
+ local_lock_t bh_lock; /* taken with local_lock_nested_bh() inside local_bh_disable() sections around every ctx access */
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/* Same layout as nft_expr but it embeds the private expression data area. */
struct __nft_expr {
@@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
- this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); /* release in reverse order of acquisition, while BHs are still disabled (same order as the success path) */
local_bh_enable();
return false;
}
*tun_ctx = *this_cpu_tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
return true;
@@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
- this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
*this_cpu_tun_ctx = *tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
}
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 9b2d7463d3d3..df0798da2329 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -19,10 +19,16 @@ struct nft_quota {
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ bool *report)
{
- return atomic64_add_return(skb->len, priv->consumed) >=
- atomic64_read(&priv->quota);
+ u64 consumed = atomic64_add_return(skb->len, priv->consumed);
+ u64 quota = atomic64_read(&priv->quota);
+
+ if (report)
+ *report = consumed >= quota;
+
+ return consumed > quota;
}
static inline bool nft_quota_invert(struct nft_quota *priv)
@@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
+ if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
@@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj,
const struct nft_pktinfo *pkt)
{
struct nft_quota *priv = nft_obj_data(obj);
- bool overquota;
+ bool overquota, report;
- overquota = nft_overquota(priv, pkt->skb);
+ overquota = nft_overquota(priv, pkt->skb, &report);
if (overquota ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
- if (overquota &&
+ if (report &&
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0,
NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC);
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 7be342b495f5..c5855069bdab 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -663,6 +663,9 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f,
check_add_overflow(rules, extra, &rules_alloc))
return -EOVERFLOW;
+ if (rules_alloc > (INT_MAX / sizeof(*new_mt)))
+ return -ENOMEM;
+
new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT);
if (!new_mt)
return -ENOMEM;
@@ -683,6 +686,30 @@ out_free:
return 0;
}
+
+/**
+ * lt_calculate_size() - Get storage size for lookup table with overflow check
+ * @groups: Amount of bit groups
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @bsize: Size of each bucket in lookup table, in longs
+ *
+ * Return: allocation size in bytes including alignment overhead, or -1 on overflow or when the size exceeds INT_MAX
+ */
+static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb,
+ unsigned int bsize)
+{
+ ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); /* NOTE(review): this first product is not overflow-checked; presumably groups and bb are small enough - confirm */
+
+ if (check_mul_overflow(ret, bsize, &ret)) /* scale by bucket size (in longs) */
+ return -1;
+ if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) /* callers pass the result straight to kvzalloc(), so headroom is included here */
+ return -1;
+ if (ret > INT_MAX) /* cap at INT_MAX, matching the rules_alloc checks before kvmalloc_array() in this file */
+ return -1;
+
+ return ret;
+}
+
/**
* pipapo_resize() - Resize lookup or mapping table, or both
* @f: Field containing lookup and mapping tables
@@ -701,6 +728,7 @@ static int pipapo_resize(struct nft_pipapo_field *f,
long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
unsigned int new_bucket_size, copy;
int group, bucket, err;
+ ssize_t lt_size;
if (rules >= NFT_PIPAPO_RULE0_MAX)
return -ENOSPC;
@@ -719,10 +747,11 @@ static int pipapo_resize(struct nft_pipapo_field *f,
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
- new_bucket_size * sizeof(*new_lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size);
+ if (lt_size < 0)
+ return -ENOMEM;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return -ENOMEM;
@@ -907,7 +936,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
{
unsigned int groups, bb;
unsigned long *new_lt;
- size_t lt_size;
+ ssize_t lt_size;
lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
sizeof(*f->lt);
@@ -917,15 +946,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
groups = f->groups * 2;
bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
} else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
groups = f->groups / 2;
bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
/* Don't increase group width if the resulting lookup table size
* would exceed the upper size threshold for a "small" set.
@@ -936,7 +967,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
return;
}
- new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT);
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return;
@@ -1451,13 +1482,15 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
for (i = 0; i < old->field_count; i++) {
unsigned long *new_lt;
+ ssize_t lt_size;
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
- src->bsize * sizeof(*dst->lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL_ACCOUNT);
+ lt_size = lt_calculate_size(src->groups, src->bb, src->bsize);
+ if (lt_size < 0)
+ goto out_lt;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
goto out_lt;
@@ -1469,6 +1502,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
src->groups * NFT_PIPAPO_BUCKETS(src->bb));
if (src->rules > 0) {
+ if (src->rules_alloc > (INT_MAX / sizeof(*src->mt)))
+ goto out_mt;
+
dst->mt = kvmalloc_array(src->rules_alloc,
sizeof(*src->mt),
GFP_KERNEL_ACCOUNT);
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 0c63d1367cf7..a12486ae089d 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -621,10 +621,10 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
struct geneve_opt *opt;
int offset = 0;
- inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
- if (!inner)
- goto failure;
while (opts->len > offset) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
opt = (struct geneve_opt *)(opts->u.data + offset);
if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
opt->opt_class) ||
@@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
opt->length * 4, opt->opt_data))
goto inner_failure;
offset += sizeof(*opt) + opt->length * 4;
+ nla_nest_end(skb, inner);
}
- nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 9f54819eb52c..9082155ee558 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -168,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
INIT_WORK(&info->timer->work, idletimer_tg_work);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return 0;
@@ -229,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
} else {
timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return 0;
@@ -254,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
info->label, info->timeout);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return XT_CONTINUE;
}
@@ -275,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
alarm_start_relative(&info->timer->alarm, tout);
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return XT_CONTINUE;
@@ -320,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
if (info->timer) {
info->timer->refcnt++;
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -382,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
}
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 30e99464171b..93f064306901 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index c0f5e9a4f3c6..c437fbd59ec1 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching");
MODULE_ALIAS("ipt_cgroup");
MODULE_ALIAS("ip6t_cgroup");
+#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n"
+
static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
{
struct xt_cgroup_info_v0 *info = par->matchinfo;
@@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
if (info->invert & ~1)
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
return 0;
}
@@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
+#ifdef CONFIG_CGROUP_NET_CLASSID
const struct xt_cgroup_info_v0 *info = par->matchinfo;
struct sock *sk = skb->sk;
@@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
info->invert;
+#endif
+ return false;
}
static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
@@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
@@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 65b965ca40ea..59b9d04400ca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_mark_tginfo2),
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES)
+#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
{
.name = "MARK",
.revision = 2,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index cd9160bbc919..6ea16138582c 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -1165,6 +1165,9 @@ int netlbl_conn_setattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
+ if (sk->sk_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
addr6 = (struct sockaddr_in6 *)addr;
entry = netlbl_domhsh_getentry_af6(secattr->domain,
&addr6->sin6_addr);
diff --git a/net/netlink/policy.c b/net/netlink/policy.c
index 1f8909c16f14..99458da6be32 100644
--- a/net/netlink/policy.c
+++ b/net/netlink/policy.c
@@ -311,6 +311,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
NL_POLICY_TYPE_ATTR_PAD))
goto nla_put_failure;
break;
+ } else if (pt->validation_type == NLA_VALIDATE_FUNCTION) {
+ break;
}
nla_get_range_unsigned(pt, &range);
@@ -340,6 +342,9 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state,
else
type = NL_ATTR_TYPE_SINT;
+ if (pt->validation_type == NLA_VALIDATE_FUNCTION)
+ break;
+
nla_get_range_signed(pt, &range);
if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S,
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 5481bd561eb4..e6aaee92dba4 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -11,8 +11,8 @@ config OPENVSWITCH
(!NF_NAT || NF_NAT) && \
(!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
depends on PSAMPLE || !PSAMPLE
- select CRC32
select MPLS
+ select NET_CRC32C
select NET_MPLS_GSO
select DST_CACHE
select NET_NSH
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2f22ca59586f..e7269a3eec79 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -39,56 +39,18 @@
#include "flow_netlink.h"
#include "openvswitch_trace.h"
-struct deferred_action {
- struct sk_buff *skb;
- const struct nlattr *actions;
- int actions_len;
-
- /* Store pkt_key clone when creating deferred action. */
- struct sw_flow_key pkt_key;
-};
-
-#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
-struct ovs_frag_data {
- unsigned long dst;
- struct vport *vport;
- struct ovs_skb_cb cb;
- __be16 inner_protocol;
- u16 network_offset; /* valid only for MPLS */
- u16 vlan_tci;
- __be16 vlan_proto;
- unsigned int l2_len;
- u8 mac_proto;
- u8 l2_data[MAX_L2_LEN];
-};
-
-static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
-
-#define DEFERRED_ACTION_FIFO_SIZE 10
-#define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
-struct action_fifo {
- int head;
- int tail;
- /* Deferred action fifo queue storage. */
- struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
+DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
-struct action_flow_keys {
- struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
-};
-
-static struct action_fifo __percpu *action_fifos;
-static struct action_flow_keys __percpu *flow_keys;
-static DEFINE_PER_CPU(int, exec_actions_level);
-
/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
* space. Return NULL if out of key spaces.
*/
static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
{
- struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
- int level = this_cpu_read(exec_actions_level);
+ struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage);
+ struct action_flow_keys *keys = &ovs_pcpu->flow_keys;
+ int level = ovs_pcpu->exec_level;
struct sw_flow_key *key = NULL;
if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
@@ -132,10 +94,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
const struct nlattr *actions,
const int actions_len)
{
- struct action_fifo *fifo;
+ struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos);
struct deferred_action *da;
- fifo = this_cpu_ptr(action_fifos);
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
@@ -794,7 +755,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
static int ovs_vport_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
+ struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data);
struct vport *vport = data->vport;
if (skb_cow_head(skb, data->l2_len) < 0) {
@@ -846,7 +807,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
- data = this_cpu_ptr(&ovs_frag_data_storage);
+ data = this_cpu_ptr(&ovs_pcpu_storage.frag_data);
data->dst = skb->_skb_refdst;
data->vport = vport;
data->cb = *OVS_CB(skb);
@@ -1608,13 +1569,13 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
if (actions) { /* Sample action */
if (clone_flow_key)
- __this_cpu_inc(exec_actions_level);
+ __this_cpu_inc(ovs_pcpu_storage.exec_level);
err = do_execute_actions(dp, skb, clone,
actions, len);
if (clone_flow_key)
- __this_cpu_dec(exec_actions_level);
+ __this_cpu_dec(ovs_pcpu_storage.exec_level);
} else { /* Recirc action */
clone->recirc_id = recirc_id;
ovs_dp_process_packet(skb, clone);
@@ -1650,7 +1611,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
static void process_deferred_actions(struct datapath *dp)
{
- struct action_fifo *fifo = this_cpu_ptr(action_fifos);
+ struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos);
/* Do not touch the FIFO in case there is no deferred actions. */
if (action_fifo_is_empty(fifo))
@@ -1681,7 +1642,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
{
int err, level;
- level = __this_cpu_inc_return(exec_actions_level);
+ level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level);
if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
ovs_dp_name(dp));
@@ -1698,27 +1659,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
process_deferred_actions(dp);
out:
- __this_cpu_dec(exec_actions_level);
+ __this_cpu_dec(ovs_pcpu_storage.exec_level);
return err;
}
-
-int action_fifos_init(void)
-{
- action_fifos = alloc_percpu(struct action_fifo);
- if (!action_fifos)
- return -ENOMEM;
-
- flow_keys = alloc_percpu(struct action_flow_keys);
- if (!flow_keys) {
- free_percpu(action_fifos);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void action_fifos_exit(void)
-{
- free_percpu(action_fifos);
- free_percpu(flow_keys);
-}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5d548eda742d..6a304ae2d959 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -244,11 +244,13 @@ void ovs_dp_detach_port(struct vport *p)
/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
+ struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage);
const struct vport *p = OVS_CB(skb)->input_vport;
struct datapath *dp = p->dp;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct dp_stats_percpu *stats;
+ bool ovs_pcpu_locked = false;
u64 *stats_counter;
u32 n_mask_hit;
u32 n_cache_hit;
@@ -290,10 +292,26 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
+ /* This path can be invoked recursively: Use the current task to
+ * identify recursive invocation - the lock must be acquired only once.
+ * Even with disabled bottom halves this can be preempted on PREEMPT_RT.
+ * Limit the locking to RT to avoid assigning `owner' if it can be
+ * avoided.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) {
+ local_lock_nested_bh(&ovs_pcpu_storage.bh_lock);
+ ovs_pcpu->owner = current;
+ ovs_pcpu_locked = true;
+ }
+
error = ovs_execute_actions(dp, skb, sf_acts, key);
if (unlikely(error))
net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n",
ovs_dp_name(dp), error);
+ if (ovs_pcpu_locked) {
+ ovs_pcpu->owner = NULL;
+ local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock);
+ }
stats_counter = &stats->n_hit;
@@ -671,7 +689,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
sf_acts = rcu_dereference(flow->sf_acts);
local_bh_disable();
+ local_lock_nested_bh(&ovs_pcpu_storage.bh_lock);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ this_cpu_write(ovs_pcpu_storage.owner, current);
err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ this_cpu_write(ovs_pcpu_storage.owner, NULL);
+ local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock);
local_bh_enable();
rcu_read_unlock();
@@ -2729,13 +2753,9 @@ static int __init dp_init(void)
pr_info("Open vSwitch switching datapath\n");
- err = action_fifos_init();
- if (err)
- goto error;
-
err = ovs_internal_dev_rtnl_link_register();
if (err)
- goto error_action_fifos_exit;
+ goto error;
err = ovs_flow_init();
if (err)
@@ -2778,8 +2798,6 @@ error_flow_exit:
ovs_flow_exit();
error_unreg_rtnl_link:
ovs_internal_dev_rtnl_link_unregister();
-error_action_fifos_exit:
- action_fifos_exit();
error:
return err;
}
@@ -2795,7 +2813,6 @@ static void dp_cleanup(void)
ovs_vport_exit();
ovs_flow_exit();
ovs_internal_dev_rtnl_link_unregister();
- action_fifos_exit();
}
module_init(dp_init);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 384ca77f4e79..1b5348b0f559 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -13,6 +13,7 @@
#include <linux/skbuff.h>
#include <linux/u64_stats_sync.h>
#include <net/ip_tunnels.h>
+#include <net/mpls.h>
#include "conntrack.h"
#include "flow.h"
@@ -173,6 +174,54 @@ struct ovs_net {
bool xt_label;
};
+#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
+struct ovs_frag_data {
+ unsigned long dst;
+ struct vport *vport;
+ struct ovs_skb_cb cb;
+ __be16 inner_protocol;
+ u16 network_offset; /* valid only for MPLS */
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ unsigned int l2_len;
+ u8 mac_proto;
+ u8 l2_data[MAX_L2_LEN];
+};
+
+struct deferred_action {
+ struct sk_buff *skb;
+ const struct nlattr *actions;
+ int actions_len;
+
+ /* Store pkt_key clone when creating deferred action. */
+ struct sw_flow_key pkt_key;
+};
+
+#define DEFERRED_ACTION_FIFO_SIZE 10
+#define OVS_RECURSION_LIMIT 5
+#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+
+struct action_fifo {
+ int head;
+ int tail;
+ /* Deferred action fifo queue storage. */
+ struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
+};
+
+struct action_flow_keys {
+ struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+};
+
+struct ovs_pcpu_storage {
+ struct action_fifo action_fifos;
+ struct action_flow_keys flow_keys;
+ struct ovs_frag_data frag_data;
+ int exec_level;
+ struct task_struct *owner;
+ local_lock_t bh_lock;
+};
+DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage);
+
/**
* enum ovs_pkt_hash_types - hash info to include with a packet
* to send to userspace.
@@ -281,9 +330,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
void ovs_dp_notify_wq(struct work_struct *work);
-int action_fifos_init(void);
-void action_fifos_exit(void);
-
/* 'KEY' must not have any bits set outside of the 'MASK' */
#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 8a848ce72e29..b80bd3a90773 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -788,7 +788,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
memset(&key->ipv4, 0, sizeof(key->ipv4));
}
} else if (eth_p_mpls(key->eth.type)) {
- u8 label_count = 1;
+ size_t label_count = 1;
memset(&key->mpls, 0, sizeof(key->mpls));
skb_set_inner_network_header(skb, skb->mac_len);
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 518be23e48ea..ad64bb9ab5e2 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -3049,7 +3049,8 @@ static int validate_userspace(const struct nlattr *attr)
struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
int error;
- error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr,
+ error = nla_parse_deprecated_strict(a, OVS_USERSPACE_ATTR_MAX,
+ nla_data(attr), nla_len(attr),
userspace_policy, NULL);
if (error)
return error;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d4dba06297c3..20be2c47cf41 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3713,15 +3713,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
}
static void packet_dev_mclist_delete(struct net_device *dev,
- struct packet_mclist **mlp)
+ struct packet_mclist **mlp,
+ struct list_head *list)
{
struct packet_mclist *ml;
while ((ml = *mlp) != NULL) {
if (ml->ifindex == dev->ifindex) {
- packet_dev_mc(dev, ml, -1);
+ list_add(&ml->remove_list, list);
*mlp = ml->next;
- kfree(ml);
} else
mlp = &ml->next;
}
@@ -3769,6 +3769,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
memcpy(i->addr, mreq->mr_address, i->alen);
memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
i->count = 1;
+ INIT_LIST_HEAD(&i->remove_list);
i->next = po->mclist;
po->mclist = i;
err = packet_dev_mc(dev, i, 1);
@@ -4233,9 +4234,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
static int packet_notifier(struct notifier_block *this,
unsigned long msg, void *ptr)
{
- struct sock *sk;
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
+ struct packet_mclist *ml, *tmp;
+ LIST_HEAD(mclist);
+ struct sock *sk;
rcu_read_lock();
sk_for_each_rcu(sk, &net->packet.sklist) {
@@ -4244,7 +4247,8 @@ static int packet_notifier(struct notifier_block *this,
switch (msg) {
case NETDEV_UNREGISTER:
if (po->mclist)
- packet_dev_mclist_delete(dev, &po->mclist);
+ packet_dev_mclist_delete(dev, &po->mclist,
+ &mclist);
fallthrough;
case NETDEV_DOWN:
@@ -4277,6 +4281,13 @@ static int packet_notifier(struct notifier_block *this,
}
}
rcu_read_unlock();
+
+ /* packet_dev_mc might grab instance locks so can't run under rcu */
+ list_for_each_entry_safe(ml, tmp, &mclist, remove_list) {
+ packet_dev_mc(dev, ml, -1);
+ kfree(ml);
+ }
+
return NOTIFY_DONE;
}
diff --git a/net/packet/internal.h b/net/packet/internal.h
index d5d70712007a..1e743d0316fd 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -11,6 +11,7 @@ struct packet_mclist {
unsigned short type;
unsigned short alen;
unsigned char addr[MAX_ADDR_LEN];
+ struct list_head remove_list;
};
/* kbdq - kernel block descriptor queue */
diff --git a/net/rds/connection.c b/net/rds/connection.c
index c749c5525b40..d62f486ab29f 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -749,8 +749,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
cinfo->laddr = conn->c_laddr.s6_addr32[3];
cinfo->faddr = conn->c_faddr.s6_addr32[3];
cinfo->tos = conn->c_tos;
- strncpy(cinfo->transport, conn->c_trans->t_name,
- sizeof(cinfo->transport));
+ strscpy_pad(cinfo->transport, conn->c_trans->t_name);
cinfo->flags = 0;
rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
@@ -775,8 +774,7 @@ static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
cinfo6->next_rx_seq = cp->cp_next_rx_seq;
cinfo6->laddr = conn->c_laddr;
cinfo6->faddr = conn->c_faddr;
- strncpy(cinfo6->transport, conn->c_trans->t_name,
- sizeof(cinfo6->transport));
+ strscpy_pad(cinfo6->transport, conn->c_trans->t_name);
cinfo6->flags = 0;
rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
diff --git a/net/rds/page.c b/net/rds/page.c
index 7cc57e098ddb..afb151eac271 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,10 +40,12 @@
struct rds_page_remainder {
struct page *r_page;
unsigned long r_offset;
+ local_lock_t bh_lock;
};
-static
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/**
* rds_page_remainder_alloc - build up regions of a message.
@@ -69,7 +71,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
gfp_t gfp)
{
struct rds_page_remainder *rem;
- unsigned long flags;
struct page *page;
int ret;
@@ -87,8 +88,9 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
goto out;
}
- rem = &per_cpu(rds_page_remainders, get_cpu());
- local_irq_save(flags);
+ local_bh_disable();
+ local_lock_nested_bh(&rds_page_remainders.bh_lock);
+ rem = this_cpu_ptr(&rds_page_remainders);
while (1) {
/* avoid a tiny region getting stuck by tossing it */
@@ -116,13 +118,14 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
/* alloc if there is nothing for us to use */
- local_irq_restore(flags);
- put_cpu();
+ local_unlock_nested_bh(&rds_page_remainders.bh_lock);
+ local_bh_enable();
page = alloc_page(gfp);
- rem = &per_cpu(rds_page_remainders, get_cpu());
- local_irq_save(flags);
+ local_bh_disable();
+ local_lock_nested_bh(&rds_page_remainders.bh_lock);
+ rem = this_cpu_ptr(&rds_page_remainders);
if (!page) {
ret = -ENOMEM;
@@ -140,8 +143,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
rem->r_offset = 0;
}
- local_irq_restore(flags);
- put_cpu();
+ local_unlock_nested_bh(&rds_page_remainders.bh_lock);
+ local_bh_enable();
out:
rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index a20986806fea..f60b81c66078 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -67,6 +67,29 @@ config RXKAD
See Documentation/networking/rxrpc.rst.
+config RXGK
+ bool "RxRPC GSSAPI security"
+ select CRYPTO_KRB5
+ select CRYPTO_MANAGER
+ select CRYPTO_KRB5ENC
+ select CRYPTO_AUTHENC
+ select CRYPTO_SKCIPHER
+ select CRYPTO_HASH_INFO
+ select CRYPTO_HMAC
+ select CRYPTO_CMAC
+ select CRYPTO_SHA1
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_CBC
+ select CRYPTO_CTS
+ select CRYPTO_AES
+ select CRYPTO_CAMELLIA
+ help
+ Provide the GSSAPI-based RxGK security class for AFS. Keys are added
+ with add_key().
+
+ See Documentation/networking/rxrpc.rst.
+
config RXPERF
tristate "RxRPC test service"
help
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 210b75e3179e..c0542bae719e 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -24,6 +24,7 @@ rxrpc-y := \
local_object.o \
misc.o \
net_ns.o \
+ oob.o \
output.o \
peer_event.o \
peer_object.o \
@@ -39,6 +40,9 @@ rxrpc-y := \
rxrpc-$(CONFIG_PROC_FS) += proc.o
rxrpc-$(CONFIG_RXKAD) += rxkad.o
rxrpc-$(CONFIG_SYSCTL) += sysctl.o
-
+rxrpc-$(CONFIG_RXGK) += \
+ rxgk.o \
+ rxgk_app.o \
+ rxgk_kdf.o
obj-$(CONFIG_RXPERF) += rxperf.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 86873399f7d5..36df0274d7b7 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog)
* @gfp: Allocation flags
*
* Lookup or create a remote transport endpoint record for the specified
- * address and return it with a ref held.
+ * address.
+ *
+ * Return: The peer record found with a reference, %NULL if no record is found
+ * or a negative error code if the address is invalid or unsupported.
*/
struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
struct sockaddr_rxrpc *srx, gfp_t gfp)
@@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer);
/**
* rxrpc_kernel_get_peer - Get a reference on a peer
- * @peer: The peer to get a reference on.
+ * @peer: The peer to get a reference on (may be NULL).
+ *
+ * Get a reference for a remote peer record (if not NULL).
*
- * Get a record for the remote peer in a call.
+ * Return: The @peer argument.
*/
struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer)
{
@@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer);
/**
* rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference
* @peer: The peer to drop a ref on
+ *
+ * Drop a reference on a peer record.
*/
void rxrpc_kernel_put_peer(struct rxrpc_peer *peer)
{
@@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer);
*
* Allow a kernel service to begin a call on the nominated socket. This just
* sets up all the internal tracking structures and allocates connection and
- * call IDs as appropriate. The call to be used is returned.
+ * call IDs as appropriate.
*
* The default socket destination address and security may be overridden by
* supplying @srx and @key.
+ *
+ * Return: The new call or an error code.
*/
struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
struct rxrpc_peer *peer,
@@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call);
*
* Allow a kernel service to find out whether a call is still alive - whether
* it has completed successfully and all received data has been consumed.
+ *
+ * Return: %true if the call is still ongoing and %false if it has completed.
*/
bool rxrpc_kernel_check_life(const struct socket *sock,
const struct rxrpc_call *call)
@@ -450,63 +461,20 @@ bool rxrpc_kernel_check_life(const struct socket *sock,
EXPORT_SYMBOL(rxrpc_kernel_check_life);
/**
- * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
- * @sock: The socket the call is on
- * @call: The call to query
- *
- * Allow a kernel service to retrieve the epoch value from a service call to
- * see if the client at the other end rebooted.
- */
-u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call)
-{
- return call->conn->proto.epoch;
-}
-EXPORT_SYMBOL(rxrpc_kernel_get_epoch);
-
-/**
- * rxrpc_kernel_new_call_notification - Get notifications of new calls
- * @sock: The socket to intercept received messages on
- * @notify_new_call: Function to be called when new calls appear
- * @discard_new_call: Function to discard preallocated calls
+ * rxrpc_kernel_set_notifications - Set table of callback operations
+ * @sock: The socket to install table upon
+ * @app_ops: Callback operation table to set
*
- * Allow a kernel service to be given notifications about new calls.
+ * Allow a kernel service to set a table of event notifications on a socket.
*/
-void rxrpc_kernel_new_call_notification(
- struct socket *sock,
- rxrpc_notify_new_call_t notify_new_call,
- rxrpc_discard_new_call_t discard_new_call)
+void rxrpc_kernel_set_notifications(struct socket *sock,
+ const struct rxrpc_kernel_ops *app_ops)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- rx->notify_new_call = notify_new_call;
- rx->discard_new_call = discard_new_call;
+ rx->app_ops = app_ops;
}
-EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
-
-/**
- * rxrpc_kernel_set_max_life - Set maximum lifespan on a call
- * @sock: The socket the call is on
- * @call: The call to configure
- * @hard_timeout: The maximum lifespan of the call in ms
- *
- * Set the maximum lifespan of a call. The call will end with ETIME or
- * ETIMEDOUT if it takes longer than this.
- */
-void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call,
- unsigned long hard_timeout)
-{
- ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by;
-
- mutex_lock(&call->user_mutex);
-
- expect_term_by = ktime_add(ktime_get_real(), delay);
- WRITE_ONCE(call->expect_term_by, expect_term_by);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard);
- rxrpc_poke_call(call, rxrpc_call_poke_set_timeout);
-
- mutex_unlock(&call->user_mutex);
-}
-EXPORT_SYMBOL(rxrpc_kernel_set_max_life);
+EXPORT_SYMBOL(rxrpc_kernel_set_notifications);
/*
* connect an RxRPC socket
@@ -624,7 +592,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
fallthrough;
case RXRPC_SERVER_BOUND:
case RXRPC_SERVER_LISTENING:
- ret = rxrpc_do_sendmsg(rx, m, len);
+ if (m->msg_flags & MSG_OOB)
+ ret = rxrpc_sendmsg_oob(rx, m, len);
+ else
+ ret = rxrpc_do_sendmsg(rx, m, len);
/* The socket has been unlocked */
goto out;
default:
@@ -659,7 +630,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- unsigned int min_sec_level;
+ unsigned int min_sec_level, val;
u16 service_upgrade[2];
int ret;
@@ -740,6 +711,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
rx->service_upgrade.to = service_upgrade[1];
goto success;
+ case RXRPC_MANAGE_RESPONSE:
+ ret = -EINVAL;
+ if (optlen != sizeof(unsigned int))
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNBOUND)
+ goto error;
+ ret = copy_safe_from_sockptr(&val, sizeof(val),
+ optval, optlen);
+ if (ret)
+ goto error;
+ ret = -EINVAL;
+ if (val > 1)
+ goto error;
+ if (val)
+ set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ else
+ clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ goto success;
+
default:
break;
}
@@ -846,6 +837,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
rx->calls = RB_ROOT;
spin_lock_init(&rx->incoming_lock);
+ skb_queue_head_init(&rx->recvmsg_oobq);
+ rx->pending_oobq = RB_ROOT;
INIT_LIST_HEAD(&rx->sock_calls);
INIT_LIST_HEAD(&rx->to_be_accepted);
INIT_LIST_HEAD(&rx->recvmsg_q);
@@ -879,8 +872,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags)
lock_sock(sk);
if (sk->sk_state < RXRPC_CLOSE) {
+ spin_lock_irq(&rx->recvmsg_lock);
sk->sk_state = RXRPC_CLOSE;
sk->sk_shutdown = SHUTDOWN_MASK;
+ spin_unlock_irq(&rx->recvmsg_lock);
} else {
ret = -ESHUTDOWN;
}
@@ -892,12 +887,30 @@ static int rxrpc_shutdown(struct socket *sock, int flags)
}
/*
+ * Purge the out-of-band queue.
+ */
+static void rxrpc_purge_oob_queue(struct sock *sk)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&rx->recvmsg_oobq)))
+ rxrpc_kernel_free_oob(skb);
+ while (!RB_EMPTY_ROOT(&rx->pending_oobq)) {
+ skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode);
+ rb_erase(&skb->rbnode, &rx->pending_oobq);
+ rxrpc_kernel_free_oob(skb);
+ }
+}
+
+/*
* RxRPC socket destructor
*/
static void rxrpc_sock_destructor(struct sock *sk)
{
_enter("%p", sk);
+ rxrpc_purge_oob_queue(sk);
rxrpc_purge_queue(&sk->sk_receive_queue);
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
@@ -936,7 +949,9 @@ static int rxrpc_release_sock(struct sock *sk)
break;
}
+ spin_lock_irq(&rx->recvmsg_lock);
sk->sk_state = RXRPC_CLOSE;
+ spin_unlock_irq(&rx->recvmsg_lock);
if (rx->local && rx->local->service == rx) {
write_lock(&rx->local->services_lock);
@@ -948,6 +963,7 @@ static int rxrpc_release_sock(struct sock *sk)
rxrpc_discard_prealloc(rx);
rxrpc_release_calls_on_socket(rx);
flush_workqueue(rxrpc_workqueue);
+ rxrpc_purge_oob_queue(sk);
rxrpc_purge_queue(&sk->sk_receive_queue);
rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 3cc3af15086f..5bd3922c310d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -31,6 +31,7 @@ struct key_preparsed_payload;
struct rxrpc_connection;
struct rxrpc_txbuf;
struct rxrpc_txqueue;
+struct rxgk_context;
/*
* Mark applied to socket buffers in skb->mark. skb->priority is used
@@ -39,6 +40,7 @@ struct rxrpc_txqueue;
enum rxrpc_skb_mark {
RXRPC_SKB_MARK_PACKET, /* Received packet */
RXRPC_SKB_MARK_ERROR, /* Error notification */
+ RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */
RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */
RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */
RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */
@@ -146,10 +148,12 @@ struct rxrpc_backlog {
struct rxrpc_sock {
/* WARNING: sk has to be the first member */
struct sock sk;
- rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */
- rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */
+ const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */
struct rxrpc_local *local; /* local endpoint */
struct rxrpc_backlog *backlog; /* Preallocation for services */
+ struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */
+ struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */
+ u64 oob_id_counter; /* OOB message ID counter */
spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */
struct list_head sock_calls; /* List of calls owned by this socket */
struct list_head to_be_accepted; /* calls awaiting acceptance */
@@ -160,6 +164,7 @@ struct rxrpc_sock {
struct rb_root calls; /* User ID -> call mapping */
unsigned long flags;
#define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */
+#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */
rwlock_t call_lock; /* lock for calls */
u32 min_sec_level; /* minimum security level */
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
@@ -203,7 +208,7 @@ struct rxrpc_host_header {
*/
struct rxrpc_skb_priv {
union {
- struct rxrpc_connection *conn; /* Connection referred to (poke packet) */
+ struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */
struct {
u16 offset; /* Offset of data */
u16 len; /* Length of data */
@@ -217,6 +222,19 @@ struct rxrpc_skb_priv {
u16 nr_acks; /* Number of acks+nacks */
u8 reason; /* Reason for ack */
} ack;
+ struct {
+ struct rxrpc_connection *conn; /* Connection referred to */
+ union {
+ u32 rxkad_nonce;
+ };
+ } chall;
+ struct {
+ rxrpc_serial_t challenge_serial;
+ u32 kvno;
+ u32 version;
+ u16 len;
+ u16 ticket_len;
+ } resp;
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
};
@@ -270,9 +288,24 @@ struct rxrpc_security {
/* issue a challenge */
int (*issue_challenge)(struct rxrpc_connection *);
+ /* Validate a challenge packet */
+ bool (*validate_challenge)(struct rxrpc_connection *conn,
+ struct sk_buff *skb);
+
+ /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace.
+ * The security class gets to add additional information.
+ */
+ int (*challenge_to_recvmsg)(struct rxrpc_connection *conn,
+ struct sk_buff *challenge,
+ struct msghdr *msg);
+
+ /* Parse sendmsg() control message and respond to challenge. */
+ int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge,
+ struct msghdr *msg);
+
/* respond to a challenge */
- int (*respond_to_challenge)(struct rxrpc_connection *,
- struct sk_buff *);
+ int (*respond_to_challenge)(struct rxrpc_connection *conn,
+ struct sk_buff *challenge);
/* verify a response */
int (*verify_response)(struct rxrpc_connection *,
@@ -280,6 +313,11 @@ struct rxrpc_security {
/* clear connection security */
void (*clear)(struct rxrpc_connection *);
+
+ /* Default ticket -> key decoder */
+ int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int ticket_offset, unsigned int ticket_len,
+ struct key **_key);
};
/*
@@ -526,7 +564,17 @@ struct rxrpc_connection {
struct rxrpc_crypt csum_iv; /* packet checksum base */
u32 nonce; /* response re-use preventer */
} rxkad;
+ struct {
+ struct rxgk_context *keys[4]; /* (Re-)keying buffer */
+ u64 start_time; /* The start time for TK derivation */
+ u8 nonce[20]; /* Response re-use preventer */
+ u32 enctype; /* Kerberos 5 encoding type */
+ u32 key_number; /* Current key number */
+ } rxgk;
};
+ rwlock_t security_use_lock; /* Security use/modification lock */
+ struct sk_buff *tx_response; /* Response packet to be transmitted */
+
unsigned long flags;
unsigned long events;
unsigned long idle_timestamp; /* Time at which last became idle */
@@ -692,6 +740,7 @@ struct rxrpc_call {
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
u32 security_level; /* Security level selected */
+ u32 security_enctype; /* Security-specific encoding type (or 0) */
int debug_id; /* debug ID for printks */
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
@@ -867,6 +916,8 @@ struct rxrpc_txbuf {
unsigned short len; /* Amount of data in buffer */
unsigned short space; /* Remaining data space */
unsigned short offset; /* Offset of fill point */
+ unsigned short crypto_header; /* Size of crypto header */
+ unsigned short sec_header; /* Size of security header */
unsigned short pkt_len; /* Size of packet content */
unsigned short alloc_size; /* Amount of bufferage allocated */
unsigned int flags;
@@ -1001,7 +1052,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
struct rxrpc_conn_parameters *,
struct rxrpc_call_params *, gfp_t,
- unsigned int);
+ unsigned int)
+ __releases(&rx->sk.sk_lock)
+ __acquires(&call->user_mutex);
void rxrpc_start_call_timer(struct rxrpc_call *call);
void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
struct sk_buff *);
@@ -1198,8 +1251,11 @@ void rxrpc_error_report(struct sock *);
bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why,
s32 abort_code, int err);
int rxrpc_io_thread(void *data);
+void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb);
static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local)
{
+ if (!local->io_thread)
+ return;
wake_up_process(READ_ONCE(local->io_thread));
}
@@ -1289,8 +1345,16 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
}
/*
+ * oob.c
+ */
+void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb);
+int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len);
+
+/*
* output.c
*/
+ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len);
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
@@ -1299,6 +1363,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
void rxrpc_send_keepalive(struct rxrpc_peer *);
+void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb);
/*
* peer_event.c
@@ -1363,6 +1428,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans);
void rxrpc_call_init_rtt(struct rxrpc_call *call);
/*
+ * rxgk.c
+ */
+extern const struct rxrpc_security rxgk_yfs;
+
+/*
* rxkad.c
*/
#ifdef CONFIG_RXKAD
@@ -1433,7 +1503,6 @@ static inline void rxrpc_sysctl_exit(void) {}
extern atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp);
-void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index e685034ce4f7..a4b363b47cca 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call,
static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
struct rxrpc_backlog *b,
rxrpc_notify_rx_t notify_rx,
- rxrpc_user_attach_call_t user_attach_call,
unsigned long user_call_ID, gfp_t gfp,
unsigned int debug_id)
{
@@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
call->user_call_ID = user_call_ID;
call->notify_rx = notify_rx;
- if (user_attach_call) {
+ if (rx->app_ops &&
+ rx->app_ops->user_attach_call) {
rxrpc_get_call(call, rxrpc_call_get_kernel_service);
- user_attach_call(call, user_call_ID);
+ rx->app_ops->user_attach_call(call, user_call_ID);
}
rxrpc_get_call(call, rxrpc_call_get_userid);
@@ -219,9 +219,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
while (CIRC_CNT(head, tail, size) > 0) {
struct rxrpc_call *call = b->call_backlog[tail];
rcu_assign_pointer(call->socket, rx);
- if (rx->discard_new_call) {
+ if (rx->app_ops &&
+ rx->app_ops->discard_new_call) {
_debug("discard %lx", call->user_call_ID);
- rx->discard_new_call(call, call->user_call_ID);
+ rx->app_ops->discard_new_call(call, call->user_call_ID);
if (call->notify_rx)
call->notify_rx = rxrpc_dummy_notify;
rxrpc_put_call(call, rxrpc_call_put_kernel);
@@ -387,8 +388,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
rxrpc_incoming_call(rx, call, skb);
conn = call->conn;
- if (rx->notify_new_call)
- rx->notify_new_call(&rx->sk, call, call->user_call_ID);
+ if (rx->app_ops &&
+ rx->app_ops->notify_new_call)
+ rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID);
spin_lock(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) {
@@ -440,8 +442,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID)
if (rx->sk.sk_state == RXRPC_CLOSE)
return -ESHUTDOWN;
- return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID,
- GFP_KERNEL,
+ return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL,
atomic_inc_return(&rxrpc_debug_id));
}
@@ -449,20 +450,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID)
* rxrpc_kernel_charge_accept - Charge up socket with preallocated calls
* @sock: The socket on which to preallocate
* @notify_rx: Event notification function for the call
- * @user_attach_call: Func to attach call to user_call_ID
* @user_call_ID: The tag to attach to the preallocated call
* @gfp: The allocation conditions.
* @debug_id: The tracing debug ID.
*
- * Charge up the socket with preallocated calls, each with a user ID. A
- * function should be provided to effect the attachment from the user's side.
- * The user is given a ref to hold on the call.
+ * Charge up the socket with preallocated calls, each with a user ID. The
+ * ->user_attach_call() callback function should be provided to effect the
+ * attachment from the user's side. The user is given a ref to hold on the
+ * call.
*
* Note that the call may become connected before this function returns.
*/
-int rxrpc_kernel_charge_accept(struct socket *sock,
- rxrpc_notify_rx_t notify_rx,
- rxrpc_user_attach_call_t user_attach_call,
+int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
unsigned long user_call_ID, gfp_t gfp,
unsigned int debug_id)
{
@@ -472,8 +471,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock,
if (sock->sk->sk_state == RXRPC_CLOSE)
return -ESHUTDOWN;
- return rxrpc_service_prealloc_one(rx, b, notify_rx,
- user_attach_call, user_call_ID,
+ return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID,
gfp, debug_id);
}
EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index fce58be65e7c..e9e8f0ef3fd5 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -145,8 +145,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
INIT_LIST_HEAD(&call->recvmsg_link);
INIT_LIST_HEAD(&call->sock_link);
INIT_LIST_HEAD(&call->attend_link);
- skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->recvmsg_queue);
+ skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->rx_oos_queue);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->notify_lock);
@@ -322,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
struct rxrpc_call_params *p,
gfp_t gfp,
unsigned int debug_id)
- __releases(&rx->sk.sk_lock.slock)
+ __releases(&rx->sk.sk_lock)
__acquires(&call->user_mutex)
{
struct rxrpc_call *call, *xcall;
@@ -760,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
atomic_dec(&rxnet->nr_calls);
wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls));
}
+
+/**
+ * rxrpc_kernel_query_call_security - Query call's security parameters
+ * @call: The call to query
+ * @_service_id: Where to return the service ID
+ * @_enctype: Where to return the "encoding type"
+ *
+ * This queries the security parameters of a call, setting *@_service_id and
+ * *@_enctype and returning the security class.
+ *
+ * Return: The security class protocol number.
+ */
+u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call,
+ u16 *_service_id, u32 *_enctype)
+{
+ *_service_id = call->dest_srx.srx_service;
+ *_enctype = call->security_enctype;
+ return call->security_ix;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_call_security);
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 4d9c5e21ba78..232b6986da83 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -19,7 +19,7 @@
/*
* Set the completion state on an aborted connection.
*/
-static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb,
+static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn,
s32 abort_code, int err,
enum rxrpc_call_completion compl)
{
@@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb,
s32 abort_code, int err, enum rxrpc_abort_reason why)
{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- if (rxrpc_set_conn_aborted(conn, skb, abort_code, err,
+ u32 cid = conn->proto.cid, call = 0, seq = 0;
+
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ cid = sp->hdr.cid;
+ call = sp->hdr.callNumber;
+ seq = sp->hdr.seq;
+ }
+
+ if (rxrpc_set_conn_aborted(conn, abort_code, err,
RXRPC_CALL_LOCALLY_ABORTED)) {
- trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber,
- sp->hdr.seq, abort_code, err);
+ trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err);
rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort);
}
return -EPROTO;
@@ -67,7 +75,7 @@ static void rxrpc_input_conn_abort(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
trace_rxrpc_rx_conn_abort(conn, skb);
- rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED,
+ rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED,
RXRPC_CALL_REMOTELY_ABORTED);
}
@@ -248,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_CHALLENGE:
- return conn->security->respond_to_challenge(conn, skb);
+ ret = conn->security->respond_to_challenge(conn, skb);
+ sp->chall.conn = NULL;
+ rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input);
+ return ret;
case RXRPC_PACKET_TYPE_RESPONSE:
ret = conn->security->verify_response(conn, skb);
@@ -270,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
* we've already received the packet, put it on the
* front of the queue.
*/
- sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured);
+ sp->poke_conn = rxrpc_get_connection(
+ conn, rxrpc_conn_get_poke_secured);
skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED;
rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured);
skb_queue_head(&conn->local->rx_queue, skb);
@@ -392,6 +404,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
}
/*
+ * Post a CHALLENGE packet to the socket of one of a connection's calls so that
+ * it can get application data to include in the packet, possibly querying
+ * userspace.
+ */
+static bool rxrpc_post_challenge(struct rxrpc_connection *conn,
+				 struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_call *target = NULL;
+	bool have_socket = false;
+
+	/* The challenge pins the connection for as long as it is in flight. */
+	sp->chall.conn =
+		rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input);
+
+	/* Security classes that can't hand the challenge to recvmsg() get it
+	 * processed by the connection itself.
+	 */
+	if (!conn->security->challenge_to_recvmsg)
+		goto post_to_conn;
+
+	rcu_read_lock();
+
+	/* Look for a call whose socket wants to manage responses itself,
+	 * noting along the way whether any call still has a socket at all.
+	 */
+	for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) {
+		struct rxrpc_call *candidate;
+		struct rxrpc_sock *rx;
+
+		if (!conn->channels[i].call)
+			continue;
+
+		candidate = conn->channels[i].call;
+		rx = rcu_dereference(candidate->socket);
+		if (!rx)
+			continue;
+
+		have_socket = true;
+		if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) {
+			target = candidate;
+			break;
+		}
+	}
+
+	if (!have_socket) {
+		/* Nobody to answer on behalf of: drop the pin and decline. */
+		rcu_read_unlock();
+		rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input);
+		sp->chall.conn = NULL;
+		return false;
+	}
+
+	if (target) {
+		rxrpc_notify_socket_oob(target, skb);
+		rcu_read_unlock();
+		return true;
+	}
+
+	rcu_read_unlock();
+
+post_to_conn:
+	rxrpc_post_packet_to_conn(conn, skb);
+	return true;
+}
+
+/*
* Input a connection-level packet.
*/
bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
@@ -411,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb)
return true;
case RXRPC_PACKET_TYPE_CHALLENGE:
+ rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge);
+ if (rxrpc_is_conn_aborted(conn)) {
+ if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED)
+ rxrpc_send_conn_abort(conn);
+ return true;
+ }
+ if (!conn->security->validate_challenge(conn, skb))
+ return false;
+ return rxrpc_post_challenge(conn, skb);
+
case RXRPC_PACKET_TYPE_RESPONSE:
if (rxrpc_is_conn_aborted(conn)) {
if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED)
@@ -436,6 +513,19 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb)
if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events))
rxrpc_abort_calls(conn);
+ if (conn->tx_response) {
+ struct sk_buff *skb;
+
+ spin_lock_irq(&conn->local->lock);
+ skb = conn->tx_response;
+ conn->tx_response = NULL;
+ spin_unlock_irq(&conn->local->lock);
+
+ if (conn->state != RXRPC_CONN_ABORTED)
+ rxrpc_send_response(conn, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_response);
+ }
+
if (skb) {
switch (skb->mark) {
case RXRPC_SKB_MARK_SERVICE_CONN_SECURED:
@@ -452,3 +542,31 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb)
if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK)
rxrpc_process_delayed_final_acks(conn, false);
}
+
+/*
+ * Post a RESPONSE message to the I/O thread for transmission.
+ *
+ * At most one response is held pending on a connection (conn->tx_response).
+ * If one is already queued, the response to the most recent challenge (as
+ * judged by the challenge serial number) is kept and the other is discarded.
+ * The connection is then poked so that the I/O thread picks the response up.
+ *
+ * The caller's reference on @skb is consumed: it is either handed to the
+ * connection or dropped here if an equally recent or newer response is
+ * already pending.
+ */
+void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_local *local = conn->local;
+	struct sk_buff *old;
+
+	_enter("%x", sp->resp.challenge_serial);
+
+	spin_lock_irq(&local->lock);
+	old = conn->tx_response;
+	if (old) {
+		/* Compare against the response that is already queued. */
+		struct rxrpc_skb_priv *osp = rxrpc_skb(old);
+
+		/* Always go with the response to the most recent challenge. */
+		if (after(sp->resp.challenge_serial, osp->resp.challenge_serial))
+			conn->tx_response = skb;
+		else
+			old = skb;
+	} else {
+		conn->tx_response = skb;
+	}
+	spin_unlock_irq(&local->lock);
+
+	/* "old" now names whichever response lost out, if any; release it
+	 * outside of the lock so that it isn't leaked.
+	 */
+	if (old)
+		rxrpc_free_skb(old, rxrpc_skb_put_response);
+	rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response);
+}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 8ac22dde8b39..37340becb224 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -73,6 +73,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet,
skb_queue_head_init(&conn->rx_queue);
conn->rxnet = rxnet;
conn->security = &rxrpc_no_security;
+ rwlock_init(&conn->security_use_lock);
spin_lock_init(&conn->state_lock);
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->idle_timestamp = jiffies;
@@ -329,6 +330,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
}
rxrpc_purge_queue(&conn->rx_queue);
+ rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response);
rxrpc_kill_client_conn(conn);
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index e068f9b79d02..1f7c136d6d0e 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -42,13 +42,19 @@ static void none_free_call_crypto(struct rxrpc_call *call)
{
}
-static int none_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+static bool none_validate_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
rxrpc_eproto_rxnull_challenge);
}
+static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ return -EINVAL; /* Unconditionally refuse; neither argument is examined */
+}
+
static int none_verify_response(struct rxrpc_connection *conn,
struct sk_buff *skb)
{
@@ -82,7 +88,8 @@ const struct rxrpc_security rxrpc_no_security = {
.alloc_txbuf = none_alloc_txbuf,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
- .respond_to_challenge = none_respond_to_challenge,
+ .validate_challenge = none_validate_challenge,
+ .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge,
.verify_response = none_verify_response,
.clear = none_clear,
};
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 64f8d77b8731..27b650d30f4d 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -489,8 +489,8 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_error_report);
break;
case RXRPC_SKB_MARK_SERVICE_CONN_SECURED:
- rxrpc_input_conn_event(sp->conn, skb);
- rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke);
+ rxrpc_input_conn_event(sp->poke_conn, skb);
+ rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke);
rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured);
break;
default:
@@ -501,9 +501,11 @@ int rxrpc_io_thread(void *data)
}
/* Deal with connections that want immediate attention. */
- spin_lock_irq(&local->lock);
- list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
- spin_unlock_irq(&local->lock);
+ if (!list_empty_careful(&local->conn_attend_q)) {
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
+ spin_unlock_irq(&local->lock);
+ }
while ((conn = list_first_entry_or_null(&conn_attend_q,
struct rxrpc_connection,
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 33e8302a79e3..9fdc1f031c9d 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
return 0;
}
+/*
+ * Decode a 64-bit value stored as two consecutive big-endian 32-bit words,
+ * most significant word first.
+ */
+static u64 xdr_dec64(const __be32 *xdr)
+{
+	u64 upper = ntohl(xdr[0]);
+	u64 lower = ntohl(xdr[1]);
+
+	return (upper << 32) | lower;
+}
+
+/*
+ * Convert a signed count of 100ns intervals into whole seconds, dividing the
+ * magnitude and then restoring the sign so that the result truncates towards
+ * zero.
+ */
+static time64_t rxrpc_s64_to_time64(s64 time_in_100ns)
+{
+	const bool negative = time_in_100ns < 0;
+	u64 magnitude;
+
+	if (negative)
+		magnitude = -time_in_100ns;
+	else
+		magnitude = time_in_100ns;
+
+	do_div(magnitude, 10000000);
+
+	if (negative)
+		return -magnitude;
+	return magnitude;
+}
+
+/*
+ * Parse a YFS-RxGK type XDR format token
+ * - the caller guarantees we have at least 4 words
+ *
+ * struct token_rxgk {
+ *	opr_time begintime;
+ *	opr_time endtime;
+ *	afs_int64 level;
+ *	afs_int64 lifetime;
+ *	afs_int64 bytelife;
+ *	afs_int64 enctype;
+ *	opaque key<>;
+ *	opaque ticket<>;
+ * };
+ *
+ * The key and ticket lengths are taken from the token itself, so each raw
+ * length is checked against the space remaining in @toklen *before* it is
+ * rounded up to a whole number of XDR words: rounding a value within three of
+ * the top of the u32 range would otherwise wrap to 0 and slip past the bounds
+ * checks whilst the later copies still used the raw length.
+ *
+ * On success the new token is appended to the list at prep->payload.data[0],
+ * the token count in prep->payload.data[1] is incremented and 0 is returned.
+ * Returns -EKEYREJECTED if the token is malformed, -EKEYEXPIRED if the end
+ * time converts to a negative expiry and -ENOMEM on allocation failure.
+ */
+static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep,
+				       size_t datalen,
+				       const __be32 *xdr, unsigned int toklen)
+{
+	/* Six 64-bit fields plus the two opaque length words, in bytes. */
+	const unsigned int fixed_len = (6 * 2 + 2) * 4;
+	struct rxrpc_key_token *token, **pptoken;
+	time64_t expiry;
+	size_t plen;
+	const __be32 *ticket, *key;
+	s64 tmp;
+	u32 tktlen, keylen;
+
+	_enter(",{%x,%x,%x,%x},%x",
+	       ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+	       toklen);
+
+	/* Guarantees toklen >= fixed_len for the subtractions below. */
+	if (6 * 2 + 2 > toklen / 4)
+		goto reject;
+
+	key = xdr + (6 * 2 + 1);
+	keylen = ntohl(key[-1]);
+	_debug("keylen: %x", keylen);
+	if (keylen > toklen - fixed_len)
+		goto reject;
+	keylen = round_up(keylen, 4);
+	if (keylen > toklen - fixed_len)
+		goto reject;
+
+	ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1);
+	tktlen = ntohl(ticket[-1]);
+	_debug("tktlen: %x", tktlen);
+	if (tktlen > toklen - fixed_len - keylen)
+		goto reject;
+	tktlen = round_up(tktlen, 4);
+	if (tktlen != toklen - fixed_len - keylen) {
+		kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]",
+		       fixed_len + keylen + tktlen, toklen,
+		       keylen, tktlen);
+		goto reject;
+	}
+
+	plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen;
+	prep->quotalen = datalen + plen;
+
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
+	if (!token)
+		goto nomem;
+
+	token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL);
+	if (!token->rxgk)
+		goto nomem_token;
+
+	token->security_index = RXRPC_SECURITY_YFS_RXGK;
+	token->rxgk->begintime = xdr_dec64(xdr + 0 * 2);
+	token->rxgk->endtime = xdr_dec64(xdr + 1 * 2);
+	token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2);
+	if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT)
+		goto reject_token;
+	token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2);
+	token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2);
+	token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2);
+	if (tmp < 0 || tmp > UINT_MAX)
+		goto reject_token;
+	token->rxgk->key.len = ntohl(key[-1]);
+	token->rxgk->key.data = token->rxgk->_key;
+	token->rxgk->ticket.len = ntohl(ticket[-1]);
+
+	if (token->rxgk->endtime != 0) {
+		expiry = rxrpc_s64_to_time64(token->rxgk->endtime);
+		if (expiry < 0)
+			goto expired;
+		if (expiry < prep->expiry)
+			prep->expiry = expiry;
+	}
+
+	memcpy(token->rxgk->key.data, key, token->rxgk->key.len);
+
+	/* Pad the ticket so that we can use it directly in XDR */
+	token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4),
+					   GFP_KERNEL);
+	if (!token->rxgk->ticket.data)
+		goto nomem_yrxgk;
+	memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len);
+
+	_debug("SCIX: %u",	token->security_index);
+	_debug("EXPY: %llx",	token->rxgk->endtime);
+	_debug("LIFE: %llx",	token->rxgk->lifetime);
+	_debug("BYTE: %llx",	token->rxgk->bytelife);
+	_debug("ENC : %u",	token->rxgk->enctype);
+	_debug("LEVL: %u",	token->rxgk->level);
+	_debug("KLEN: %u",	token->rxgk->key.len);
+	_debug("TLEN: %u",	token->rxgk->ticket.len);
+	_debug("KEY0: %*phN",	token->rxgk->key.len, token->rxgk->key.data);
+	_debug("TICK: %*phN",
+	       min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data);
+
+	/* count the number of tokens attached */
+	prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1);
+
+	/* attach the data */
+	for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0];
+	     *pptoken;
+	     pptoken = &(*pptoken)->next)
+		continue;
+	*pptoken = token;
+
+	_leave(" = 0");
+	return 0;
+
+nomem_yrxgk:
+	kfree(token->rxgk);
+nomem_token:
+	kfree(token);
+nomem:
+	return -ENOMEM;
+reject_token:
+	/* token->rxgk has been allocated by the time we can get here. */
+	kfree(token->rxgk);
+	kfree(token);
+reject:
+	return -EKEYREJECTED;
+expired:
+	kfree(token->rxgk);
+	kfree(token);
+	return -EKEYEXPIRED;
+}
+
/*
* attempt to parse the data as the XDR format
* - the caller guarantees we have more than 7 words
@@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
case RXRPC_SECURITY_RXKAD:
ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen);
+ break;
default:
ret2 = -EPROTONOSUPPORT;
break;
@@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token)
case RXRPC_SECURITY_RXKAD:
kfree(token->kad);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ kfree(token->rxgk->ticket.data);
+ kfree(token->rxgk);
+ break;
default:
pr_err("Unknown token type %x on rxrpc key\n",
token->security_index);
@@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m)
case RXRPC_SECURITY_RXKAD:
seq_puts(m, "ka");
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ seq_puts(m, "ygk");
+ break;
default: /* we have a ticket we can't encode */
seq_printf(m, "%u", token->security_index);
break;
@@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key);
*
* Generate a null RxRPC key that can be used to indicate anonymous security is
* required for a particular domain.
+ *
+ * Return: The new key or a negative error code.
*/
struct key *rxrpc_get_null_key(const char *keyname)
{
@@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key,
toksize += RND(token->kad->ticket_len);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ toksize += 6 * 8 + 2 * 4;
+ if (!token->no_leak_key)
+ toksize += RND(token->rxgk->key.len);
+ toksize += RND(token->rxgk->ticket.len);
+ break;
+
default: /* we have a ticket we can't encode */
pr_err("Unsupported key token type (%u)\n",
token->security_index);
@@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key,
ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
break;
+ case RXRPC_SECURITY_YFS_RXGK:
+ ENCODE64(token->rxgk->begintime);
+ ENCODE64(token->rxgk->endtime);
+ ENCODE64(token->rxgk->level);
+ ENCODE64(token->rxgk->lifetime);
+ ENCODE64(token->rxgk->bytelife);
+ ENCODE64(token->rxgk->enctype);
+ if (token->no_leak_key)
+ ENCODE(0);
+ else
+ ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data);
+ ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data);
+ break;
+
default:
pr_err("Unsupported key token type (%u)\n",
token->security_index);
diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c
new file mode 100644
index 000000000000..05ca9c1faa57
--- /dev/null
+++ b/net/rxrpc/oob.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Out of band message handling (e.g. challenge-response)
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/sched/signal.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+enum rxrpc_oob_command {
+ RXRPC_OOB_CMD_UNSET, /* No command control message has been seen */
+ RXRPC_OOB_CMD_RESPOND, /* Selected by an RXRPC_RESPOND control message */
+} __mode(byte);
+
+struct rxrpc_oob_params {
+ u64 oob_id; /* ID number of message if reply */
+ s32 abort_code; /* Non-zero code from an RXRPC_ABORT cmsg, else 0 */
+ enum rxrpc_oob_command command; /* Command selected by the cmsgs */
+ bool have_oob_id:1; /* Set once an RXRPC_OOB_ID cmsg has filled in oob_id */
+};
+
+/*
+ * Queue an out-of-band message on the socket behind a reference call and let
+ * whoever owns that socket know about it.  If the socket is not yet closed,
+ * the message is stamped with the socket's next OOB ID, an extra ref is taken
+ * on it and it is appended to the socket's OOB queue; a kernel service is
+ * then told through its notify_oob() op.  A socket without app_ops is woken
+ * through sk_data_ready() instead, provided it isn't dead.
+ */
+void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_sock *rx;
+	struct sock *sk;
+
+	rcu_read_lock();
+
+	rx = rcu_dereference(call->socket);
+	if (!rx)
+		goto out;
+
+	sk = &rx->sk;
+
+	spin_lock_irq(&rx->recvmsg_lock);
+	if (sk->sk_state < RXRPC_CLOSE) {
+		skb->skb_mstamp_ns = rx->oob_id_counter++;
+		rxrpc_get_skb(skb, rxrpc_skb_get_post_oob);
+		skb_queue_tail(&rx->recvmsg_oobq, skb);
+
+		trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial);
+		if (rx->app_ops)
+			rx->app_ops->notify_oob(sk, skb);
+	}
+	spin_unlock_irq(&rx->recvmsg_lock);
+
+	if (rx->app_ops || sock_flag(sk, SOCK_DEAD))
+		goto out;
+	sk->sk_data_ready(sk);
+
+out:
+	rcu_read_unlock();
+}
+
+/*
+ * Look up a message awaiting a response in the socket's pending-OOB tree,
+ * which is keyed on the ID stored in skb_mstamp_ns.  Returns NULL if no
+ * message carries the requested ID.
+ */
+static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id)
+{
+	struct rb_node *node = rx->pending_oobq.rb_node;
+
+	while (node) {
+		struct sk_buff *candidate =
+			rb_entry(node, struct sk_buff, rbnode);
+
+		if (oob_id == candidate->skb_mstamp_ns)
+			return candidate;
+
+		if (oob_id < candidate->skb_mstamp_ns)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert an OOB message into the set of messages awaiting a response.  IDs
+ * are handed out from a 64-bit counter, so the new message is assumed to sort
+ * after everything already present (and the counter is assumed not to wrap):
+ * we simply walk down the right-hand edge of the tree and link it in there.
+ */
+void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb)
+{
+	struct rb_node *parent = NULL;
+	struct rb_node **link = &rx->pending_oobq.rb_node;
+
+	for (; *link; link = &parent->rb_right)
+		parent = *link;
+
+	rb_link_node(&skb->rbnode, parent, link);
+	rb_insert_color(&skb->rbnode, &rx->pending_oobq);
+}
+
+/*
+ * Extract control messages from the sendmsg() control buffer.
+ *
+ * Every cmsg in @msg is walked; those not at the SOL_RXRPC level are skipped
+ * and the rest are recorded in @p.  RXRPC_OOB_ID and RXRPC_ABORT carry a
+ * payload whose size must exactly match the corresponding field in @p and
+ * each may be supplied only once (a repeated RXRPC_ABORT is detected by the
+ * abort code already being non-zero, which is also why a zero abort code is
+ * refused).  RXRPC_RESP_RXGK_APPDATA is only accepted if RXRPC_RESPOND
+ * appeared earlier in the buffer; its payload is not examined here.
+ *
+ * Returns 0 if a RESPOND command with an OOB ID was specified, -EBADSLT if
+ * RESPOND was given without an OOB ID and -EINVAL if the control buffer is
+ * empty, malformed, contains an unrecognised SOL_RXRPC message or selects no
+ * command.
+ */
+static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p)
+{
+ struct cmsghdr *cmsg;
+ int len;
+
+ if (msg->msg_controllen == 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ len = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Payload size */
+ _debug("CMSG %d, %d, %d",
+ cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+ if (cmsg->cmsg_level != SOL_RXRPC)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RXRPC_OOB_ID:
+ if (len != sizeof(p->oob_id) || p->have_oob_id)
+ return -EINVAL;
+ memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id));
+ p->have_oob_id = true;
+ break;
+ case RXRPC_RESPOND:
+ if (p->command != RXRPC_OOB_CMD_UNSET)
+ return -EINVAL;
+ p->command = RXRPC_OOB_CMD_RESPOND;
+ break;
+ case RXRPC_ABORT:
+ if (len != sizeof(p->abort_code) || p->abort_code)
+ return -EINVAL;
+ memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code));
+ if (p->abort_code == 0)
+ return -EINVAL;
+ break;
+ case RXRPC_RESP_RXGK_APPDATA:
+ if (p->command != RXRPC_OOB_CMD_RESPOND)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (p->command) { /* Check the command has everything it needs */
+ case RXRPC_OOB_CMD_RESPOND:
+ if (!p->have_oob_id)
+ return -EBADSLT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Let userspace answer a previously received OOB message via sendmsg().  The
+ * message named by the OOB ID is removed from the pending set and the socket
+ * lock is released before the response is generated.  Returns -EBADSLT if no
+ * pending message has that ID.
+ */
+static int rxrpc_respond_to_oob(struct rxrpc_sock *rx,
+				struct rxrpc_oob_params *p,
+				struct msghdr *msg)
+{
+	struct rxrpc_connection *conn;
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *skb;
+	int ret;
+
+	skb = rxrpc_find_pending_oob(rx, p->oob_id);
+	if (skb)
+		rb_erase(&skb->rbnode, &rx->pending_oobq);
+	release_sock(&rx->sk);
+	if (!skb)
+		return -EBADSLT;
+
+	sp = rxrpc_skb(skb);
+
+	if (p->command != RXRPC_OOB_CMD_RESPOND) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Only a CHALLENGE can be responded to. */
+	if (skb->mark != RXRPC_OOB_CHALLENGE) {
+		ret = -EPROTO;
+		goto out;
+	}
+
+	conn = sp->chall.conn;
+	if (!conn->security->sendmsg_respond_to_challenge) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (p->abort_code) {
+		/* The application chose to abort rather than respond. */
+		rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED,
+				 rxrpc_abort_response_sendmsg);
+		ret = 0;
+	} else {
+		ret = conn->security->sendmsg_respond_to_challenge(skb, msg);
+	}
+
+out:
+	rxrpc_free_skb(skb, rxrpc_skb_put_oob);
+	return ret;
+}
+
+/*
+ * Send an out-of-band message or respond to a received out-of-band message.
+ * - caller gives us the socket lock, which is released on every path
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+{
+	struct rxrpc_oob_params p = {};
+	int ret;
+
+	_enter("");
+
+	ret = rxrpc_sendmsg_oob_cmsg(msg, &p);
+	if (ret < 0) {
+		release_sock(&rx->sk);
+		return ret;
+	}
+
+	/* A reply to a received message; this drops the socket lock itself. */
+	if (p.have_oob_id)
+		return rxrpc_respond_to_oob(rx, &p, msg);
+
+	release_sock(&rx->sk);
+
+	/* No command is currently accepted without an OOB ID. */
+	ret = -EINVAL;
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/**
+ * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message
+ * @oob: The message to query
+ * @_peer: Where to return the peer record
+ * @_peer_appdata: The application data attached to a peer record
+ *
+ * Extract useful parameters from an out-of-band message.  The source peer
+ * parameters are returned through the argument list and the message type is
+ * returned.  For an unrecognised type, a warning is issued once and the
+ * outputs are cleared.
+ *
+ * Return:
+ * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response.
+ */
+enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob,
+					   struct rxrpc_peer **_peer,
+					   unsigned long *_peer_appdata)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(oob);
+	enum rxrpc_oob_type type = oob->mark;
+
+	if (type == RXRPC_OOB_CHALLENGE) {
+		*_peer = sp->chall.conn->peer;
+		*_peer_appdata = sp->chall.conn->peer->app_data;
+		return type;
+	}
+
+	WARN_ON_ONCE(1);
+	*_peer = NULL;
+	*_peer_appdata = 0;
+	return type;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_oob);
+
+/**
+ * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message
+ * @sock: The socket to query
+ * @_type: Where to return the message type
+ *
+ * Take the message at the head of the socket's OOB queue, if there is one,
+ * and hand it back along with its type.  *@_type is left untouched if the
+ * queue was empty.
+ *
+ * Return: The sk_buff representing the OOB message or %NULL if the queue was
+ * empty.
+ */
+struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock,
+					 enum rxrpc_oob_type *_type)
+{
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	struct sk_buff *head = skb_dequeue(&rx->recvmsg_oobq);
+
+	if (!head)
+		return NULL;
+
+	*_type = head->mark;
+	return head;
+}
+EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob);
+
+/**
+ * rxrpc_kernel_free_oob - Free an out-of-band message
+ * @oob: The OOB message to free
+ *
+ * Release an OOB message.  A challenge also gives up the connection ref that
+ * it carries before the message itself is freed.
+ */
+void rxrpc_kernel_free_oob(struct sk_buff *oob)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(oob);
+
+	if (oob->mark == RXRPC_OOB_CHALLENGE)
+		rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob);
+
+	rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob);
+}
+EXPORT_SYMBOL(rxrpc_kernel_free_oob);
+
+/**
+ * rxrpc_kernel_query_challenge - Query the parameters of a challenge
+ * @challenge: The challenge to query
+ * @_peer: Where to return the peer record
+ * @_peer_appdata: The application data attached to a peer record
+ * @_service_id: Where to return the connection service ID
+ * @_security_index: Where to return the connection security index
+ *
+ * Extract useful parameters from a CHALLENGE message.  The peer details come
+ * from the connection attached to the challenge; the service ID and security
+ * index come from the packet header.
+ */
+void rxrpc_kernel_query_challenge(struct sk_buff *challenge,
+				  struct rxrpc_peer **_peer,
+				  unsigned long *_peer_appdata,
+				  u16 *_service_id, u8 *_security_index)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+
+	*_security_index = sp->hdr.securityIndex;
+	*_service_id = sp->hdr.serviceId;
+
+	*_peer = sp->chall.conn->peer;
+	*_peer_appdata = sp->chall.conn->peer->app_data;
+}
+EXPORT_SYMBOL(rxrpc_kernel_query_challenge);
+
+/**
+ * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge
+ * @challenge: The challenge to be rejected
+ * @abort_code: The abort code to stick into the ABORT packet
+ * @error: Local error value
+ * @why: Indication as to why.
+ *
+ * Reject a challenge on behalf of a kernel service by requesting an abort of
+ * the connection that the challenge arrived on.  @error is passed straight
+ * back so that the call can be used as the operand of a return statement.
+ *
+ * Return: The %error parameter.
+ */
+int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code,
+				  int error, enum rxrpc_abort_reason why)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(challenge);
+	struct rxrpc_connection *conn;
+
+	_enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why);
+
+	conn = sp->chall.conn;
+	rxrpc_abort_conn(conn, NULL, abort_code, error, why);
+	return error;
+}
+EXPORT_SYMBOL(rxrpc_kernel_reject_challenge);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 95905b85a8d7..0af19bcdc80a 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -18,7 +18,7 @@
extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
-static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
+ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
{
struct sockaddr *sa = msg->msg_name;
struct sock *sk = socket->sk;
@@ -916,3 +916,61 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
peer->last_tx_at = ktime_get_seconds();
_leave("");
}
+
+/*
+ * Send a RESPONSE message.
+ *
+ * The first sp->resp.len bytes of @response are mapped into a scatterlist and
+ * then into a bio_vec array so that they can be handed to the local endpoint's
+ * UDP socket with MSG_SPLICE_PAGES.  A fresh serial number is allocated from
+ * the connection and written into the serial field of the wire header in the
+ * skb before transmission.  The destination is the connection peer's
+ * transport address.
+ *
+ * Nothing is returned: a failure at any step is only reported through the
+ * rxrpc_tx_fail tracepoint and kleave().  On success the peer's last_tx_at
+ * timestamp is updated.  @response is not freed here.
+ */
+void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(response);
+ struct scatterlist sg[16];
+ struct bio_vec bvec[16];
+ struct msghdr msg;
+ size_t len = sp->resp.len;
+ __be32 wserial;
+ u32 serial = 0; /* Traced as 0 if we fail before allocating one */
+ int ret, nr_sg;
+
+ _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial);
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ ret = skb_to_sgvec(response, sg, 0, len);
+ if (ret < 0)
+ goto fail;
+ nr_sg = ret; /* Number of scatterlist entries filled in */
+
+ for (int i = 0; i < nr_sg; i++)
+ bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset);
+
+ iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len);
+
+ msg.msg_name = &conn->peer->srx.transport;
+ msg.msg_namelen = conn->peer->srx.transport_len;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_SPLICE_PAGES;
+
+ serial = rxrpc_get_next_serials(conn, 1);
+ wserial = htonl(serial);
+
+ trace_rxrpc_tx_response(conn, serial, sp);
+
+ ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial),
+ &wserial, sizeof(wserial)); /* Patch the serial into the packet */
+ if (ret < 0)
+ goto fail;
+
+ rxrpc_local_dont_fragment(conn->local, false);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+ if (ret < 0)
+ goto fail;
+
+ conn->peer->last_tx_at = ktime_get_seconds();
+ return;
+
+fail:
+ trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
+ rxrpc_tx_point_response);
+ kleave(" = %d", ret);
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 71b6e07bf161..e2f35e6c04d6 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -475,6 +475,8 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
* @call: The call to query
*
* Get a record for the remote peer in a call.
+ *
+ * Return: The call's peer record.
*/
struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call)
{
@@ -486,7 +488,9 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
* rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
* @peer: The peer to query
*
- * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples.
+ * Get the call's peer smoothed RTT.
+ *
+ * Return: The RTT in uS or %UINT_MAX if we have no samples.
*/
unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
{
@@ -499,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
* @peer: The peer to query
*
* Get a pointer to the address from a peer record. The caller is responsible
- * for making sure that the address is not deallocated.
+ * for making sure that the address is not deallocated. A fake address will be
+ * substituted if %peer is NULL.
+ *
+ * Return: The rxrpc address record or a fake record.
*/
const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer)
{
@@ -512,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx);
* @peer: The peer to query
*
* Get a pointer to the transport address from a peer record. The caller is
- * responsible for making sure that the address is not deallocated.
+ * responsible for making sure that the address is not deallocated. A fake
+ * address will be substituted if %peer is NULL.
+ *
+ * Return: The transport address record or a fake record.
*/
const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer)
{
@@ -527,7 +537,9 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_addr);
* @app_data: The data to set
*
* Set the app-specific data on a peer. AF_RXRPC makes no effort to retain
- * anything the data might refer to. The previous app_data is returned.
+ * anything the data might refer to.
+ *
+ * Return: The previous app_data.
*/
unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data)
{
@@ -540,6 +552,8 @@ EXPORT_SYMBOL(rxrpc_kernel_set_peer_data);
* @peer: The peer to query
*
* Retrieve the app-specific data from a peer.
+ *
+ * Return: The peer's app data.
*/
unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer)
{
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 42f70e4636f8..f8bfec12bc7e 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -181,4 +181,24 @@ struct rxkad_response {
__be32 ticket_len; /* Kerberos ticket length */
} __packed;
+/*
+ * GSSAPI security type-4 and type-6 data header.
+ *
+ * Every field is a 32-bit big-endian value and the structure is packed, so it
+ * has no padding between or after the fields.
+ */
+struct rxgk_header {
+ __be32 epoch;
+ __be32 cid;
+ __be32 call_number;
+ __be32 seq;
+ __be32 sec_index;
+ __be32 data_len;
+} __packed;
+
+/*
+ * GSSAPI security type-4 and type-6 response packet header.
+ *
+ * Both fields are big-endian (64-bit start_time, 32-bit token_len) and the
+ * structure is packed, so token_len follows start_time without padding.
+ */
+struct rxgk_response {
+ __be64 start_time;
+ __be32 token_len;
+} __packed;
+
#endif /* _LINUX_RXRPC_PACKET_H */
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 32cd5f1d541d..86a27fb55a1c 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -155,6 +155,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb)
}
/*
+ * Transcribe a call's user ID to a control message.
+ */
+static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg,
+				 int flags)
+{
+	unsigned long idl;
+	unsigned int id32;
+
+	/* Nothing to report for a call that was never given a user ID. */
+	if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags))
+		return 0;
+
+	/* A compat caller gets the ID as an unsigned int. */
+	if (flags & MSG_CMSG_COMPAT) {
+		id32 = call->user_call_ID;
+		return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+				sizeof(unsigned int), &id32);
+	}
+
+	idl = call->user_call_ID;
+	return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+			sizeof(unsigned long), &idl);
+}
+
+/*
+ * Hand a CHALLENGE packet to the security class of the connection it arrived
+ * on so that it can be transcribed into the recvmsg() message.
+ */
+static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg,
+				   struct sk_buff *challenge, unsigned int flags)
+{
+	struct rxrpc_connection *conn = rxrpc_skb(challenge)->chall.conn;
+
+	return conn->security->challenge_to_recvmsg(conn, challenge, msg);
+}
+
+/*
+ * Process OOB packets.  Called with the socket locked.
+ *
+ * The message at the head of the socket's OOB queue is reported to the caller:
+ * its ID is emitted as an RXRPC_OOB_ID control message and the type-specific
+ * handler then adds whatever else is needed.  Returns -EAGAIN if the queue is
+ * empty, a negative error if the OOB ID could not be emitted (the message is
+ * left queued in that case) or otherwise the handler's result.
+ *
+ * Unless the caller is only peeking, the message is then taken off the queue
+ * and either moved to the pending-response set (if it needs an answer) or
+ * freed.  With MSG_PEEK the message must be left exactly where it is: it is
+ * still linked into the queue, so it can neither be freed nor be linked into
+ * the pending tree.
+ */
+static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg,
+			     unsigned int flags)
+{
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	struct sk_buff *skb;
+	bool need_response = false;
+	int ret;
+
+	skb = skb_peek(&rx->recvmsg_oobq);
+	if (!skb)
+		return -EAGAIN;
+	rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg);
+
+	ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64),
+		       &skb->skb_mstamp_ns);
+	if (ret < 0)
+		return ret;
+
+	switch ((enum rxrpc_oob_type)skb->mark) {
+	case RXRPC_OOB_CHALLENGE:
+		need_response = true;
+		ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags);
+		break;
+	default:
+		WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n",
+			  skb->mark);
+		ret = -EIO;
+		break;
+	}
+
+	/* A peek leaves the message queued for a later, consuming recvmsg(). */
+	if (flags & MSG_PEEK)
+		return ret;
+
+	skb_unlink(skb, &rx->recvmsg_oobq);
+	if (need_response)
+		rxrpc_add_pending_oob(rx, skb);
+	else
+		rxrpc_free_skb(skb, rxrpc_skb_put_oob);
+	return ret;
+}
+
+/*
* Deliver messages to a call. This keeps processing packets until the buffer
* is filled and we find either more DATA (returns 0) or the end of the DATA
* (returns 1). If more packets are required, it returns -EAGAIN and if the
@@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
size_t len, int flags, size_t *_offset)
{
struct rxrpc_skb_priv *sp;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
struct sk_buff *skb;
rxrpc_seq_t seq = 0;
size_t remain;
@@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq,
sp->offset, sp->len, ret2);
if (ret2 < 0) {
- kdebug("verify = %d", ret2);
ret = ret2;
goto out;
}
@@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (!(flags & MSG_PEEK))
rxrpc_rotate_rx_window(call);
+
+ if (!rx->app_ops &&
+ !skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
+ trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq,
+ rx_pkt_offset, rx_pkt_len, ret);
+ break;
+ }
}
out:
@@ -262,6 +345,7 @@ out:
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
}
+
done:
trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq,
rx_pkt_offset, rx_pkt_len, ret);
@@ -301,6 +385,7 @@ try_again:
/* Return immediately if a client socket has no outstanding calls */
if (RB_EMPTY_ROOT(&rx->calls) &&
list_empty(&rx->recvmsg_q) &&
+ skb_queue_empty_lockless(&rx->recvmsg_oobq) &&
rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
release_sock(&rx->sk);
return -EAGAIN;
@@ -322,7 +407,8 @@ try_again:
if (ret)
goto wait_error;
- if (list_empty(&rx->recvmsg_q)) {
+ if (list_empty(&rx->recvmsg_q) &&
+ skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
if (signal_pending(current))
goto wait_interrupted;
trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0);
@@ -332,6 +418,15 @@ try_again:
goto try_again;
}
+ /* Deal with OOB messages before we consider getting normal data. */
+ if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) {
+ ret = rxrpc_recvmsg_oob(sock, msg, flags);
+ release_sock(&rx->sk);
+ if (ret == -EAGAIN)
+ goto try_again;
+ goto error_no_call;
+ }
+
/* Find the next call and dequeue it if we're not just peeking. If we
* do dequeue it, that comes with a ref that we will need to release.
* We also want to weed out calls that got requeued whilst we were
@@ -342,7 +437,8 @@ try_again:
call = list_entry(l, struct rxrpc_call, recvmsg_link);
if (!rxrpc_call_is_complete(call) &&
- skb_queue_empty(&call->recvmsg_queue)) {
+ skb_queue_empty(&call->recvmsg_queue) &&
+ skb_queue_empty(&rx->recvmsg_oobq)) {
list_del_init(&call->recvmsg_link);
spin_unlock_irq(&rx->recvmsg_lock);
release_sock(&rx->sk);
@@ -377,21 +473,9 @@ try_again:
if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
BUG();
- if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
- if (flags & MSG_CMSG_COMPAT) {
- unsigned int id32 = call->user_call_ID;
-
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- sizeof(unsigned int), &id32);
- } else {
- unsigned long idl = call->user_call_ID;
-
- ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
- sizeof(unsigned long), &idl);
- }
- if (ret < 0)
- goto error_unlock_call;
- }
+ ret = rxrpc_recvmsg_user_id(call, msg, flags);
+ if (ret < 0)
+ goto error_unlock_call;
if (msg->msg_name && call->peer) {
size_t len = sizeof(call->dest_srx);
@@ -477,14 +561,14 @@ wait_error:
* @_service: Where to store the actual service ID (may be upgraded)
*
* Allow a kernel service to receive data and pick up information about the
- * state of a call. Returns 0 if got what was asked for and there's more
- * available, 1 if we got what was asked for and we're at the end of the data
- * and -EAGAIN if we need more data.
+ * state of a call. Note that *@_abort should also be initialised to %0.
*
- * Note that we may return -EAGAIN to drain empty packets at the end of the
- * data, even if we've already copied over the requested data.
+ * Note that we may return %-EAGAIN to drain empty packets at the end
+ * of the data, even if we've already copied over the requested data.
*
- * *_abort should also be initialised to 0.
+ * Return: %0 if got what was asked for and there's more available, %1
+ * if we got what was asked for and we're at the end of the data and
+ * %-EAGAIN if we need more data.
*/
int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
struct iov_iter *iter, size_t *_len,
diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c
new file mode 100644
index 000000000000..1e19c605bcc8
--- /dev/null
+++ b/net/rxrpc/rxgk.c
@@ -0,0 +1,1371 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* GSSAPI-based RxRPC security
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/key-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+/*
+ * Parse a server key.  The description must have the exact form
+ * "<service>:<sec_class>:<kvno>:<enctype>" (four unsigned decimals) and the
+ * payload must be exactly as long as the key length of the named enctype.
+ *
+ * The enctype descriptor is stored in payload slot 0 and a copy of the key in
+ * the krb5_buffer overlaid on payload slot 2 onwards.
+ */
+static int rxgk_preparse_server_key(struct key_preparsed_payload *prep)
+{
+	const struct krb5_enctype *krb5;
+	struct krb5_buffer *server_key = (void *)&prep->payload.data[2];
+	unsigned int service, sec_class, kvno, enctype;
+	int nr_fields, consumed = 0;
+
+	_enter("%zu", prep->datalen);
+
+	/* All four fields must parse and nothing may trail them. */
+	nr_fields = sscanf(prep->orig_description, "%u:%u:%u:%u%n",
+			   &service, &sec_class, &kvno, &enctype, &consumed);
+	if (nr_fields != 4 || prep->orig_description[consumed])
+		return -EINVAL;
+
+	krb5 = crypto_krb5_find_enctype(enctype);
+	if (!krb5)
+		return -ENOPKG;
+
+	prep->payload.data[0] = (struct krb5_enctype *)krb5;
+
+	if (prep->datalen != krb5->key_len)
+		return -EKEYREJECTED;
+
+	server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+	server_key->len = prep->datalen;
+	if (!server_key->data)
+		return -ENOMEM;
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Release the key material held in a server key payload, using
+ * kfree_sensitive() to free it.
+ */
+static void rxgk_free_server_key(union key_payload *payload)
+{
+	struct krb5_buffer *key_buf = (void *)&payload->data[2];
+
+	kfree_sensitive(key_buf->data);
+}
+
+/*
+ * Discard the key material attached to a preparsed payload.
+ */
+static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep)
+{
+	union key_payload *payload = &prep->payload;
+
+	rxgk_free_server_key(payload);
+}
+
+/*
+ * Discard the key material attached to a server key being destroyed.
+ */
+static void rxgk_destroy_server_key(struct key *key)
+{
+	union key_payload *payload = &key->payload;
+
+	rxgk_free_server_key(payload);
+}
+
+/*
+ * Append the encoding type name to a server key's description, if an enctype
+ * descriptor is recorded in payload slot 0.
+ */
+static void rxgk_describe_server_key(const struct key *key, struct seq_file *m)
+{
+	const struct krb5_enctype *krb5 = key->payload.data[0];
+
+	if (!krb5)
+		return;
+
+	seq_printf(m, ": %s", krb5->name);
+}
+
+/*
+ * Handle rekeying the connection when we see our limits overrun or when the
+ * far side decided to rekey.
+ *
+ * If @specific_key_number is NULL, the current key is wanted and the window
+ * is cranked forward if that key is flagged as needing a rekey.  Otherwise
+ * the 16-bit number is matched against the current key, the one before it
+ * and the one after it; a match on the one after cranks the window.
+ *
+ * A newly generated context is installed in the slot belonging to the key
+ * number it was generated for.  Without a crank that may be the slot of the
+ * previous key, so the current key's slot must not be used unconditionally.
+ *
+ * Returns a ref on the context if successful, -ESTALE if the key is out of
+ * date or there is no current key to derive from, or the error from
+ * generating the transport key.
+ */
+static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn,
+				       const u16 *specific_key_number)
+{
+	struct rxgk_context *gk, *base, *dead = NULL;
+	unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1;
+	bool crank = false;
+
+	_enter("%d", specific_key_number ? *specific_key_number : -1);
+
+	mutex_lock(&conn->security_lock);
+
+	current_key = conn->rxgk.key_number;
+	if (!specific_key_number) {
+		key_number = current_key;
+	} else {
+		if (*specific_key_number == (u16)current_key)
+			key_number = current_key;
+		else if (*specific_key_number == (u16)(current_key - 1))
+			key_number = current_key - 1;
+		else if (*specific_key_number == (u16)(current_key + 1))
+			goto crank_window;
+		else
+			goto bad_key;
+	}
+
+	gk = conn->rxgk.keys[key_number & mask];
+	if (!gk)
+		goto generate_key;
+	if (!specific_key_number &&
+	    test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags))
+		goto crank_window;
+
+grab:
+	refcount_inc(&gk->usage);
+	mutex_unlock(&conn->security_lock);
+	rxgk_put(dead);
+	return gk;
+
+crank_window:
+	trace_rxrpc_rxgk_rekey(conn, current_key,
+			       specific_key_number ? *specific_key_number : -1);
+	if (current_key == UINT_MAX)
+		goto bad_key;
+	if (current_key + 1 == UINT_MAX)
+		set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
+
+	key_number = current_key + 1;
+	if (WARN_ON(conn->rxgk.keys[key_number & mask]))
+		goto bad_key;
+	crank = true;
+
+generate_key:
+	/* The new transport key is derived from the key held by the current
+	 * context, so one must exist.
+	 */
+	base = conn->rxgk.keys[current_key & mask];
+	if (!base)
+		goto bad_key;
+	gk = rxgk_generate_transport_key(conn, base->key, key_number, GFP_NOFS);
+	if (IS_ERR(gk)) {
+		mutex_unlock(&conn->security_lock);
+		return gk;
+	}
+
+	write_lock(&conn->security_use_lock);
+	if (crank) {
+		current_key++;
+		conn->rxgk.key_number = current_key;
+		dead = conn->rxgk.keys[(current_key - 2) & mask];
+		conn->rxgk.keys[(current_key - 2) & mask] = NULL;
+	}
+	/* After a crank key_number == current_key; otherwise this fills the
+	 * empty slot that was looked up above.
+	 */
+	conn->rxgk.keys[key_number & mask] = gk;
+	write_unlock(&conn->security_use_lock);
+	goto grab;
+
+bad_key:
+	mutex_unlock(&conn->security_lock);
+	return ERR_PTR(-ESTALE);
+}
+
+/*
+ * Get the specified keying context.
+ *
+ * Returns a ref on the context if successful or -ESTALE if the key is out of
+ * date.
+ *
+ * If @specific_key_number is NULL, the connection's current key is wanted;
+ * otherwise it points to the 16-bit key number to look up, which must match
+ * the current key number, the one before it or the one after it.
+ *
+ * The lookup is done under a read lock on conn->security_use_lock.  The lock
+ * is dropped and the request handed to rxgk_rekey() if the slot is empty, if
+ * the requested number is one ahead of the current key, or - when the
+ * current key is wanted - if that key has passed its expiry time, has a
+ * negative bytes_remaining or is already flagged RXGK_TK_NEEDS_REKEY.
+ */
+static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn,
+ const u16 *specific_key_number)
+{
+ struct rxgk_context *gk;
+ unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1;
+
+ _enter("{%u},%d",
+ conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1);
+
+ read_lock(&conn->security_use_lock);
+
+ current_key = conn->rxgk.key_number;
+ if (!specific_key_number) {
+ key_number = current_key;
+ } else {
+ /* Only the bottom 16 bits of the key number are exposed in the
+ * header, so we try and keep the upper 16 bits in step. The
+ * whole 32 bits are used to generate the TK.
+ */
+ if (*specific_key_number == (u16)current_key)
+ key_number = current_key;
+ else if (*specific_key_number == (u16)(current_key - 1))
+ key_number = current_key - 1;
+ else if (*specific_key_number == (u16)(current_key + 1))
+ goto rekey;
+ else
+ goto bad_key;
+ }
+
+ gk = conn->rxgk.keys[key_number & mask];
+ if (!gk)
+ goto slow_path;
+ if (!specific_key_number &&
+ key_number < UINT_MAX) {
+ if (time_after(jiffies, gk->expiry) ||
+ gk->bytes_remaining < 0) {
+ set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags);
+ goto slow_path;
+ }
+
+ if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags))
+ goto slow_path;
+ }
+
+ refcount_inc(&gk->usage);
+ read_unlock(&conn->security_use_lock);
+ return gk;
+
+rekey:
+ _debug("rekey");
+ if (current_key == UINT_MAX)
+ goto bad_key;
+ gk = conn->rxgk.keys[current_key & mask];
+ if (gk)
+ set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags);
+slow_path:
+ read_unlock(&conn->security_use_lock);
+ return rxgk_rekey(conn, specific_key_number);
+bad_key:
+ read_unlock(&conn->security_use_lock);
+ return ERR_PTR(-ESTALE);
+}
+
+/*
+ * Set up the security state of a connection from an RxGK token: record the
+ * security index and level, note the start time on the client side, generate
+ * the initial transport key and install it in the key window.
+ *
+ * Returns 0 on success, -EKEYREJECTED if the token's security level is not
+ * one of plain/auth/encrypt, or the error from generating the transport key.
+ */
+static int rxgk_init_connection_security(struct rxrpc_connection *conn,
+					 struct rxrpc_key_token *token)
+{
+	struct rxgk_context *gk;
+	int ret;
+
+	_enter("{%d,%u},{%x}",
+	       conn->debug_id, conn->rxgk.key_number, key_serial(conn->key));
+
+	conn->security_level = token->rxgk->level;
+	conn->security_ix = token->security_index;
+
+	if (rxrpc_conn_is_client(conn)) {
+		/* The start time is kept as the clock reading divided by 100. */
+		conn->rxgk.start_time = ktime_get();
+		do_div(conn->rxgk.start_time, 100);
+	}
+
+	gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number,
+					 GFP_NOFS);
+	if (IS_ERR(gk))
+		return PTR_ERR(gk);
+
+	conn->rxgk.keys[gk->key_number & 3] = gk;
+	conn->rxgk.enctype = gk->krb5->etype;
+
+	switch (conn->security_level) {
+	case RXRPC_SECURITY_PLAIN:
+	case RXRPC_SECURITY_AUTH:
+	case RXRPC_SECURITY_ENCRYPT:
+		ret = 0;
+		break;
+	default:
+		ret = -EKEYREJECTED;
+		break;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Clean up the crypto on a call.
+ *
+ * The body is intentionally empty: this function releases nothing and leaves
+ * @call untouched.
+ */
+static void rxgk_free_call_crypto(struct rxrpc_call *call)
+{
+}
+
+/*
+ * Work out how much data we can put in a packet.
+ *
+ * Allocates a transmission buffer for up to @remain bytes of call data.  For
+ * security levels other than AUTH and ENCRYPT, the buffer simply holds the
+ * smaller of @remain and RXRPC_JUMBO_DATALEN.  For AUTH and ENCRYPT, the
+ * current key's enctype is asked how much data fits in RXRPC_JUMBO_DATALEN
+ * (ENCRYPT additionally reserves room for a struct rxgk_header); if @remain
+ * is smaller than that, the buffer is sized for just @remain, otherwise a
+ * full-size buffer is allocated and the unused tail ("gap") is zeroed.
+ *
+ * On return, txb->crypto_header and txb->sec_header record the crypto offset
+ * and security header size, txb->offset has been advanced past both and
+ * txb->space holds the amount of call data that may be added.
+ *
+ * Returns NULL if the key cannot be obtained or the allocation fails.
+ */
+static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
+{
+ enum krb5_crypto_mode mode;
+ struct rxgk_context *gk;
+ struct rxrpc_txbuf *txb;
+ size_t shdr, alloc, limit, part, offset, gap;
+
+ switch (call->conn->security_level) {
+ default:
+ alloc = umin(remain, RXRPC_JUMBO_DATALEN);
+ return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp);
+ case RXRPC_SECURITY_AUTH:
+ shdr = 0;
+ mode = KRB5_CHECKSUM_MODE;
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ shdr = sizeof(struct rxgk_header);
+ mode = KRB5_ENCRYPT_MODE;
+ break;
+ }
+
+ gk = rxgk_get_key(call->conn, NULL);
+ if (IS_ERR(gk))
+ return NULL;
+
+ /* Work out the maximum amount of data that will fit. */
+ alloc = RXRPC_JUMBO_DATALEN;
+ limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset);
+
+ if (remain < limit - shdr) {
+ part = remain;
+ alloc = crypto_krb5_how_much_buffer(gk->krb5, mode,
+ shdr + part, &offset);
+ gap = 0;
+ } else {
+ part = limit - shdr;
+ gap = RXRPC_JUMBO_DATALEN - alloc;
+ alloc = RXRPC_JUMBO_DATALEN;
+ }
+
+ rxgk_put(gk);
+
+ txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp);
+ if (!txb)
+ return NULL;
+
+ txb->crypto_header = offset;
+ txb->sec_header = shdr;
+ txb->offset += offset + shdr;
+ txb->space = part;
+
+ /* Clear excess space in the packet */
+ if (gap)
+ memset(txb->data + alloc - gap, 0, gap);
+ return txb;
+}
+
+/*
+ * Level 1 security: compute a MIC over the packet with the transmit checksum
+ * key, binding it to the call by way of a metadata header.
+ *
+ * Consumes the caller's ref on @gk.  Returns the result of the MIC operation;
+ * if that is non-negative it is recorded as the packet length and charged
+ * against the key's byte budget.  Returns -ENOMEM if the metadata header
+ * cannot be allocated.
+ */
+static int rxgk_secure_packet_integrity(const struct rxrpc_call *call,
+					struct rxgk_context *gk,
+					struct rxrpc_txbuf *txb)
+{
+	struct krb5_buffer metadata;
+	struct rxgk_header *hdr;
+	struct scatterlist sg[1];
+	int ret;
+
+	_enter("");
+
+	hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
+	if (!hdr) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	hdr->epoch	 = htonl(call->conn->proto.epoch);
+	hdr->cid	 = htonl(call->cid);
+	hdr->call_number = htonl(call->call_id);
+	hdr->seq	 = htonl(txb->seq);
+	hdr->sec_index	 = htonl(call->security_ix);
+	hdr->data_len	 = htonl(txb->len);
+
+	metadata.data = hdr;
+	metadata.len = sizeof(*hdr);
+
+	sg_init_table(sg, 1);
+	sg_set_buf(&sg[0], txb->data, txb->alloc_size);
+
+	ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata,
+				  sg, 1, txb->alloc_size,
+				  txb->crypto_header, txb->sec_header + txb->len);
+	kfree(hdr);
+	if (ret < 0)
+		goto out_put;
+
+	txb->pkt_len = ret;
+	if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+		txb->jumboable = true;
+	gk->bytes_remaining -= ret;
+
+out_put:
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Level 2 security: write the RxGK header into the buffer in front of the
+ * data and encrypt header and data together with the transmit encryption
+ * key.
+ *
+ * Consumes the caller's ref on @gk.  Returns the result of the encryption;
+ * if that is non-negative it is recorded as the packet length and charged
+ * against the key's byte budget.
+ */
+static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call,
+					struct rxgk_context *gk,
+					struct rxrpc_txbuf *txb)
+{
+	struct rxgk_header *hdr = txb->data + txb->crypto_header;
+	struct scatterlist sg[1];
+	int ret;
+
+	_enter("%x", txb->len);
+
+	/* Fill in the header where it sits inside the buffer. */
+	hdr->epoch	 = htonl(call->conn->proto.epoch);
+	hdr->cid	 = htonl(call->cid);
+	hdr->call_number = htonl(call->call_id);
+	hdr->seq	 = htonl(txb->seq);
+	hdr->sec_index	 = htonl(call->security_ix);
+	hdr->data_len	 = htonl(txb->len);
+
+	sg_init_table(sg, 1);
+	sg_set_buf(&sg[0], txb->data, txb->alloc_size);
+
+	ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc,
+				  sg, 1, txb->alloc_size,
+				  txb->crypto_header, txb->sec_header + txb->len,
+				  false);
+	if (ret < 0)
+		goto out_put;
+
+	txb->pkt_len = ret;
+	if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+		txb->jumboable = true;
+	gk->bytes_remaining -= ret;
+
+out_put:
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Apply the connection's security level to an outgoing packet.
+ *
+ * The current key is looked up (-ESTALE from the lookup is reported as
+ * -EKEYREJECTED) and the connection's key validated.  The low bits of the key
+ * number are placed in the packet's checksum field, then the packet is left
+ * as is, signed or encrypted according to the security level.  Returns
+ * -EPERM for an unrecognised level.
+ */
+static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+{
+	struct rxgk_context *gk;
+	int ret;
+
+	_enter("{%d{%x}},{#%u},%u,",
+	       call->debug_id, key_serial(call->conn->key), txb->seq, txb->len);
+
+	gk = rxgk_get_key(call->conn, NULL);
+	if (IS_ERR(gk)) {
+		ret = PTR_ERR(gk);
+		return ret == -ESTALE ? -EKEYREJECTED : ret;
+	}
+
+	ret = key_validate(call->conn->key);
+	if (ret < 0)
+		goto out_put;
+
+	call->security_enctype = gk->krb5->etype;
+	txb->cksum = htons(gk->key_number);
+
+	switch (call->conn->security_level) {
+	case RXRPC_SECURITY_AUTH:
+		/* The helper consumes our ref on the key. */
+		return rxgk_secure_packet_integrity(call, gk, txb);
+	case RXRPC_SECURITY_ENCRYPT:
+		/* The helper consumes our ref on the key. */
+		return rxgk_secure_packet_encrypted(call, gk, txb);
+	case RXRPC_SECURITY_PLAIN:
+		txb->pkt_len = txb->len;
+		ret = 0;
+		break;
+	default:
+		ret = -EPERM;
+		break;
+	}
+
+out_put:
+	rxgk_put(gk);
+	return ret;
+}
+
+/*
+ * Integrity mode (check the signature on a packet - level 1 security)
+ *
+ * The MIC is verified with the receive checksum key over the packet plus a
+ * metadata header that binds it to the call.  Consumes the caller's ref on
+ * @gk.
+ *
+ * On success the packet's data offset and length are updated to describe the
+ * verified payload.  On -EPROTO the call is aborted with the abort code
+ * supplied by the verifier.  On any failure the packet's offset and length
+ * are left untouched so that the packet is not left half-adjusted should it
+ * be looked at again.  Returns -ENOMEM if the metadata header cannot be
+ * allocated.
+ */
+static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
+					struct rxgk_context *gk,
+					struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxgk_header *hdr;
+	struct krb5_buffer metadata;
+	unsigned int offset = sp->offset, len = sp->len;
+	size_t data_offset = 0, data_len = len;
+	u32 ac;
+	int ret = -ENOMEM;
+
+	_enter("");
+
+	/* The metadata records the length of the data without the MIC. */
+	crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE,
+				      &data_offset, &data_len);
+
+	hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
+	if (!hdr)
+		goto put_gk;
+
+	hdr->epoch	 = htonl(call->conn->proto.epoch);
+	hdr->cid	 = htonl(call->cid);
+	hdr->call_number = htonl(call->call_id);
+	hdr->seq	 = htonl(sp->hdr.seq);
+	hdr->sec_index	 = htonl(call->security_ix);
+	hdr->data_len	 = htonl(data_len);
+
+	metadata.len = sizeof(*hdr);
+	metadata.data = hdr;
+	ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata,
+				  skb, &offset, &len, &ac);
+	kfree(hdr);
+	if (ret == -EPROTO) {
+		rxrpc_abort_eproto(call, skb, ac,
+				   rxgk_abort_1_verify_mic_eproto);
+	} else if (ret >= 0) {
+		/* Only commit the new window once verification succeeded. */
+		sp->offset = offset;
+		sp->len = len;
+	}
+
+put_gk:
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Level 2 security: decrypt a packet in place with the receive encryption
+ * key, then check that the RxGK header found at the front of the plaintext
+ * matches the call the packet arrived on.
+ *
+ * Consumes the caller's ref on @gk.  On success, the packet's offset is moved
+ * past the header and its length set to the data length from the header.
+ * Protocol failures abort the call.
+ */
+static int rxgk_verify_packet_encrypted(struct rxrpc_call *call,
+					struct rxgk_context *gk,
+					struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxgk_header hdr;
+	unsigned int offset = sp->offset, len = sp->len;
+	u32 ac;
+	int ret;
+
+	_enter("");
+
+	ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac);
+	if (ret < 0) {
+		if (ret == -EPROTO)
+			rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
+		goto out_put;
+	}
+
+	/* The plaintext must at least hold the header. */
+	if (len < sizeof(hdr)) {
+		ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+					 rxgk_abort_2_short_header);
+		goto out_put;
+	}
+
+	/* Pull the header out of the skb. */
+	if (skb_copy_bits(skb, offset, &hdr, sizeof(hdr)) < 0) {
+		ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
+					 rxgk_abort_2_short_encdata);
+		goto out_put;
+	}
+	len -= sizeof(hdr);
+	offset += sizeof(hdr);
+
+	/* Everything in the header must agree with the call, and the claimed
+	 * data length must fit in what remains.
+	 */
+	if (ntohl(hdr.epoch) != call->conn->proto.epoch ||
+	    ntohl(hdr.cid) != call->cid ||
+	    ntohl(hdr.call_number) != call->call_id ||
+	    ntohl(hdr.seq) != sp->hdr.seq ||
+	    ntohl(hdr.sec_index) != call->security_ix ||
+	    ntohl(hdr.data_len) > len) {
+		ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON,
+					 rxgk_abort_2_short_data);
+		goto out_put;
+	}
+
+	sp->len = ntohl(hdr.data_len);
+	sp->offset = offset;
+	ret = 0;
+
+out_put:
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Verify the security on a received packet or subpacket (if part of a jumbo
+ * packet).
+ *
+ * The key is selected by the key number carried in the packet header's
+ * checksum field; an out-of-date number aborts the call with RXGK_BADKEYNO.
+ * The packet is then accepted as is, MIC-checked or decrypted according to
+ * the connection's security level.  Returns -ENOANO for an unrecognised
+ * level.
+ */
+static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxgk_context *gk;
+	u16 key_number = sp->hdr.cksum;
+	int ret;
+
+	_enter("{%d{%x}},{#%u}",
+	       call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
+
+	gk = rxgk_get_key(call->conn, &key_number);
+	if (IS_ERR(gk)) {
+		if (PTR_ERR(gk) == -ESTALE)
+			return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO,
+						  rxgk_abort_bad_key_number);
+		return PTR_ERR(gk);
+	}
+
+	call->security_enctype = gk->krb5->etype;
+
+	switch (call->conn->security_level) {
+	case RXRPC_SECURITY_AUTH:
+		/* The helper consumes our ref on the key. */
+		return rxgk_verify_packet_integrity(call, gk, skb);
+	case RXRPC_SECURITY_ENCRYPT:
+		/* The helper consumes our ref on the key. */
+		return rxgk_verify_packet_encrypted(call, gk, skb);
+	case RXRPC_SECURITY_PLAIN:
+		ret = 0;
+		break;
+	default:
+		ret = -ENOANO;
+		break;
+	}
+
+	rxgk_put(gk);
+	return ret;
+}
+
+/*
+ * Allocate pages to hold a challenge or a response packet of @total_len
+ * bytes.  We're not running in the io_thread, so we can't use ->tx_alloc.
+ * A multi-page allocation is requested as a compound page.
+ */
+static struct page *rxgk_alloc_packet(size_t total_len)
+{
+	int order = get_order(total_len);
+	gfp_t gfp = GFP_NOFS;
+
+	if (order > 0)
+		gfp = GFP_NOFS | __GFP_COMP;
+
+	return alloc_pages(gfp, order);
+}
+
+/*
+ * Issue a challenge.
+ *
+ * A fresh random nonce is stored in the connection and sent to the peer in a
+ * CHALLENGE packet consisting of a wire header followed by the nonce.
+ *
+ * Returns 0 on success, -ENOMEM if the packet page cannot be allocated and
+ * -EAGAIN if transmission fails.
+ */
+static int rxgk_issue_challenge(struct rxrpc_connection *conn)
+{
+	struct rxrpc_wire_header *whdr;
+	struct bio_vec bvec[1];
+	struct msghdr msg;
+	struct page *page;
+	size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce);
+	u32 serial;
+	int ret;
+
+	_enter("{%d}", conn->debug_id);
+
+	get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce));
+
+	/* We can't use conn->tx_alloc without a lock */
+	page = rxgk_alloc_packet(len);
+	if (!page)
+		return -ENOMEM;
+
+	bvec_set_page(&bvec[0], page, len, 0);
+	iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len);
+
+	msg.msg_name	= &conn->peer->srx.transport;
+	msg.msg_namelen	= conn->peer->srx.transport_len;
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= MSG_SPLICE_PAGES;
+
+	whdr = page_address(page);
+	whdr->epoch	= htonl(conn->proto.epoch);
+	whdr->cid	= htonl(conn->proto.cid);
+	whdr->callNumber = 0;
+	whdr->seq	= 0;
+	whdr->type	= RXRPC_PACKET_TYPE_CHALLENGE;
+	whdr->flags	= conn->out_clientflag;
+	whdr->userStatus = 0;
+	whdr->securityIndex = conn->security_ix;
+	whdr->_rsvd	= 0;
+	whdr->serviceId	= htons(conn->service_id);
+
+	memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce));
+
+	serial = rxrpc_get_next_serials(conn, 1);
+	whdr->serial = htonl(serial);
+
+	trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce);
+
+	ret = do_udp_sendmsg(conn->local->socket, &msg, len);
+	if (ret > 0)
+		conn->peer->last_tx_at = ktime_get_seconds();
+
+	if (ret < 0) {
+		trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
+				    rxrpc_tx_point_rxgk_challenge);
+		__free_page(page);
+		return -EAGAIN;
+	}
+
+	/* whdr points into the page, so the page may only be released once
+	 * the header has been traced.
+	 */
+	trace_rxrpc_tx_packet(conn->debug_id, whdr,
+			      rxrpc_tx_point_rxgk_challenge);
+	__free_page(page);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Validate a challenge packet.
+ *
+ * The connection must have a key, that key must still validate and the
+ * packet must carry a full 20-byte nonce after the wire header.  Any failure
+ * aborts the connection and false is returned; otherwise the challenge is
+ * traced and true is returned.
+ */
+static bool rxgk_validate_challenge(struct rxrpc_connection *conn,
+				    struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	u8 nonce[20];
+	int ret;
+
+	if (!conn->key) {
+		rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+				 rxgk_abort_chall_no_key);
+		return false;
+	}
+
+	ret = key_validate(conn->key);
+	if (ret < 0) {
+		rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO,
+				 rxgk_abort_chall_key_expired);
+		return false;
+	}
+
+	ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
+			    nonce, sizeof(nonce));
+	if (ret < 0) {
+		rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+				 rxgk_abort_chall_short);
+		return false;
+	}
+
+	trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0);
+	return true;
+}
+
+/**
+ * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters
+ * @challenge: The challenge packet to query
+ *
+ * Look up the connection that @challenge was received on and report the
+ * encoding type recorded for it.
+ *
+ * Return: The Kerberos 5 encoding type for the challenged connection.
+ */
+u32 rxgk_kernel_query_challenge(struct sk_buff *challenge)
+{
+	struct rxrpc_connection *conn = rxrpc_skb(challenge)->chall.conn;
+
+	return conn->rxgk.enctype;
+}
+EXPORT_SYMBOL(rxgk_kernel_query_challenge);
+
+/*
+ * Fill out the control message to pass to userspace to inform about the
+ * challenge.
+ *
+ * The message carries the connection's service ID, security index and
+ * encoding type.  Returns the result of put_cmsg().
+ */
+static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn,
+				     struct sk_buff *challenge,
+				     struct msghdr *msg)
+{
+	struct rxgk_challenge chall;
+
+	/* The whole struct is copied out to userspace, so clear it first to
+	 * avoid leaking stack contents through any padding or members that
+	 * are not explicitly set below.
+	 */
+	memset(&chall, 0, sizeof(chall));
+
+	chall.base.service_id		= conn->service_id;
+	chall.base.security_index	= conn->security_ix;
+	chall.enctype			= conn->rxgk.enctype;
+
+	return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall);
+}
+
+/*
+ * Write the zero bytes needed to pad an item of @len bytes out to the XDR
+ * rounding boundary into @response at @offset.
+ *
+ * Returns the number of padding bytes written (0 if none were needed) or a
+ * negative error from storing into the skb.
+ */
+static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset)
+{
+	size_t nr_pad = xdr_round_up(len) - len;
+	__be32 zeroes = 0;
+	int ret;
+
+	if (nr_pad == 0)
+		return 0;
+
+	ret = skb_store_bits(response, offset, &zeroes, nr_pad);
+	if (ret < 0)
+		return ret;
+	return nr_pad;
+}
+
+/*
+ * Store the front of a RESPONSE packet into @response at @offset: the wire
+ * header, the connection start time as two 32-bit halves and the length of
+ * the ticket that follows.  The key number and encoding type used are also
+ * noted in the response skb's private data.
+ *
+ * Returns the number of bytes stored or a negative error.
+ */
+static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn,
+						    struct rxgk_context *gk,
+						    struct sk_buff *response,
+						    size_t offset)
+{
+	struct rxrpc_skb_priv *rsp = rxrpc_skb(response);
+	struct {
+		struct rxrpc_wire_header whdr;
+		__be32 start_time_msw;
+		__be32 start_time_lsw;
+		__be32 ticket_len;
+	} h;
+	int ret;
+
+	rsp->resp.version = gk->krb5->etype;
+	rsp->resp.kvno = gk->key_number;
+
+	/* Wire header; the checksum field carries the key number. */
+	h.whdr.epoch		= htonl(conn->proto.epoch);
+	h.whdr.cid		= htonl(conn->proto.cid);
+	h.whdr.callNumber	= 0;
+	h.whdr.seq		= 0;
+	h.whdr.serial		= 0;
+	h.whdr.type		= RXRPC_PACKET_TYPE_RESPONSE;
+	h.whdr.flags		= conn->out_clientflag;
+	h.whdr.userStatus	= 0;
+	h.whdr.securityIndex	= conn->security_ix;
+	h.whdr.cksum		= htons(gk->key_number);
+	h.whdr.serviceId	= htons(conn->service_id);
+
+	/* Start of the RXGK_Response body. */
+	h.start_time_msw	= htonl(upper_32_bits(conn->rxgk.start_time));
+	h.start_time_lsw	= htonl(lower_32_bits(conn->rxgk.start_time));
+	h.ticket_len		= htonl(gk->key->ticket.len);
+
+	ret = skb_store_bits(response, offset, &h, sizeof(h));
+	if (ret < 0)
+		return ret;
+	return sizeof(h);
+}
+
+/*
+ * Construct the authenticator to go in the response packet
+ *
+ *	struct RXGK_Authenticator {
+ *		opaque nonce[20];
+ *		opaque appdata<>;
+ *		RXGK_Level level;
+ *		unsigned int epoch;
+ *		unsigned int cid;
+ *		unsigned int call_numbers<>;
+ *	};
+ *
+ * The nonce is copied from the challenge, the application data (if any) is
+ * stored and padded to the XDR rounding boundary and the call counters of the
+ * connection's four channels are listed.  The result is written into
+ * @response starting at @offset.
+ *
+ * Returns the size of the authenticator, -EPROTO if the nonce cannot be read
+ * from the challenge or a negative error from storing into the response.
+ */
+static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn,
+					    struct sk_buff *challenge,
+					    const struct krb5_buffer *appdata,
+					    struct sk_buff *response,
+					    size_t offset)
+{
+	struct {
+		u8	nonce[20];
+		__be32	appdata_len;
+	} a;
+	struct {
+		__be32	level;
+		__be32	epoch;
+		__be32	cid;
+		__be32	call_numbers_count;
+		__be32	call_numbers[4];
+	} b;
+	int ret;
+
+	/* Part one: the challenger's nonce and the appdata length. */
+	if (skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header),
+			  a.nonce, sizeof(a.nonce)) < 0)
+		return -EPROTO;
+	a.appdata_len = htonl(appdata->len);
+
+	ret = skb_store_bits(response, offset, &a, sizeof(a));
+	if (ret < 0)
+		return ret;
+	offset += sizeof(a);
+
+	/* Part two: the application data, padded out. */
+	if (appdata->len) {
+		ret = skb_store_bits(response, offset, appdata->data, appdata->len);
+		if (ret < 0)
+			return ret;
+		offset += appdata->len;
+
+		ret = rxgk_pad_out(response, appdata->len, offset);
+		if (ret < 0)
+			return ret;
+		offset += ret;
+	}
+
+	/* Part three: the connection parameters and call counters. */
+	b.level			= htonl(conn->security_level);
+	b.epoch			= htonl(conn->proto.epoch);
+	b.cid			= htonl(conn->proto.cid);
+	b.call_numbers_count	= htonl(4);
+	b.call_numbers[0]	= htonl(conn->channels[0].call_counter);
+	b.call_numbers[1]	= htonl(conn->channels[1].call_counter);
+	b.call_numbers[2]	= htonl(conn->channels[2].call_counter);
+	b.call_numbers[3]	= htonl(conn->channels[3].call_counter);
+
+	ret = skb_store_bits(response, offset, &b, sizeof(b));
+	if (ret < 0)
+		return ret;
+
+	return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b);
+}
+
+/*
+ * Encrypt the authenticator in place inside the response skb with the
+ * response encryption key.  @offset and @alloc_len describe the region of the
+ * skb set aside for the encrypted blob; @auth_offset and @auth_len are passed
+ * to the encryption call to locate the plaintext.
+ *
+ * Returns the result of the encryption or a negative error if the skb region
+ * cannot be mapped into the scatterlist.
+ */
+static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn,
+					  struct rxgk_context *gk,
+					  struct sk_buff *response,
+					  size_t offset,
+					  size_t alloc_len,
+					  size_t auth_offset,
+					  size_t auth_len)
+{
+	struct scatterlist sg[16];
+	int nr_sg;
+
+	sg_init_table(sg, ARRAY_SIZE(sg));
+
+	nr_sg = skb_to_sgvec(response, sg, offset, alloc_len);
+	if (unlikely(nr_sg < 0))
+		return nr_sg;
+
+	return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len,
+				   auth_offset, auth_len, false);
+}
+
+/*
+ * Construct the response.
+ *
+ *	struct RXGK_Response {
+ *		rxgkTime start_time;
+ *		RXGK_Data token;
+ *		opaque authenticator<RXGK_MAXAUTHENTICATOR>
+ *	};
+ *
+ * The packet is laid out as: wire header, start time, ticket (length-prefixed
+ * and XDR-padded), then the encrypted authenticator (length-prefixed and
+ * XDR-padded).  The buffer is sized using the padded lengths so that the
+ * padding written later is guaranteed to fit.
+ *
+ * On success the response is handed to rxrpc_post_response() and 0 is
+ * returned; otherwise the response is discarded and a negative error
+ * returned.
+ */
+static int rxgk_construct_response(struct rxrpc_connection *conn,
+				   struct sk_buff *challenge,
+				   struct krb5_buffer *appdata)
+{
+	struct rxrpc_skb_priv *csp, *rsp;
+	struct rxgk_context *gk;
+	struct sk_buff *response;
+	size_t len, auth_len, authx_len, offset, auth_offset, authx_offset;
+	__be32 tmp;
+	int ret;
+
+	gk = rxgk_get_key(conn, NULL);
+	if (IS_ERR(gk))
+		return PTR_ERR(gk);
+
+	/* nonce + (appdata length + padded appdata) + level/epoch/cid +
+	 * (call count + 4 call numbers).  The appdata is stored padded, so it
+	 * must be counted padded.
+	 */
+	auth_len = 20 + (4 + xdr_round_up(appdata->len)) + 12 + (1 + 4) * 4;
+	authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE,
+						auth_len, &auth_offset);
+	len = sizeof(struct rxrpc_wire_header) +
+		8 + (4 + xdr_round_up(gk->key->ticket.len)) +
+		(4 + xdr_round_up(authx_len));
+
+	response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS);
+	if (!response)
+		goto error;
+	rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk);
+	response->len = len;
+	response->data_len = len;
+
+	ret = rxgk_insert_response_header(conn, gk, response, 0);
+	if (ret < 0)
+		goto error;
+	offset = ret;
+
+	ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len);
+	if (ret < 0)
+		goto error;
+	offset += gk->key->ticket.len;
+	ret = rxgk_pad_out(response, gk->key->ticket.len, offset);
+	if (ret < 0)
+		goto error;
+
+	authx_offset = offset + ret + 4; /* Leave a gap for the length. */
+
+	ret = rxgk_construct_authenticator(conn, challenge, appdata, response,
+					   authx_offset + auth_offset);
+	if (ret < 0)
+		goto error;
+	auth_len = ret;
+
+	ret = rxgk_encrypt_authenticator(conn, gk, response,
+					 authx_offset, authx_len,
+					 auth_offset, auth_len);
+	if (ret < 0)
+		goto error;
+	authx_len = ret;
+
+	/* Now that the encrypted size is known, fill in the length gap. */
+	tmp = htonl(authx_len);
+	ret = skb_store_bits(response, authx_offset - 4, &tmp, 4);
+	if (ret < 0)
+		goto error;
+
+	ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len);
+	if (ret < 0)
+		goto error;
+	len = authx_offset + authx_len + ret;
+
+	/* Trim the skb to what was actually produced. */
+	if (len != response->len) {
+		response->len = len;
+		response->data_len = len;
+	}
+
+	csp = rxrpc_skb(challenge);
+	rsp = rxrpc_skb(response);
+	rsp->resp.len = len;
+	rsp->resp.challenge_serial = csp->hdr.serial;
+	rxrpc_post_response(conn, response);
+	response = NULL;
+	ret = 0;
+
+error:
+	rxrpc_free_skb(response, rxrpc_skb_put_response);
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Respond to a challenge packet, including @appdata in the authenticator.
+ * The connection is aborted with RXGK_EXPIRED if its key no longer validates.
+ */
+static int rxgk_respond_to_challenge(struct rxrpc_connection *conn,
+				     struct sk_buff *challenge,
+				     struct krb5_buffer *appdata)
+{
+	_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+	if (key_validate(conn->key) >= 0)
+		return rxgk_construct_response(conn, challenge, appdata);
+
+	return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO,
+				rxgk_abort_chall_key_expired);
+}
+
+/*
+ * Respond to a challenge packet with an empty application data blob.
+ */
+static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn,
+						struct sk_buff *challenge)
+{
+	struct krb5_buffer no_appdata = {};
+
+	return rxgk_respond_to_challenge(conn, challenge, &no_appdata);
+}
+
+/**
+ * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata
+ * @challenge: The challenge to respond to
+ * @appdata: The application data to include in the RESPONSE authenticator
+ *
+ * Allow a kernel application to respond to a CHALLENGE with application data
+ * to be included in the RxGK RESPONSE Authenticator.  The connection to
+ * respond on is taken from the challenge packet.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
+ */
+int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
+				     struct krb5_buffer *appdata)
+{
+	struct rxrpc_connection *conn = rxrpc_skb(challenge)->chall.conn;
+
+	return rxgk_respond_to_challenge(conn, challenge, appdata);
+}
+EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge);
+
+/*
+ * Parse the sendmsg() control messages and respond to a challenge.  At most
+ * one RXRPC_RESP_RXGK_APPDATA message may be supplied; its payload is used
+ * as the application data.  Returns -EINVAL if a second one is found after
+ * one with data.
+ */
+static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+					     struct msghdr *msg)
+{
+	struct krb5_buffer appdata = {};
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (cmsg->cmsg_level == SOL_RXRPC &&
+		    cmsg->cmsg_type == RXRPC_RESP_RXGK_APPDATA) {
+			if (appdata.data)
+				return -EINVAL;
+			appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr);
+			appdata.data = CMSG_DATA(cmsg);
+		}
+	}
+
+	return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+}
+
+/*
+ * Verify the authenticator.
+ *
+ *	struct RXGK_Authenticator {
+ *		opaque nonce[20];
+ *		opaque appdata<>;
+ *		RXGK_Level level;
+ *		unsigned int epoch;
+ *		unsigned int cid;
+ *		unsigned int call_numbers<>;
+ *	};
+ *
+ * @p points to the start of the decrypted authenticator and @end to just
+ * past its last complete 32-bit word.  The nonce must match the one issued in
+ * the challenge and the level, epoch and connection ID must match the
+ * connection.  Each listed call number must not be behind the matching
+ * channel's call counter; where it is ahead, the counter is advanced to it,
+ * provided that the channel has no call in progress.
+ *
+ * Returns 0 if the authenticator is acceptable; otherwise the connection is
+ * aborted and the abort's return value passed back.
+ */
+static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn,
+					const struct krb5_enctype *krb5,
+					struct sk_buff *skb,
+					__be32 *p, __be32 *end)
+{
+	u32 app_len, call_count, level, epoch, cid, i;
+
+	_enter("");
+
+	/* There must be room for the nonce (5 words) and the appdata length
+	 * (1 word) before either is looked at.
+	 */
+	if (end - p < 20 / (ptrdiff_t)sizeof(__be32) + 1)
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_short_applen);
+
+	if (memcmp(p, conn->rxgk.nonce, 20) != 0)
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_bad_nonce);
+	p += 20 / sizeof(__be32);
+
+	app_len = ntohl(*p++);
+	if (app_len > (end - p) * sizeof(__be32))
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_short_applen);
+
+	/* Skip over the application data; it isn't examined here. */
+	p += xdr_round_up(app_len) / sizeof(__be32);
+	if (end - p < 4)
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_short_applen);
+
+	level	= ntohl(*p++);
+	epoch	= ntohl(*p++);
+	cid	= ntohl(*p++);
+	call_count = ntohl(*p++);
+
+	if (level	!= conn->security_level ||
+	    epoch	!= conn->proto.epoch ||
+	    cid		!= conn->proto.cid ||
+	    call_count	> 4)
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_bad_param);
+
+	if (end - p < call_count)
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+					rxgk_abort_resp_short_call_list);
+
+	for (i = 0; i < call_count; i++) {
+		u32 call_id = ntohl(*p++);
+
+		if (call_id > INT_MAX)
+			return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+						rxgk_abort_resp_bad_callid);
+
+		if (call_id < conn->channels[i].call_counter)
+			return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+						rxgk_abort_resp_call_ctr);
+
+		if (call_id > conn->channels[i].call_counter) {
+			if (conn->channels[i].call)
+				return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+							rxgk_abort_resp_call_state);
+
+			conn->channels[i].call_counter = call_id;
+		}
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Extract the authenticator and verify it.
+ *
+ * The @auth_len bytes at @auth_offset in @skb are copied into a temporary
+ * linear buffer so that they can be parsed as an array of 32-bit XDR words.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer can't be allocated, or the
+ * result of aborting the connection if the data is short or fails
+ * verification.
+ */
+static int rxgk_verify_authenticator(struct rxrpc_connection *conn,
+				     const struct krb5_enctype *krb5,
+				     struct sk_buff *skb,
+				     unsigned int auth_offset, unsigned int auth_len)
+{
+	void *auth;
+	__be32 *p;
+	int ret;
+
+	auth = kmalloc(auth_len, GFP_NOFS);
+	if (!auth)
+		return -ENOMEM;
+
+	ret = skb_copy_bits(skb, auth_offset, auth, auth_len);
+	if (ret < 0) {
+		ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO,
+				       rxgk_abort_resp_short_auth);
+		goto error;
+	}
+
+	p = auth;
+	/* auth_len is in bytes but the end marker is a word pointer, so the
+	 * length has to be scaled or the parser would be allowed to run off
+	 * the end of the buffer.
+	 */
+	ret = rxgk_do_verify_authenticator(conn, krb5, skb, p,
+					   p + auth_len / sizeof(*p));
+error:
+	kfree(auth);
+	return ret;
+}
+
+/*
+ * Verify a response.
+ *
+ * struct RXGK_Response {
+ *	rxgkTime	start_time;
+ *	RXGK_Data	token;
+ *	opaque		authenticator<RXGK_MAXAUTHENTICATOR>
+ * };
+ *
+ * The token is extracted into a key, a transport key context is derived from
+ * that key in order to decrypt the authenticator, and the authenticator is
+ * then checked against the connection.  On success the key is attached to the
+ * connection and 0 is returned; otherwise a negative error code is returned,
+ * in most cases after the connection has been aborted.
+ */
+static int rxgk_verify_response(struct rxrpc_connection *conn,
+				struct sk_buff *skb)
+{
+	const struct krb5_enctype *krb5;
+	struct rxrpc_key_token *token;
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxgk_response rhdr;
+	struct rxgk_context *gk = NULL;
+	struct key *key = NULL;
+	unsigned int offset = sizeof(struct rxrpc_wire_header);
+	unsigned int len = skb->len - sizeof(struct rxrpc_wire_header);
+	unsigned int token_offset, token_len;
+	unsigned int auth_offset, auth_len;
+	__be32 xauth_len;
+	int ret, ec;
+
+	_enter("{%d}", conn->debug_id);
+
+	/* Parse the RXGK_Response object */
+	if (sizeof(rhdr) + sizeof(__be32) > len)
+		goto short_packet;
+
+	if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0)
+		goto short_packet;
+	offset	+= sizeof(rhdr);
+	len	-= sizeof(rhdr);
+
+	token_offset	= offset;
+	token_len	= ntohl(rhdr.token_len);
+	/* Check the unrounded length too: rounding a value close to UINT_MAX
+	 * up wraps to a small number and would slip past the second test.
+	 */
+	if (token_len > len ||
+	    xdr_round_up(token_len) + sizeof(__be32) > len)
+		goto short_packet;
+
+	trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len);
+
+	offset	+= xdr_round_up(token_len);
+	len	-= xdr_round_up(token_len);
+
+	if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0)
+		goto short_packet;
+	offset	+= sizeof(xauth_len);
+	len	-= sizeof(xauth_len);
+
+	auth_offset	= offset;
+	auth_len	= ntohl(xauth_len);
+	/* The authenticator must fit in what is left of the packet. */
+	if (auth_len > len)
+		goto short_packet;
+	if (auth_len & 3)
+		goto inconsistent;
+	if (auth_len < 20 + 9 * 4)
+		goto auth_too_short;
+
+	/* We need to extract and decrypt the token and instantiate a session
+	 * key for it.  This bit, however, is application-specific.  If
+	 * possible, we use a default parser, but we might end up bumping this
+	 * to the app to deal with - which might mean a round trip to
+	 * userspace.
+	 */
+	ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key);
+	if (ret < 0)
+		goto out;
+
+	/* We now have a key instantiated from the decrypted ticket.  We can
+	 * pass this to the application so that they can parse the ticket
+	 * content and we can use the session key it contains to derive the
+	 * keys we need.
+	 *
+	 * Note that we have to switch enctype at this point as the enctype of
+	 * the ticket doesn't necessarily match that of the transport.
+	 */
+	token = key->payload.data[0];
+	conn->security_level = token->rxgk->level;
+	conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time);
+
+	gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS);
+	if (IS_ERR(gk)) {
+		ret = PTR_ERR(gk);
+		gk = NULL;	/* Don't hand an error pointer to rxgk_put() */
+		goto cant_get_token;
+	}
+
+	krb5 = gk->krb5;
+
+	trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len);
+
+	/* Decrypt, parse and verify the authenticator. */
+	ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb,
+			       &auth_offset, &auth_len, &ec);
+	if (ret < 0) {
+		rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret,
+				 rxgk_abort_resp_auth_dec);
+		goto out;
+	}
+
+	ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len);
+	if (ret < 0)
+		goto out;
+
+	conn->key = key;
+	key = NULL;
+	ret = 0;
+out:
+	/* The transport key context was only needed to decrypt the
+	 * authenticator; drop our ref so that it doesn't leak.
+	 */
+	rxgk_put(gk);
+	key_put(key);
+	_leave(" = %d", ret);
+	return ret;
+
+inconsistent:
+	ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+			       rxgk_abort_resp_xdr_align);
+	goto out;
+auth_too_short:
+	ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+			       rxgk_abort_resp_short_auth);
+	goto out;
+short_packet:
+	ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+			       rxgk_abort_resp_short_packet);
+	goto out;
+
+cant_get_token:
+	switch (ret) {
+	case -ENOMEM:
+		goto temporary_error;
+	case -EINVAL:
+		ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+				       rxgk_abort_resp_internal_error);
+		goto out;
+	case -ENOPKG:
+		ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP,
+				       -EKEYREJECTED, rxgk_abort_resp_nopkg);
+		goto out;
+	}
+
+temporary_error:
+	/* Ignore the response packet if we got a temporary error such as
+	 * ENOMEM.  We just want to send the challenge again.  Note that we
+	 * also come out this way if the ticket decryption fails.
+	 */
+	goto out;
+}
+
+/*
+ * Clear the connection security by dropping the reference held on each of the
+ * connection's key context slots.
+ */
+static void rxgk_clear(struct rxrpc_connection *conn)
+{
+	int slot = 0;
+
+	while (slot < ARRAY_SIZE(conn->rxgk.keys)) {
+		rxgk_put(conn->rxgk.keys[slot]);
+		slot++;
+	}
+}
+
+/*
+ * Initialise the RxGK security service.
+ *
+ * Nothing is set up here at present, so this unconditionally reports success
+ * by returning 0.
+ */
+static int rxgk_init(void)
+{
+ return 0;
+}
+
+/*
+ * Clean up the RxGK security service.
+ *
+ * Counterpart to rxgk_init(); the body is empty as rxgk_init() sets nothing
+ * up.
+ */
+static void rxgk_exit(void)
+{
+}
+
+/*
+ * RxRPC YFS GSSAPI-based security
+ *
+ * Operations table for the "yfs-rxgk" security class, registered under the
+ * RXRPC_SECURITY_YFS_RXGK security index.  Two challenge-response hooks are
+ * provided: .sendmsg_respond_to_challenge picks optional appdata out of the
+ * sendmsg() control messages, whereas .respond_to_challenge is the variant
+ * used when no appdata is supplied.  Tickets are decoded with the default
+ * YFS-style decoder.
+ */
+const struct rxrpc_security rxgk_yfs = {
+ .name = "yfs-rxgk",
+ .security_index = RXRPC_SECURITY_YFS_RXGK,
+ .no_key_abort = RXGK_NOTAUTH,
+ .init = rxgk_init,
+ .exit = rxgk_exit,
+ .preparse_server_key = rxgk_preparse_server_key,
+ .free_preparse_server_key = rxgk_free_preparse_server_key,
+ .destroy_server_key = rxgk_destroy_server_key,
+ .describe_server_key = rxgk_describe_server_key,
+ .init_connection_security = rxgk_init_connection_security,
+ .alloc_txbuf = rxgk_alloc_txbuf,
+ .secure_packet = rxgk_secure_packet,
+ .verify_packet = rxgk_verify_packet,
+ .free_call_crypto = rxgk_free_call_crypto,
+ .issue_challenge = rxgk_issue_challenge,
+ .validate_challenge = rxgk_validate_challenge,
+ .challenge_to_recvmsg = rxgk_challenge_to_recvmsg,
+ .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge,
+ .respond_to_challenge = rxgk_respond_to_challenge_no_appdata,
+ .verify_response = rxgk_verify_response,
+ .clear = rxgk_clear,
+ .default_decode_ticket = rxgk_yfs_decode_ticket,
+};
diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c
new file mode 100644
index 000000000000..b94b77a1c317
--- /dev/null
+++ b/net/rxrpc/rxgk_app.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Application-specific bits for GSSAPI-based RxRPC security
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/key-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+/*
+ * Decode a default-style YFS ticket in a response and turn it into an
+ * rxrpc-type key.
+ *
+ * struct rxgk_key {
+ *	afs_uint32	enctype;
+ *	opaque		key<>;
+ * };
+ *
+ * struct RXGK_AuthName {
+ *	afs_int32	kind;
+ *	opaque		data<AUTHDATAMAX>;
+ *	opaque		display<AUTHPRINTABLEMAX>;
+ * };
+ *
+ * struct RXGK_Token {
+ *	rxgk_key		K0;
+ *	RXGK_Level		level;
+ *	rxgkTime		starttime;
+ *	afs_int32		lifetime;
+ *	afs_int32		bytelife;
+ *	rxgkTime		expirationtime;
+ *	struct RXGK_AuthName	identities<>;
+ * };
+ *
+ * The decrypted ticket at @ticket_offset/@ticket_len in @skb is repackaged
+ * as the XDR-form payload of an rxrpc key, which is then allocated and
+ * instantiated.  On success, 0 is returned and the new key is passed back in
+ * *@_key; otherwise a negative error code is returned and, for malformed
+ * tickets, the connection is aborted.
+ */
+int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
+			   unsigned int ticket_offset, unsigned int ticket_len,
+			   struct key **_key)
+{
+	struct rxrpc_key_token *token;
+	const struct cred *cred = current_cred(); // TODO - use socket creds
+	struct key *key;
+	size_t pre_ticket_len, payload_len;
+	unsigned int klen, enctype;
+	void *payload, *ticket;
+	__be32 *t, *p, *q, tmp[2];
+	int ret;
+
+	_enter("");
+
+	/* The ticket has ten fixed-size words besides the key itself
+	 * (enctype, key length, level, start time, lifetime, bytelife,
+	 * expiry and identity count).  Make sure they're all there before
+	 * doing unsigned arithmetic on the length below.
+	 */
+	if (ticket_len < 10 * sizeof(__be32))
+		return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+					rxgk_abort_resp_short_yfs_tkt);
+
+	/* Get the session key length */
+	ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp));
+	if (ret < 0)
+		return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+					rxgk_abort_resp_short_yfs_klen);
+	enctype = ntohl(tmp[0]);
+	klen = ntohl(tmp[1]);
+
+	if (klen > ticket_len - 10 * sizeof(__be32))
+		return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+					rxgk_abort_resp_short_yfs_key);
+
+	pre_ticket_len = ((5 + 14) * sizeof(__be32) +
+			  xdr_round_up(klen) +
+			  sizeof(__be32));
+	payload_len = pre_ticket_len + xdr_round_up(ticket_len);
+
+	payload = kzalloc(payload_len, GFP_NOFS);
+	if (!payload)
+		return -ENOMEM;
+
+	/* We need to fill out the XDR form for a key payload that we can pass
+	 * to add_key().  Start by copying in the ticket so that we can parse
+	 * it.
+	 */
+	ticket = payload + pre_ticket_len;
+	ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len);
+	if (ret < 0) {
+		ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+				       rxgk_abort_resp_short_yfs_tkt);
+		goto error;
+	}
+
+	/* Fill out the form header. */
+	p = payload;
+	p[0] = htonl(0); /* Flags */
+	p[1] = htonl(1); /* len(cellname) */
+	p[2] = htonl(0x20000000); /* Cellname " " */
+	p[3] = htonl(1); /* #tokens */
+	p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) +
+		     xdr_round_up(ticket_len)); /* Token len */
+
+	/* Now fill in the body.  Most of this we can just scrape directly from
+	 * the ticket.
+	 */
+	t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen);
+	q = payload + 5 * sizeof(__be32);
+	q[0]  = htonl(RXRPC_SECURITY_YFS_RXGK);
+	q[1]  = t[1];		/* begintime - msw */
+	q[2]  = t[2];		/* - lsw */
+	q[3]  = t[5];		/* endtime - msw */
+	q[4]  = t[6];		/* - lsw */
+	q[5]  = 0;		/* level - msw */
+	q[6]  = t[0];		/* - lsw */
+	q[7]  = 0;		/* lifetime - msw */
+	q[8]  = t[3];		/* - lsw */
+	q[9]  = 0;		/* bytelife - msw */
+	q[10] = t[4];		/* - lsw */
+	q[11] = 0;		/* enctype - msw */
+	q[12] = htonl(enctype);	/* - lsw */
+	q[13] = htonl(klen);	/* Key length */
+
+	q += 14;
+
+	memcpy(q, ticket + sizeof(__be32) * 2, klen);
+	q += xdr_round_up(klen) / 4;
+	q[0] = htonl(ticket_len);
+	q++;
+	/* Sanity check: the header we built must end where the ticket starts */
+	if (WARN_ON((unsigned long)q != (unsigned long)ticket)) {
+		ret = -EIO;
+		goto error;
+	}
+
+	/* Ticket read in with skb_copy_bits above */
+	q += xdr_round_up(ticket_len) / 4;
+	if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) {
+		ret = -EIO;
+		goto error;
+	}
+
+	/* Now turn that into a key. */
+	key = key_alloc(&key_type_rxrpc, "x",
+			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner
+			KEY_USR_VIEW,
+			KEY_ALLOC_NOT_IN_QUOTA, NULL);
+	if (IS_ERR(key)) {
+		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	_debug("key %d", key_serial(key));
+
+	ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL);
+	if (ret < 0)
+		goto error_key;
+
+	token = key->payload.data[0];
+	token->no_leak_key = true;
+	*_key = key;
+	key = NULL;
+	ret = 0;
+	/* Success also leaves via the error label to free the payload copy */
+	goto error;
+
+error_key:
+	key_put(key);
+error:
+	kfree_sensitive(payload);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Extract the token and set up a session key from the details.
+ *
+ * struct RXGK_TokenContainer {
+ *	afs_int32	kvno;
+ *	afs_int32	enctype;
+ *	opaque		encrypted_token<>;
+ * };
+ *
+ * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1]
+ *
+ * The container header selects the server key, which is used to decrypt the
+ * token in place in @skb; the decrypted ticket is then handed to the security
+ * class's default ticket decoder.  Returns 0 with the new key in *@_key on
+ * success or a negative error code on failure.
+ */
+int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
+		       unsigned int token_offset, unsigned int token_len,
+		       struct key **_key)
+{
+	const struct krb5_enctype *krb5;
+	const struct krb5_buffer *server_secret;
+	struct crypto_aead *token_enc = NULL;
+	struct key *server_key;
+	unsigned int ticket_offset, ticket_len;
+	u32 kvno, enctype;
+	/* rxgk_decrypt_skb() only sets the abort code for some errors, so
+	 * give it a defined value for the rest.
+	 */
+	int ret, ec = RXGK_INCONSISTENCY;
+
+	struct {
+		__be32 kvno;
+		__be32 enctype;
+		__be32 token_len;
+	} container;
+
+	/* The token must at least hold the container header, otherwise the
+	 * unsigned subtraction below would wrap.
+	 */
+	if (token_len < sizeof(container))
+		return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+					rxgk_abort_resp_tok_short);
+
+	/* Decode the RXGK_TokenContainer object.  This tells us which server
+	 * key we should be using.  We can then fetch the key, get the secret
+	 * and set up the crypto to extract the token.
+	 */
+	if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0)
+		return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+					rxgk_abort_resp_tok_short);
+
+	kvno		= ntohl(container.kvno);
+	enctype		= ntohl(container.enctype);
+	ticket_len	= ntohl(container.token_len);
+	ticket_offset	= token_offset + sizeof(container);
+
+	/* Test the unrounded length as well since rounding up a value near
+	 * UINT_MAX wraps round to something small.
+	 */
+	if (ticket_len > token_len - sizeof(container) ||
+	    xdr_round_up(ticket_len) > token_len - sizeof(container))
+		return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+					rxgk_abort_resp_tok_short);
+
+	_debug("KVNO %u", kvno);
+	_debug("ENC  %u", enctype);
+	_debug("TLEN %u", ticket_len);
+
+	server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype);
+	if (IS_ERR(server_key))
+		goto cant_get_server_key;
+
+	down_read(&server_key->sem);
+	server_secret = (const void *)&server_key->payload.data[2];
+	ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS);
+	up_read(&server_key->sem);
+	key_put(server_key);
+	if (ret < 0)
+		goto cant_get_token;
+
+	/* We can now decrypt and parse the token/ticket.  This allows us to
+	 * gain access to K0, from which we can derive the transport key and
+	 * thence decode the authenticator.
+	 */
+	ret = rxgk_decrypt_skb(krb5, token_enc, skb,
+			       &ticket_offset, &ticket_len, &ec);
+	crypto_free_aead(token_enc);
+	token_enc = NULL;
+	if (ret < 0)
+		return rxrpc_abort_conn(conn, skb, ec, ret,
+					rxgk_abort_resp_tok_dec);
+
+	ret = conn->security->default_decode_ticket(conn, skb, ticket_offset,
+						    ticket_len, _key);
+	if (ret < 0)
+		goto cant_get_token;
+
+	_leave(" = 0");
+	return ret;
+
+cant_get_server_key:
+	ret = PTR_ERR(server_key);
+	switch (ret) {
+	case -ENOMEM:
+		goto temporary_error;
+	case -ENOKEY:
+	case -EKEYREJECTED:
+	case -EKEYEXPIRED:
+	case -EKEYREVOKED:
+	case -EPERM:
+		return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED,
+					rxgk_abort_resp_tok_nokey);
+	default:
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+					rxgk_abort_resp_tok_keyerr);
+	}
+
+cant_get_token:
+	switch (ret) {
+	case -ENOMEM:
+		goto temporary_error;
+	case -EINVAL:
+		return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED,
+					rxgk_abort_resp_tok_internal_error);
+	case -ENOPKG:
+		return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP,
+					-EKEYREJECTED, rxgk_abort_resp_tok_nopkg);
+	}
+
+temporary_error:
+	/* Ignore the response packet if we got a temporary error such as
+	 * ENOMEM.  We just want to send the challenge again.  Note that we
+	 * also come out this way if the ticket decryption fails.
+	 */
+	return ret;
+}
diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h
new file mode 100644
index 000000000000..7370a5655985
--- /dev/null
+++ b/net/rxrpc/rxgk_common.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Common bits for GSSAPI-based RxRPC security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <crypto/krb5.h>
+#include <crypto/skcipher.h>
+#include <crypto/hash.h>
+
+/*
+ * Per-key number context. This is replaced when the connection is rekeyed.
+ *
+ * Contexts are reference counted and are released with rxgk_put().
+ */
+struct rxgk_context {
+ refcount_t usage; /* Reference count */
+ unsigned int key_number; /* Rekeying number (goes in the rx header) */
+ unsigned long flags; /* Bit flags, numbered by the RXGK_TK_* constants */
+#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */
+ unsigned long expiry; /* Expiration time of this key */
+ long long bytes_remaining; /* Remaining Tx lifetime of this key */
+ const struct krb5_enctype *krb5; /* RxGK encryption type */
+ const struct rxgk_key *key; /* Key from which this context was derived */
+
+ /* We need up to 7 keys derived from the transport key, but we don't
+ * actually need the transport key. Each key is derived by
+ * DK(TK,constant).
+ */
+ struct crypto_aead *tx_enc; /* Transmission key */
+ struct crypto_aead *rx_enc; /* Reception key */
+ struct crypto_shash *tx_Kc; /* Transmission checksum key */
+ struct crypto_shash *rx_Kc; /* Reception checksum key */
+ struct crypto_aead *resp_enc; /* Response packet enc key */
+};
+
+/* Round a byte count up to a multiple of the XDR unit, sizeof(__be32). */
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+/* Padded length of x plus four bytes (presumably the XDR length word). */
+#define xdr_object_len(x) (4 + xdr_round_up(x))
+
+/*
+ * rxgk_app.c
+ */
+int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int ticket_offset, unsigned int ticket_len,
+ struct key **_key);
+int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
+ unsigned int token_offset, unsigned int token_len,
+ struct key **_key);
+
+/*
+ * rxgk_kdf.c
+ */
+void rxgk_put(struct rxgk_context *gk);
+struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn,
+ const struct rxgk_key *key,
+ unsigned int key_number,
+ gfp_t gfp);
+int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key,
+ struct crypto_aead **token_key,
+ unsigned int enctype,
+ const struct krb5_enctype **_krb5,
+ gfp_t gfp);
+
+/*
+ * Decrypt and checksum-verify a region of an skbuff in place.
+ *
+ * The region is described by *_offset and *_len.  On success (0), both are
+ * adjusted to describe just the decrypted content.  On -EPROTO or -EBADMSG,
+ * *_error_code is set to RXGK_SEALEDINCON; for any other failure it is left
+ * untouched and the negative error code is simply returned.
+ */
+static inline
+int rxgk_decrypt_skb(const struct krb5_enctype *krb5,
+		     struct crypto_aead *aead,
+		     struct sk_buff *skb,
+		     unsigned int *_offset, unsigned int *_len,
+		     int *_error_code)
+{
+	struct scatterlist sgl[16];
+	size_t data_off = 0, data_len = *_len;
+	int nents, err;
+
+	sg_init_table(sgl, ARRAY_SIZE(sgl));
+	nents = skb_to_sgvec(skb, sgl, *_offset, data_len);
+	if (unlikely(nents < 0))
+		return nents;
+
+	err = crypto_krb5_decrypt(krb5, aead, sgl, nents,
+				  &data_off, &data_len);
+	if (err == 0) {
+		*_offset += data_off;
+		*_len = data_len;
+	} else if (err == -EPROTO || err == -EBADMSG) {
+		*_error_code = RXGK_SEALEDINCON;
+	}
+
+	return err;
+}
+
+/*
+ * Verify the MIC protecting a region of an skbuff.
+ *
+ * The region is described by *_offset and *_len.  On success (0), both are
+ * adjusted to describe just the content of the secure region.  On -EPROTO or
+ * -EBADMSG, *_error_code is set to RXGK_SEALEDINCON; for any other failure it
+ * is left untouched and the negative error code is simply returned.
+ */
+static inline
+int rxgk_verify_mic_skb(const struct krb5_enctype *krb5,
+			struct crypto_shash *shash,
+			const struct krb5_buffer *metadata,
+			struct sk_buff *skb,
+			unsigned int *_offset, unsigned int *_len,
+			u32 *_error_code)
+{
+	struct scatterlist sgl[16];
+	size_t data_off = 0, data_len = *_len;
+	int nents, err;
+
+	sg_init_table(sgl, ARRAY_SIZE(sgl));
+	nents = skb_to_sgvec(skb, sgl, *_offset, data_len);
+	if (unlikely(nents < 0))
+		return nents;
+
+	err = crypto_krb5_verify_mic(krb5, shash, metadata, sgl, nents,
+				     &data_off, &data_len);
+	if (err == 0) {
+		*_offset += data_off;
+		*_len = data_len;
+	} else if (err == -EPROTO || err == -EBADMSG) {
+		*_error_code = RXGK_SEALEDINCON;
+	}
+
+	return err;
+}
diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c
new file mode 100644
index 000000000000..b4db5aa30e5b
--- /dev/null
+++ b/net/rxrpc/rxgk_kdf.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RxGK transport key derivation.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/key-type.h>
+#include <linux/slab.h>
+#include <keys/rxrpc-type.h>
+#include "ar-internal.h"
+#include "rxgk_common.h"
+
+/* Round x up to the next multiple of 16. */
+#define round16(x) (((x) + 15) & ~15)
+
+/*
+ * Constants used to derive the keys and hmacs actually used for doing stuff.
+ */
+#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402
+#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403
+#define RXGK_SERVER_ENC_PACKET 1028U // 0x404
+#define RXGK_SERVER_MIC_PACKET 1029U // 0x405
+#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+
+/*
+ * Release whichever crypto transforms a key context has attached and then
+ * free the context itself.
+ */
+static void rxgk_free(struct rxgk_context *gk)
+{
+	struct crypto_shash *hashes[] = { gk->tx_Kc, gk->rx_Kc };
+	struct crypto_aead *aeads[] = { gk->tx_enc, gk->rx_enc, gk->resp_enc };
+	unsigned int ix;
+
+	for (ix = 0; ix < ARRAY_SIZE(hashes); ix++) {
+		if (hashes[ix])
+			crypto_free_shash(hashes[ix]);
+	}
+
+	for (ix = 0; ix < ARRAY_SIZE(aeads); ix++) {
+		if (aeads[ix])
+			crypto_free_aead(aeads[ix]);
+	}
+
+	kfree(gk);
+}
+
+/*
+ * Drop a reference on a key context, freeing it when the last reference goes
+ * away.  A NULL pointer is accepted and ignored.
+ */
+void rxgk_put(struct rxgk_context *gk)
+{
+	if (!gk)
+		return;
+
+	if (refcount_dec_and_test(&gk->usage))
+		rxgk_free(gk);
+}
+
+/*
+ * Transport key derivation function.
+ *
+ *	TK = random-to-key(PRF+(K0, L,
+ *				epoch || cid || start_time || key_number))
+ *	[tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3]
+ *
+ * The five 32-bit words of connection information are assembled in a
+ * temporary buffer and fed, together with K0 from @rxgk, into PRF+ to
+ * produce key_bytes of output in @TK.  Returns 0 or a negative error code.
+ */
+static int rxgk_derive_transport_key(struct rxrpc_connection *conn,
+				     struct rxgk_context *gk,
+				     const struct rxgk_key *rxgk,
+				     struct krb5_buffer *TK,
+				     gfp_t gfp)
+{
+	const struct krb5_enctype *krb5 = gk->krb5;
+	struct krb5_buffer conn_info;
+	__be32 *word;
+	u8 *scratch;
+	int ret;
+
+	_enter("");
+
+	conn_info.len = sizeof(__be32) * 5;
+
+	scratch = kzalloc(round16(conn_info.len), gfp);
+	if (!scratch)
+		return -ENOMEM;
+
+	conn_info.data = scratch;
+
+	word = (__be32 *)conn_info.data;
+	*word++ = htonl(conn->proto.epoch);
+	*word++ = htonl(conn->proto.cid);
+	*word++ = htonl(conn->rxgk.start_time >> 32);	/* start_time - msw */
+	*word++ = htonl(conn->rxgk.start_time >> 0);	/*            - lsw */
+	*word++ = htonl(gk->key_number);
+
+	ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, krb5->key_bytes,
+				       &conn_info, TK, gfp);
+	kfree_sensitive(scratch);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Set up the ciphers for the usage keys.
+ *
+ * The transport key is derived into a temporary buffer and used to prepare
+ * the response-encryption cipher plus, depending on the connection's security
+ * level, either a pair of checksum hashes or a pair of packet ciphers.  The
+ * service end transmits with the server constants and receives with the
+ * client ones; the client end does the reverse.  The temporary transport key
+ * is wiped before returning.
+ */
+static int rxgk_set_up_ciphers(struct rxrpc_connection *conn,
+			       struct rxgk_context *gk,
+			       const struct rxgk_key *rxgk,
+			       gfp_t gfp)
+{
+	const struct krb5_enctype *krb5 = gk->krb5;
+	const bool service = rxrpc_conn_is_service(conn);
+	const unsigned int tx_mic_usage =
+		service ? RXGK_SERVER_MIC_PACKET : RXGK_CLIENT_MIC_PACKET;
+	const unsigned int rx_mic_usage =
+		service ? RXGK_CLIENT_MIC_PACKET : RXGK_SERVER_MIC_PACKET;
+	const unsigned int tx_enc_usage =
+		service ? RXGK_SERVER_ENC_PACKET : RXGK_CLIENT_ENC_PACKET;
+	const unsigned int rx_enc_usage =
+		service ? RXGK_CLIENT_ENC_PACKET : RXGK_SERVER_ENC_PACKET;
+	struct crypto_shash *shash;
+	struct crypto_aead *aead;
+	struct krb5_buffer TK;
+	u8 *tk_buf;
+	int ret;
+
+	tk_buf = kzalloc(krb5->key_bytes, gfp);
+	if (!tk_buf)
+		return -ENOMEM;
+
+	TK.len = krb5->key_bytes;
+	TK.data = tk_buf;
+
+	ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp);
+	if (ret < 0)
+		goto out;
+
+	/* The response cipher is needed whatever the security level. */
+	aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp);
+	if (IS_ERR(aead)) {
+		ret = PTR_ERR(aead);
+		goto out;
+	}
+	gk->resp_enc = aead;
+
+	if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len ||
+	    crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) {
+		pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n",
+			  crypto_aead_blocksize(gk->resp_enc), krb5->block_len,
+			  crypto_aead_authsize(gk->resp_enc), krb5->cksum_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	switch (conn->security_level) {
+	case RXRPC_SECURITY_AUTH:
+		shash = crypto_krb5_prepare_checksum(krb5, &TK, tx_mic_usage, gfp);
+		if (IS_ERR(shash)) {
+			ret = PTR_ERR(shash);
+			goto out;
+		}
+		gk->tx_Kc = shash;
+
+		shash = crypto_krb5_prepare_checksum(krb5, &TK, rx_mic_usage, gfp);
+		if (IS_ERR(shash)) {
+			ret = PTR_ERR(shash);
+			goto out;
+		}
+		gk->rx_Kc = shash;
+		break;
+
+	case RXRPC_SECURITY_ENCRYPT:
+		aead = crypto_krb5_prepare_encryption(krb5, &TK, tx_enc_usage, gfp);
+		if (IS_ERR(aead)) {
+			ret = PTR_ERR(aead);
+			goto out;
+		}
+		gk->tx_enc = aead;
+
+		aead = crypto_krb5_prepare_encryption(krb5, &TK, rx_enc_usage, gfp);
+		if (IS_ERR(aead)) {
+			ret = PTR_ERR(aead);
+			goto out;
+		}
+		gk->rx_enc = aead;
+		break;
+
+	default:
+		/* No per-packet keys are set up for other levels. */
+		break;
+	}
+
+	ret = 0;
+out:
+	kfree_sensitive(tk_buf);
+	return ret;
+}
+
+/*
+ * Derive a transport key for a connection and then derive a bunch of usage
+ * keys from it and set up ciphers using them.
+ *
+ * @key supplies K0, the encoding type and the byte/time lifetimes;
+ * @key_number is the rekeying number to bind into the derivation and @gfp
+ * governs all the allocations made, including that of the context itself.
+ *
+ * Returns a new context holding one reference, to be released with
+ * rxgk_put(), or an ERR_PTR(): -ENOMEM on allocation failure, -ENOPKG if the
+ * encoding type is unsupported or whatever error cipher setup produced.
+ */
+struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn,
+						 const struct rxgk_key *key,
+						 unsigned int key_number,
+						 gfp_t gfp)
+{
+	struct rxgk_context *gk;
+	unsigned long lifetime;
+	int ret = -ENOPKG;
+
+	_enter("");
+
+	/* Honour the caller's allocation constraints rather than assuming
+	 * that GFP_KERNEL is safe in their context.
+	 */
+	gk = kzalloc(sizeof(*gk), gfp);
+	if (!gk)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&gk->usage, 1);
+	gk->key = key;
+	gk->key_number = key_number;
+
+	gk->krb5 = crypto_krb5_find_enctype(key->enctype);
+	if (!gk->krb5)
+		goto err_tk;
+
+	ret = rxgk_set_up_ciphers(conn, gk, key, gfp);
+	if (ret)
+		goto err_tk;
+
+	/* Set the remaining number of bytes encrypted with this key that may
+	 * be transmitted before rekeying.  Note that the spec has been
+	 * interpreted differently on this point...
+	 */
+	switch (key->bytelife) {
+	case 0:
+	case 63:
+		gk->bytes_remaining = LLONG_MAX;
+		break;
+	case 1 ... 62:
+		/* Treated as a power-of-two exponent */
+		gk->bytes_remaining = 1LL << key->bytelife;
+		break;
+	default:
+		/* Treated as a literal byte count */
+		gk->bytes_remaining = key->bytelife;
+		break;
+	}
+
+	/* Set the time after which rekeying must occur */
+	if (key->lifetime) {
+		/* Clamp so that the conversion to jiffies can't overflow */
+		lifetime = min_t(u64, key->lifetime, INT_MAX / HZ);
+		lifetime *= HZ;
+	} else {
+		lifetime = MAX_JIFFY_OFFSET;
+	}
+	gk->expiry = jiffies + lifetime;
+	return gk;
+
+err_tk:
+	rxgk_put(gk);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Use the server secret key to set up the ciphers that will be used to extract
+ * the token from a response packet.
+ *
+ * On success, 0 is returned with the encoding type descriptor in *@_krb5 and
+ * the prepared cipher in *@token_aead.  -ENOPKG is returned if @enctype isn't
+ * known; any error from preparing the cipher is passed through.  Neither
+ * output is written on failure.
+ */
+int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key,
+			     struct crypto_aead **token_aead,
+			     unsigned int enctype,
+			     const struct krb5_enctype **_krb5,
+			     gfp_t gfp)
+{
+	const struct krb5_enctype *etype = crypto_krb5_find_enctype(enctype);
+	struct crypto_aead *tfm;
+
+	if (!etype)
+		return -ENOPKG;
+
+	tfm = crypto_krb5_prepare_encryption(etype, server_key,
+					     RXGK_SERVER_ENC_TOKEN, gfp);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	*_krb5 = etype;
+	*token_aead = tfm;
+	return 0;
+}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6cb37b0eb77f..3657c0661cdc 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -177,8 +177,10 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem
if (!txb)
return NULL;
- txb->offset += shdr;
- txb->space = part;
+ txb->crypto_header = 0;
+ txb->sec_header = shdr;
+ txb->offset += shdr;
+ txb->space = part;
return txb;
}
@@ -683,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
serial = rxrpc_get_next_serial(conn);
whdr.serial = htonl(serial);
+ trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce);
+
ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
@@ -698,62 +702,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
}
/*
- * send a Kerberos security response
- */
-static int rxkad_send_response(struct rxrpc_connection *conn,
- struct rxrpc_host_header *hdr,
- struct rxkad_response *resp,
- const struct rxkad_key *s2)
-{
- struct rxrpc_wire_header whdr;
- struct msghdr msg;
- struct kvec iov[3];
- size_t len;
- u32 serial;
- int ret;
-
- _enter("");
-
- msg.msg_name = &conn->peer->srx.transport;
- msg.msg_namelen = conn->peer->srx.transport_len;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = 0;
-
- memset(&whdr, 0, sizeof(whdr));
- whdr.epoch = htonl(hdr->epoch);
- whdr.cid = htonl(hdr->cid);
- whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
- whdr.flags = conn->out_clientflag;
- whdr.securityIndex = hdr->securityIndex;
- whdr.serviceId = htons(hdr->serviceId);
-
- iov[0].iov_base = &whdr;
- iov[0].iov_len = sizeof(whdr);
- iov[1].iov_base = resp;
- iov[1].iov_len = sizeof(*resp);
- iov[2].iov_base = (void *)s2->ticket;
- iov[2].iov_len = s2->ticket_len;
-
- len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
-
- serial = rxrpc_get_next_serial(conn);
- whdr.serial = htonl(serial);
-
- rxrpc_local_dont_fragment(conn->local, false);
- ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len);
- if (ret < 0) {
- trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_point_rxkad_response);
- return -EAGAIN;
- }
-
- conn->peer->last_tx_at = ktime_get_seconds();
- _leave(" = 0");
- return 0;
-}
-
-/*
* calculate the response checksum
*/
static void rxkad_calc_response_checksum(struct rxkad_response *response)
@@ -772,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response)
* encrypt the response packet
*/
static int rxkad_encrypt_response(struct rxrpc_connection *conn,
- struct rxkad_response *resp,
+ struct sk_buff *response,
const struct rxkad_key *s2)
{
struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg[1];
+ size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted);
+ int ret;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ ret = skb_to_sgvec(response, sg,
+ sizeof(struct rxrpc_wire_header) +
+ offsetof(struct rxkad_response, encrypted), encsize);
+ if (ret < 0)
+ return ret;
req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS);
if (!req)
@@ -786,89 +743,206 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn,
/* continue encrypting from where we left off */
memcpy(&iv, s2->session_key, sizeof(iv));
- sg_init_table(sg, 1);
- sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted));
skcipher_request_set_sync_tfm(req, conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
- crypto_skcipher_encrypt(req);
+ skcipher_request_set_crypt(req, sg, sg, encsize, iv.x);
+ ret = crypto_skcipher_encrypt(req);
skcipher_request_free(req);
- return 0;
+ return ret;
}
/*
- * respond to a challenge packet
+ * Validate a challenge packet.
*/
-static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
- struct sk_buff *skb)
+static bool rxkad_validate_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
{
- const struct rxrpc_key_token *token;
struct rxkad_challenge challenge;
- struct rxkad_response *resp;
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- u32 version, nonce, min_level;
- int ret = -EPROTO;
+ u32 version, min_level;
+ int ret;
_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
- if (!conn->key)
- return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
- rxkad_abort_chall_no_key);
+ if (!conn->key) {
+ rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+ rxkad_abort_chall_no_key);
+ return false;
+ }
ret = key_validate(conn->key);
- if (ret < 0)
- return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret,
- rxkad_abort_chall_key_expired);
+ if (ret < 0) {
+ rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret,
+ rxkad_abort_chall_key_expired);
+ return false;
+ }
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
- &challenge, sizeof(challenge)) < 0)
- return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
- rxkad_abort_chall_short);
+ &challenge, sizeof(challenge)) < 0) {
+ rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
+ rxkad_abort_chall_short);
+ return false;
+ }
version = ntohl(challenge.version);
- nonce = ntohl(challenge.nonce);
+ sp->chall.rxkad_nonce = ntohl(challenge.nonce);
min_level = ntohl(challenge.min_level);
- trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level);
+ trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version,
+ sp->chall.rxkad_nonce, min_level);
+
+ if (version != RXKAD_VERSION) {
+ rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
+ rxkad_abort_chall_version);
+ return false;
+ }
- if (version != RXKAD_VERSION)
- return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
- rxkad_abort_chall_version);
+ if (conn->security_level < min_level) {
+ rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES,
+ rxkad_abort_chall_level);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Insert the header into the response.
+ */
+static noinline
+int rxkad_insert_response_header(struct rxrpc_connection *conn,
+ const struct rxrpc_key_token *token,
+ struct sk_buff *challenge,
+ struct sk_buff *response,
+ size_t *offset)
+{
+ struct rxrpc_skb_priv *csp = rxrpc_skb(challenge);
+ struct {
+ struct rxrpc_wire_header whdr;
+ struct rxkad_response resp;
+ } h;
+ int ret;
- if (conn->security_level < min_level)
- return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES,
- rxkad_abort_chall_level);
+ h.whdr.epoch = htonl(conn->proto.epoch);
+ h.whdr.cid = htonl(conn->proto.cid);
+ h.whdr.callNumber = 0;
+ h.whdr.serial = 0;
+ h.whdr.seq = 0;
+ h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
+ h.whdr.flags = conn->out_clientflag;
+ h.whdr.userStatus = 0;
+ h.whdr.securityIndex = conn->security_ix;
+ h.whdr.cksum = 0;
+ h.whdr.serviceId = htons(conn->service_id);
+ h.resp.version = htonl(RXKAD_VERSION);
+ h.resp.__pad = 0;
+ h.resp.encrypted.epoch = htonl(conn->proto.epoch);
+ h.resp.encrypted.cid = htonl(conn->proto.cid);
+ h.resp.encrypted.checksum = 0;
+ h.resp.encrypted.securityIndex = htonl(conn->security_ix);
+ h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
+ h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
+ h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
+ h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
+ h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1);
+ h.resp.encrypted.level = htonl(conn->security_level);
+ h.resp.kvno = htonl(token->kad->kvno);
+ h.resp.ticket_len = htonl(token->kad->ticket_len);
+
+ rxkad_calc_response_checksum(&h.resp);
+
+ ret = skb_store_bits(response, *offset, &h, sizeof(h));
+ *offset += sizeof(h);
+ return ret;
+}
+
+/*
+ * respond to a challenge packet
+ */
+static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *challenge)
+{
+ const struct rxrpc_key_token *token;
+ struct rxrpc_skb_priv *csp, *rsp;
+ struct sk_buff *response;
+ size_t len, offset = 0;
+ int ret = -EPROTO;
+
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+ ret = key_validate(conn->key);
+ if (ret < 0)
+ return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret,
+ rxkad_abort_chall_key_expired);
token = conn->key->payload.data[0];
/* build the response packet */
- resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
- if (!resp)
- return -ENOMEM;
+ len = sizeof(struct rxrpc_wire_header) +
+ sizeof(struct rxkad_response) +
+ token->kad->ticket_len;
+
+ response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS);
+ if (!response)
+ goto error;
+ rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad);
+ response->len = len;
+ response->data_len = len;
+
+ offset = 0;
+ ret = rxkad_insert_response_header(conn, token, challenge, response,
+ &offset);
+ if (ret < 0)
+ goto error;
+
+ ret = rxkad_encrypt_response(conn, response, token->kad);
+ if (ret < 0)
+ goto error;
+
+ ret = skb_store_bits(response, offset, token->kad->ticket,
+ token->kad->ticket_len);
+ if (ret < 0)
+ goto error;
- resp->version = htonl(RXKAD_VERSION);
- resp->encrypted.epoch = htonl(conn->proto.epoch);
- resp->encrypted.cid = htonl(conn->proto.cid);
- resp->encrypted.securityIndex = htonl(conn->security_ix);
- resp->encrypted.inc_nonce = htonl(nonce + 1);
- resp->encrypted.level = htonl(conn->security_level);
- resp->kvno = htonl(token->kad->kvno);
- resp->ticket_len = htonl(token->kad->ticket_len);
- resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
- resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
- resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
- resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
-
- /* calculate the response checksum and then do the encryption */
- rxkad_calc_response_checksum(resp);
- ret = rxkad_encrypt_response(conn, resp, token->kad);
- if (ret == 0)
- ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
- kfree(resp);
+ csp = rxrpc_skb(challenge);
+ rsp = rxrpc_skb(response);
+ rsp->resp.len = len;
+ rsp->resp.challenge_serial = csp->hdr.serial;
+ rxrpc_post_response(conn, response);
+ response = NULL;
+ ret = 0;
+
+error:
+ rxrpc_free_skb(response, rxrpc_skb_put_response);
return ret;
}
/*
+ * RxKAD does automatic response only as there's nothing to manage that isn't
+ * already in the key.
+ */
+static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+ struct msghdr *msg)
+{
+ return -EINVAL;
+}
+
+/**
+ * rxkad_kernel_respond_to_challenge - Respond to a challenge
+ * @challenge: The challenge to respond to
+ *
+ * Allow a kernel application to respond to a CHALLENGE.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
+ */
+int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge)
+{
+ struct rxrpc_skb_priv *csp = rxrpc_skb(challenge);
+
+ return rxkad_respond_to_challenge(csp->chall.conn, challenge);
+}
+EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge);
+
+/*
* decrypt the kerberos IV ticket in the response
*/
static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
@@ -1276,6 +1350,8 @@ const struct rxrpc_security rxkad = {
.verify_packet = rxkad_verify_packet,
.free_call_crypto = rxkad_free_call_crypto,
.issue_challenge = rxkad_issue_challenge,
+ .validate_challenge = rxkad_validate_challenge,
+ .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
.verify_response = rxkad_verify_response,
.clear = rxkad_clear,
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index e848a4777b8c..0377301156b0 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "rxperf: " fmt
#include <linux/module.h>
#include <linux/slab.h>
+#include <crypto/krb5.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
@@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock,
RXPERF_CALL_SV_AWAIT_ACK);
}
+static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = {
+ .notify_new_call = rxperf_rx_new_call,
+ .discard_new_call = rxperf_rx_discard_new_call,
+ .user_attach_call = rxperf_rx_attach,
+};
+
/*
* Charge the incoming call preallocation.
*/
@@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work)
if (rxrpc_kernel_charge_accept(rxperf_socket,
rxperf_notify_rx,
- rxperf_rx_attach,
(unsigned long)call,
GFP_KERNEL,
call->debug_id) < 0)
@@ -209,8 +215,7 @@ static int rxperf_open_socket(void)
if (ret < 0)
goto error_2;
- rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call,
- rxperf_rx_discard_new_call);
+ rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops);
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
@@ -546,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call)
}
/*
- * Add a key to the security keyring.
+ * Add an rxkad key to the security keyring.
*/
-static int rxperf_add_key(struct key *keyring)
+static int rxperf_add_rxkad_key(struct key *keyring)
{
key_ref_t kref;
int ret;
@@ -574,6 +579,47 @@ static int rxperf_add_key(struct key *keyring)
return ret;
}
+#ifdef CONFIG_RXGK
+/*
+ * Add a yfs-rxgk key to the security keyring.
+ */
+static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype)
+{
+ const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype);
+ key_ref_t kref;
+ char name[64];
+ int ret;
+ u8 key[32];
+
+ if (!krb5 || krb5->key_len > sizeof(key))
+ return 0;
+
+ /* The key is just { 0, 1, 2, 3, 4, ... } */
+ for (int i = 0; i < krb5->key_len; i++)
+ key[i] = i;
+
+ sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype);
+
+ kref = key_create_or_update(make_key_ref(keyring, true),
+ "rxrpc_s", name,
+ key, krb5->key_len,
+ KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+
+ if (IS_ERR(kref)) {
+ pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref));
+ return PTR_ERR(kref);
+ }
+
+ ret = key_link(keyring, key_ref_to_ptr(kref));
+ if (ret < 0)
+ pr_err("Can't link rxperf server key: %d\n", ret);
+ key_ref_put(kref);
+ return ret;
+}
+#endif
+
/*
* Initialise the rxperf server.
*/
@@ -603,9 +649,29 @@ static int __init rxperf_init(void)
goto error_keyring;
}
rxperf_sec_keyring = keyring;
- ret = rxperf_add_key(keyring);
+ ret = rxperf_add_rxkad_key(keyring);
+ if (ret < 0)
+ goto error_key;
+#ifdef CONFIG_RXGK
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC);
+ if (ret < 0)
+ goto error_key;
+ ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC);
if (ret < 0)
goto error_key;
+#endif
ret = rxperf_open_socket();
if (ret < 0)
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index 9784adc8f275..078d91a6b77f 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = {
#ifdef CONFIG_RXKAD
[RXRPC_SECURITY_RXKAD] = &rxkad,
#endif
+#ifdef CONFIG_RXGK
+ [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs,
+#endif
};
int __init rxrpc_init_security(void)
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 84dc6c94f23b..ebbb78b842de 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -607,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
static struct rxrpc_call *
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
struct rxrpc_send_params *p)
- __releases(&rx->sk.sk_lock.slock)
+ __releases(&rx->sk.sk_lock)
__acquires(&call->user_mutex)
{
struct rxrpc_conn_parameters cp;
@@ -657,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
* - the socket may be either a client socket or a server socket
*/
int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
- __releases(&rx->sk.sk_lock.slock)
{
struct rxrpc_call *call;
bool dropped_lock = false;
@@ -759,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
if (rxrpc_call_is_complete(call)) {
/* it's too late for this call */
ret = -ESHUTDOWN;
- } else if (p.command == RXRPC_CMD_SEND_ABORT) {
+ goto out_put_unlock;
+ }
+
+ switch (p.command) {
+ case RXRPC_CMD_SEND_ABORT:
rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED,
rxrpc_abort_call_sendmsg);
ret = 0;
- } else if (p.command != RXRPC_CMD_SEND_DATA) {
- ret = -EINVAL;
- } else {
+ break;
+ case RXRPC_CMD_SEND_DATA:
ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
out_put_unlock:
@@ -794,6 +800,8 @@ error_release_sock:
* appropriate to sending data. No control data should be supplied in @msg,
* nor should an address be supplied. MSG_MORE should be flagged if there's
* more data to come, otherwise this data will end the transmission phase.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
*/
int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call,
struct msghdr *msg, size_t len,
@@ -829,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data);
* @error: Local error value
* @why: Indication as to why.
*
- * Allow a kernel service to abort a call, if it's still in an abortable state
- * and return true if the call was aborted, false if it was already complete.
+ * Allow a kernel service to abort a call if it's still in an abortable state.
+ *
+ * Return: %true if the call was aborted, %false if it was already complete.
*/
bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call,
u32 abort_code, int error, enum rxrpc_abort_reason why)
diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c
index e51940589ee5..36b05fd842a7 100644
--- a/net/rxrpc/server_key.c
+++ b/net/rxrpc/server_key.c
@@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen)
*
* Set the server security keyring on an rxrpc socket. This is used to provide
* the encryption keys for a kernel service.
+ *
+ * Return: %0 if successful and a negative error code otherwise.
*/
int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring)
{
@@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring)
return ret;
}
EXPORT_SYMBOL(rxrpc_sock_set_security_keyring);
+
+/**
+ * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service
+ * @sk: The socket to set the flag on
+ * @set: True to set, false to clear the flag
+ *
+ * Set the flag on an rxrpc socket to say that the caller wants to manage the
+ * RESPONSE packet and the user-defined data it may contain. Setting this
+ * means that recvmsg() will return messages with RXRPC_CHALLENGED in the
+ * control message buffer containing information about the challenge.
+ *
+ * The user should respond to the challenge by passing RXRPC_RESPOND or
+ * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call.
+ * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be
+ * included to indicate the parts the user wants to supply.
+ *
+ * The server will be passed the response data with a RXRPC_RESPONDED control
+ * message when it gets the first data from each call.
+ *
+ * Note that this is only honoured by security classes that need auxiliary data
+ * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond
+ * without consulting userspace.
+ *
+ * Return: The previous setting.
+ */
+int rxrpc_sock_set_manage_response(struct sock *sk, bool set)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret;
+
+ lock_sock(sk);
+ ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ if (set)
+ set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ else
+ clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags);
+ release_sock(sk);
+ return ret;
+}
+EXPORT_SYMBOL(rxrpc_sock_set_manage_response);
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c550991d48fa..29767038691a 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -60,14 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
return txb;
}
-void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
-{
- int r;
-
- __refcount_inc(&txb->ref, &r);
- trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what);
-}
-
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
{
int r = refcount_read(&txb->ref);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a800127effcd..ad914d2b2e22 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -403,6 +403,18 @@ config NET_SCH_ETS
If unsure, say N.
+config NET_SCH_BPF
+ bool "BPF-based Qdisc"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ help
+ This option allows BPF-based queueing disciplines. With BPF struct_ops,
+ users can implement supported operators in Qdisc_ops using BPF programs.
+ The queue holding skb can be built with BPF maps or graphs.
+
+ Say Y here if you want to use BPF-based Qdisc.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
help
@@ -784,7 +796,7 @@ config NET_ACT_SKBEDIT
config NET_ACT_CSUM
tristate "Checksum Updating"
depends on NET_CLS_ACT && INET
- select CRC32
+ select NET_CRC32C
help
Say Y here to update some common checksum after some direct
packet alterations.
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 82c3f78ca486..904d784902d1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 839790043256..057e20cef375 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1461,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
- struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 2];
struct tc_action *act;
size_t sz = 0;
int err;
int i;
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL,
extack);
if (err < 0)
return err;
+ /* The nested attributes are parsed as types, but they are really an
+ * array of actions. So we parse one more than we can handle, and return
+ * an error if the last one is set (as that indicates that the request
+ * contained more than the maximum number of actions).
+ */
+ if (tb[TCA_ACT_MAX_PRIO + 1]) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Only %d actions supported per filter",
+ TCA_ACT_MAX_PRIO);
+ return -EINVAL;
+ }
+
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
struct tc_action_ops *a_o;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5b3814365924..5f01f567c934 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
#define MIRRED_NEST_LIMIT 4
-static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
+
+#ifndef CONFIG_PREEMPT_RT
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
+}
+
+#else
+/* PREEMPT_RT variant: the nesting counter is kept in the current task
+ * (current->net_xmit) instead of in per-CPU softnet_data.
+ *
+ * Return: the nesting level after the increment.  Pre-increment is used so
+ * the result matches __this_cpu_inc_return() in the !PREEMPT_RT variant;
+ * with post-increment the caller's "nest_level > MIRRED_NEST_LIMIT" check
+ * would admit one extra level of recursion on RT kernels.
+ */
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return ++current->net_xmit.sched_mirred_nest;
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ current->net_xmit.sched_mirred_nest--;
+}
+#endif
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
int m_eaction;
u32 blockid;
- nest_level = __this_cpu_inc_return(mirred_nest_level);
+ nest_level = tcf_mirred_nest_level_inc_return();
if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
@@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
retval);
dec_nest_level:
- __this_cpu_dec(mirred_nest_level);
+ tcf_mirred_nest_level_dec();
return retval;
}
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
new file mode 100644
index 000000000000..7ea8b54b2ab1
--- /dev/null
+++ b/net/sched/bpf_qdisc.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void)))
+#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void)))
+
+static struct bpf_struct_ops bpf_Qdisc_ops;
+
+struct bpf_sched_data {
+ struct qdisc_watchdog watchdog;
+};
+
+struct bpf_sk_buff_ptr {
+ struct sk_buff *skb;
+};
+
+static int bpf_qdisc_init(struct btf *btf)
+{
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff)
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr)
+
+static bool bpf_qdisc_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ struct btf *btf = prog->aux->attach_btf;
+ u32 arg;
+
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) {
+ if (arg == 2 && type == BPF_READ) {
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ info->btf = btf;
+ info->btf_id = bpf_sk_buff_ptr_ids[0];
+ return true;
+ }
+ }
+
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct Qdisc, limit):
+ *end = offsetofend(struct Qdisc, limit);
+ break;
+ case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen):
+ *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen);
+ break;
+ case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1:
+ *end = offsetofend(struct Qdisc, qstats);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, size_t *end)
+{
+ switch (off) {
+ case offsetof(struct sk_buff, tstamp):
+ *end = offsetofend(struct sk_buff, tstamp);
+ break;
+ case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ...
+ offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb,
+ data[QDISC_CB_PRIV_LEN - 1]):
+ *end = offsetof(struct sk_buff, cb) +
+ offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]);
+ break;
+ default:
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t, *skbt, *qdisct;
+ size_t end;
+ int err;
+
+ skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]);
+ qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+
+ if (t == skbt) {
+ err = bpf_qdisc_sk_buff_access(log, reg, off, &end);
+ } else if (t == qdisct) {
+ err = bpf_qdisc_qdisc_access(log, reg, off, &end);
+ } else {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ if (err) {
+ bpf_log(log, "no write support to %s at off %d\n",
+ btf_name_by_offset(reg->btf, t->name_off), off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of %s ended at %zu\n",
+ off, size, btf_name_by_offset(reg->btf, t->name_off), end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+BTF_ID_LIST(bpf_qdisc_init_prologue_ids)
+BTF_ID(func, bpf_qdisc_init_prologue)
+
+static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init))
+ return 0;
+
+ /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx".
+ * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_init_prologue(r1, r2);
+ * if r0 == 0 goto pc+1;
+ * BPF_EXIT;
+ * r1 = r6; // r1 will be "u64 *ctx".
+ */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
+}
+
+BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids)
+BTF_ID(func, bpf_qdisc_reset_destroy_epilogue)
+
+static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog,
+ s16 ctx_stack_off)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) &&
+ prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy))
+ return 0;
+
+ /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx"
+ * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
+ * r0 = bpf_qdisc_reset_destroy_epilogue(r1);
+ * BPF_EXIT;
+ */
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
+__bpf_kfunc_start_defs();
+
+/* bpf_skb_get_hash - Get the flow hash of an skb.
+ * @skb: The skb to get the flow hash from.
+ */
+__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb)
+{
+ return skb_get_hash(skb);
+}
+
+/* bpf_kfree_skb - Release an skb's reference and drop it immediately.
+ * @skb: The skb whose reference is to be released and dropped.
+ */
+__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+}
+
+/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list.
+ * @skb: The skb whose reference is to be released and dropped.
+ * @to_free_list: The list of skbs to be dropped.
+ */
+__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb,
+ struct bpf_sk_buff_ptr *to_free_list)
+{
+ __qdisc_drop(skb, (struct sk_buff **)to_free_list);
+}
+
+/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer.
+ * @sch: The qdisc to be scheduled.
+ * @expire: The expiry time of the timer.
+ * @delta_ns: The slack range of the timer.
+ */
+__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns);
+}
+
+/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */
+__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch,
+ struct netlink_ext_ack *extack)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *p;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (sch->parent != TC_H_ROOT) {
+ /* If qdisc_lookup() returns NULL, it means .init is called by
+ * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc
+ * has not been added to qdisc_hash yet.
+ */
+ p = qdisc_lookup(dev, TC_H_MAJ(sch->parent));
+ if (p && !(p->flags & TCQ_F_MQROOT)) {
+ NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset
+ * and .destroy
+ */
+__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch)
+{
+ struct bpf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+/* bpf_qdisc_bstats_update - Update Qdisc basic statistics
+ * @sch: The qdisc from which an skb is dequeued.
+ * @skb: The skb to be dequeued.
+ */
+__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb)
+{
+ bstats_update(&sch->bstats, skb);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(qdisc_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(qdisc_kfunc_ids)
+
+BTF_SET_START(qdisc_common_kfunc_set)
+BTF_ID(func, bpf_skb_get_hash)
+BTF_ID(func, bpf_kfree_skb)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_SET_END(qdisc_common_kfunc_set)
+
+BTF_SET_START(qdisc_enqueue_kfunc_set)
+BTF_ID(func, bpf_qdisc_skb_drop)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_SET_END(qdisc_enqueue_kfunc_set)
+
+BTF_SET_START(qdisc_dequeue_kfunc_set)
+BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_ID(func, bpf_qdisc_bstats_update)
+BTF_SET_END(qdisc_dequeue_kfunc_set)
+
+enum qdisc_ops_kf_flags {
+ QDISC_OPS_KF_COMMON = 0,
+ QDISC_OPS_KF_ENQUEUE = 1 << 0,
+ QDISC_OPS_KF_DEQUEUE = 1 << 1,
+};
+
+static const u32 qdisc_ops_context_flags[] = {
+ [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE,
+ [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE,
+ [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON,
+ [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON,
+};
+
+static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ u32 moff, flags;
+
+ if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id))
+ return 0;
+
+ if (prog->aux->st_ops != &bpf_Qdisc_ops)
+ return -EACCES;
+
+ moff = prog->aux->attach_st_ops_member_off;
+ flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)];
+
+ if ((flags & QDISC_OPS_KF_ENQUEUE) &&
+ btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id))
+ return 0;
+
+ if ((flags & QDISC_OPS_KF_DEQUEUE) &&
+ btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id))
+ return 0;
+
+ if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id))
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &qdisc_kfunc_ids,
+ .filter = bpf_qdisc_kfunc_filter,
+};
+
+static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = bpf_qdisc_is_valid_access,
+ .btf_struct_access = bpf_qdisc_btf_struct_access,
+ .gen_prologue = bpf_qdisc_gen_prologue,
+ .gen_epilogue = bpf_qdisc_gen_epilogue,
+};
+
+static int bpf_qdisc_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct Qdisc_ops *uqdisc_ops;
+ struct Qdisc_ops *qdisc_ops;
+ u32 moff;
+
+ uqdisc_ops = (const struct Qdisc_ops *)udata;
+ qdisc_ops = (struct Qdisc_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct Qdisc_ops, priv_size):
+ if (uqdisc_ops->priv_size)
+ return -EINVAL;
+ qdisc_ops->priv_size = sizeof(struct bpf_sched_data);
+ return 1;
+ case offsetof(struct Qdisc_ops, peek):
+ qdisc_ops->peek = qdisc_peek_dequeued;
+ return 0;
+ case offsetof(struct Qdisc_ops, id):
+ if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id,
+ sizeof(qdisc_ops->id)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_qdisc_reg(void *kdata, struct bpf_link *link)
+{
+ return register_qdisc(kdata);
+}
+
+static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
+{
+ return unregister_qdisc(kdata);
+}
+
+static int bpf_qdisc_validate(void *kdata)
+{
+ struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;
+
+ if (!ops->enqueue || !ops->dequeue || !ops->init ||
+ !ops->reset || !ops->destroy)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ return 0;
+}
+
+static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void Qdisc_ops__reset(struct Qdisc *sch)
+{
+}
+
+static void Qdisc_ops__destroy(struct Qdisc *sch)
+{
+}
+
+static struct Qdisc_ops __bpf_ops_qdisc_ops = {
+ .enqueue = Qdisc_ops__enqueue,
+ .dequeue = Qdisc_ops__dequeue,
+ .init = Qdisc_ops__init,
+ .reset = Qdisc_ops__reset,
+ .destroy = Qdisc_ops__destroy,
+};
+
+static struct bpf_struct_ops bpf_Qdisc_ops = {
+ .verifier_ops = &bpf_qdisc_verifier_ops,
+ .reg = bpf_qdisc_reg,
+ .unreg = bpf_qdisc_unreg,
+ .validate = bpf_qdisc_validate,
+ .init_member = bpf_qdisc_init_member,
+ .init = bpf_qdisc_init,
+ .name = "Qdisc_ops",
+ .cfi_stubs = &__bpf_ops_qdisc_ops,
+ .owner = THIS_MODULE,
+};
+
+BTF_ID_LIST(bpf_sk_buff_dtor_ids)
+BTF_ID(func, bpf_kfree_skb)
+
+static int __init bpf_qdisc_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = {
+ {
+ .btf_id = bpf_sk_buff_ids[0],
+ .kfunc_btf_id = bpf_sk_buff_dtor_ids[0]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors,
+ ARRAY_SIZE(skb_kfunc_dtors),
+ THIS_MODULE);
+ ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops);
+
+ return ret;
+}
+late_initcall(bpf_qdisc_kfunc_init);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f74a097f54ae..c5e3673aadbe 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -25,6 +25,7 @@
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
+#include <linux/bpf.h>
#include <net/netdev_lock.h>
#include <net/net_namespace.h>
@@ -207,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name)
for (q = qdisc_base; q; q = q->next) {
if (!strcmp(name, q->id)) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -237,7 +238,7 @@ int qdisc_set_default(const char *name)
if (ops) {
/* Set new default */
- module_put(default_qdisc_ops->owner);
+ bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner);
default_qdisc_ops = ops;
}
write_unlock(&qdisc_mod_lock);
@@ -359,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
read_lock(&qdisc_mod_lock);
for (q = qdisc_base; q; q = q->next) {
if (nla_strcmp(kind, q->id) == 0) {
- if (!try_module_get(q->owner))
+ if (!bpf_try_module_get(q, q->owner))
q = NULL;
break;
}
@@ -1370,7 +1371,7 @@ err_out3:
netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
err_out2:
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
err_out:
*errp = err;
return NULL;
@@ -1782,7 +1783,7 @@ static void request_qdisc_module(struct nlattr *kind)
ops = qdisc_lookup_ops(kind);
if (ops) {
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
return;
}
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index ce63414185fd..d1d87dce7f3f 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -16,14 +16,18 @@ struct sch_frag_data {
unsigned int l2_len;
u8 l2_data[VLAN_ETH_HLEN];
int (*xmit)(struct sk_buff *skb);
+ local_lock_t bh_lock;
};
-static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+ lockdep_assert_held(&data->bh_lock);
if (skb_cow_head(skb, data->l2_len) < 0) {
kfree_skb(skb);
return -ENOMEM;
@@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
struct rtable sch_frag_rt = { 0 };
unsigned long orig_dst;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
IPCB(skb)->frag_max_size = mru;
ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
unsigned long orig_dst;
struct rt6_info sch_frag_rt;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
@@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else {
net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 514b1b6ac681..08e0e3aff976 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -24,6 +24,7 @@
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
+#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -1001,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
{
struct Qdisc *sch;
- if (!try_module_get(ops->owner)) {
+ if (!bpf_try_module_get(ops, ops->owner)) {
NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
return NULL;
}
sch = qdisc_alloc(dev_queue, ops, extack);
if (IS_ERR(sch)) {
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
return NULL;
}
sch->parent = parentid;
@@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
ops->destroy(qdisc);
lockdep_unregister_key(&qdisc->root_lock_key);
- module_put(ops->owner);
+ bpf_module_put(ops, ops->owner);
netdev_put(dev, &qdisc->dev_tracker);
trace_qdisc_destroy(qdisc);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 7986145a527c..5a7745170e84 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -175,6 +175,11 @@ struct hfsc_sched {
#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+/* Returns true when either
+ *  - @cl has HFSC_FSC set and a non-zero cl_nactive, or
+ *  - @cl has HFSC_RSC set and its el_node is not an empty rb node.
+ * NOTE(review): per the name these conditions stand for "already in the
+ * vt tree" / "already in the eligible tree" - confirm against the tree
+ * insertion code.
+ */
+static bool cl_in_el_or_vttree(struct hfsc_class *cl)
+{
+	if ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive)
+		return true;
+
+	return (cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node);
+}
/*
* eligible tree holds backlogged classes being sorted by their eligible times.
@@ -1040,6 +1045,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl == NULL)
return -ENOBUFS;
+ RB_CLEAR_NODE(&cl->el_node);
+
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
@@ -1572,7 +1579,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
sch->qstats.backlog += len;
sch->q.qlen++;
- if (first && !cl->cl_nactive) {
+ if (first && !cl_in_el_or_vttree(cl)) {
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
if (cl->cl_flags & HFSC_FSC)
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index d18a72df3654..24d5a35ce894 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -7,10 +7,10 @@ menuconfig IP_SCTP
tristate "The SCTP Protocol"
depends on INET
depends on IPV6 || IPV6=n
- select CRC32
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA1
+ select NET_CRC32C
select NET_UDP_TUNNEL
help
Stream Control Transmission Protocol
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 760152e751c7..5793d71852b8 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -736,24 +736,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
return peer;
}
-/* Delete a transport address from an association. */
-void sctp_assoc_del_peer(struct sctp_association *asoc,
- const union sctp_addr *addr)
-{
- struct list_head *pos;
- struct list_head *temp;
- struct sctp_transport *transport;
-
- list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
- transport = list_entry(pos, struct sctp_transport, transports);
- if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
- /* Do book keeping for removing the peer and free it. */
- sctp_assoc_rm_peer(asoc, transport);
- break;
- }
- }
-}
-
/* Lookup a transport by address. */
struct sctp_transport *sctp_assoc_lookup_paddr(
const struct sctp_association *asoc,
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 502095173d88..e6f863c031b4 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -111,7 +111,6 @@ int __init sctp_offload_init(void)
if (ret)
goto ipv4;
- crc32c_csum_stub = &sctp_csum_ops;
return ret;
ipv4:
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index f80208edd6a5..3ead591c72fd 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -115,14 +115,6 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
skb->destructor = sctp_control_release_owner;
}
-/* What was the inbound interface for this chunk? */
-int sctp_chunk_iif(const struct sctp_chunk *chunk)
-{
- struct sk_buff *skb = chunk->skb;
-
- return SCTP_INPUT_CB(skb)->af->skb_iif(skb);
-}
-
/* RFC 2960 3.3.2 Initiation (INIT) (1)
*
* Note 2: The ECN capable field is reserved for future use of
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 53725ee7ba06..1e5739858c20 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5627,7 +5627,8 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv
}
/* Helper routine to branch off an association to a new socket. */
-int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
+static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id,
+ struct socket **sockp)
{
struct sctp_association *asoc = sctp_id2assoc(sk, id);
struct sctp_sock *sp = sctp_sk(sk);
@@ -5675,7 +5676,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
return err;
}
-EXPORT_SYMBOL(sctp_do_peeloff);
static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff,
struct file **newfile, unsigned flags)
@@ -8321,7 +8321,7 @@ static int sctp_hash(struct sock *sk)
static void sctp_unhash(struct sock *sk)
{
- /* STUB */
+ sock_rps_delete_flow(sk);
}
/* Check if port is acceptable. Possibly find first available port.
@@ -9100,7 +9100,8 @@ static void __sctp_write_space(struct sctp_association *asoc)
wq = rcu_dereference(sk->sk_wq);
if (wq) {
if (waitqueue_active(&wq->wait))
- wake_up_interruptible(&wq->wait);
+ wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
/* Note that we try to include the Async I/O support
* here by modeling from the current TCP/UDP code.
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 95696f42647e..d946bfb424c7 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -485,19 +485,6 @@ int strp_init(struct strparser *strp, struct sock *sk,
}
EXPORT_SYMBOL_GPL(strp_init);
-/* Sock process lock held (lock_sock) */
-void __strp_unpause(struct strparser *strp)
-{
- strp->paused = 0;
-
- if (strp->need_bytes) {
- if (strp_peek_len(strp) < strp->need_bytes)
- return;
- }
- strp_read_sock(strp);
-}
-EXPORT_SYMBOL_GPL(__strp_unpause);
-
void strp_unpause(struct strparser *strp)
{
strp->paused = 0;
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 8584893b4785..f4cfe88670f5 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -425,7 +425,7 @@ static void tipc_aead_free(struct rcu_head *rp)
}
free_percpu(aead->tfm_entry);
kfree_sensitive(aead->key);
- kfree(aead);
+ kfree_sensitive(aead);
}
static int tipc_aead_users(struct tipc_aead __rcu *aead)
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 18be6ff4c3db..3ee44d731700 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2228,7 +2228,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
break;
if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
break;
- strncpy(if_name, data, TIPC_MAX_IF_NAME);
+ strscpy(if_name, data, TIPC_MAX_IF_NAME);
/* Update own tolerance if peer indicates a non-zero value */
if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index ccf5e427f43e..cb43f2016a70 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1581,7 +1581,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
tipc_node_read_lock(node);
link = node->links[bearer_id].link;
if (link) {
- strncpy(linkname, tipc_link_name(link), len);
+ strscpy(linkname, tipc_link_name(link), len);
err = 0;
}
tipc_node_read_unlock(node);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 59a64b2ced6e..2e2e9997a68e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -807,6 +807,11 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
spin_unlock(&sk->sk_peer_lock);
}
+/* True when @sk has sk_scm_credentials or sk_scm_pidfd set. */
+static bool unix_may_passcred(const struct sock *sk)
+{
+	if (sk->sk_scm_credentials)
+		return true;
+
+	return !!sk->sk_scm_pidfd;
+}
+
static int unix_listen(struct socket *sock, int backlog)
{
int err;
@@ -997,13 +1002,6 @@ static void unix_close(struct sock *sk, long timeout)
*/
}
-static void unix_unhash(struct sock *sk)
-{
- /* Nothing to do here, unix socket does not need a ->unhash().
- * This is merely for sockmap.
- */
-}
-
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
if (level == SOL_SOCKET) {
@@ -1034,7 +1032,6 @@ struct proto unix_stream_proto = {
.owner = THIS_MODULE,
.obj_size = sizeof(struct unix_sock),
.close = unix_close,
- .unhash = unix_unhash,
.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = unix_stream_bpf_update_proto,
@@ -1065,6 +1062,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sock_init_data(sock, sk);
+ sk->sk_scm_rights = 1;
sk->sk_hash = unix_unbound_hash(sk);
sk->sk_allocation = GFP_KERNEL_ACCOUNT;
sk->sk_write_space = unix_write_space;
@@ -1492,9 +1490,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
if (err)
goto out;
- if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
- !READ_ONCE(unix_sk(sk)->addr)) {
+ if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
@@ -1613,9 +1609,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (err)
goto out;
- if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
- !READ_ONCE(u->addr)) {
+ if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
@@ -1711,10 +1705,12 @@ restart:
/* The way is open! Fastly set all the necessary fields... */
sock_hold(sk);
- unix_peer(newsk) = sk;
- newsk->sk_state = TCP_ESTABLISHED;
- newsk->sk_type = sk->sk_type;
+ unix_peer(newsk) = sk;
+ newsk->sk_state = TCP_ESTABLISHED;
+ newsk->sk_type = sk->sk_type;
+ newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
init_peercred(newsk, &peercred);
+
newu = unix_sk(newsk);
newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1808,17 +1804,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
return 0;
}
-static void unix_sock_inherit_flags(const struct socket *old,
- struct socket *new)
-{
- if (test_bit(SOCK_PASSCRED, &old->flags))
- set_bit(SOCK_PASSCRED, &new->flags);
- if (test_bit(SOCK_PASSPIDFD, &old->flags))
- set_bit(SOCK_PASSPIDFD, &new->flags);
- if (test_bit(SOCK_PASSSEC, &old->flags))
- set_bit(SOCK_PASSSEC, &new->flags);
-}
-
static int unix_accept(struct socket *sock, struct socket *newsock,
struct proto_accept_arg *arg)
{
@@ -1855,7 +1840,6 @@ static int unix_accept(struct socket *sock, struct socket *newsock,
unix_state_lock(tsk);
unix_update_edges(unix_sk(tsk));
newsock->state = SS_CONNECTED;
- unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
unix_state_unlock(tsk);
return 0;
@@ -1964,7 +1948,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
{
int err = 0;
- UNIXCB(skb).pid = get_pid(scm->pid);
+ UNIXCB(skb).pid = get_pid(scm->pid);
UNIXCB(skb).uid = scm->creds.uid;
UNIXCB(skb).gid = scm->creds.gid;
UNIXCB(skb).fp = NULL;
@@ -1976,28 +1960,19 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
return err;
}
-static bool unix_passcred_enabled(const struct socket *sock,
- const struct sock *other)
-{
- return test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags) ||
- !other->sk_socket ||
- test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
- test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
-}
-
/*
* Some apps rely on write() giving SCM_CREDENTIALS
* We include credentials if source or destination socket
* asserted SOCK_PASSCRED.
*/
-static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
- const struct sock *other)
+static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
+ const struct sock *other)
{
if (UNIXCB(skb).pid)
return;
- if (unix_passcred_enabled(sock, other)) {
- UNIXCB(skb).pid = get_pid(task_tgid(current));
+
+ if (unix_may_passcred(sk) || unix_may_passcred(other)) {
+ UNIXCB(skb).pid = get_pid(task_tgid(current));
current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
}
}
@@ -2073,9 +2048,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
- !READ_ONCE(u->addr)) {
+ if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
err = unix_autobind(sk);
if (err)
goto out;
@@ -2192,6 +2165,11 @@ restart_locked:
goto out_unlock;
}
+ if (UNIXCB(skb).fp && !other->sk_scm_rights) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+
if (sk->sk_type != SOCK_SEQPACKET) {
err = security_unix_may_send(sk->sk_socket, other->sk_socket);
if (err)
@@ -2238,7 +2216,8 @@ restart_locked:
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
- maybe_add_creds(skb, sock, other);
+
+ unix_maybe_add_creds(skb, sk, other);
scm_stat_add(other, skb);
skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
@@ -2266,14 +2245,14 @@ out:
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
+static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
struct scm_cookie *scm, bool fds_sent)
{
struct unix_sock *ousk = unix_sk(other);
struct sk_buff *skb;
int err;
- skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
+ skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
return err;
@@ -2292,12 +2271,16 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
if (sock_flag(other, SOCK_DEAD) ||
(other->sk_shutdown & RCV_SHUTDOWN)) {
- unix_state_unlock(other);
err = -EPIPE;
- goto out;
+ goto out_unlock;
}
- maybe_add_creds(skb, sock, other);
+ if (UNIXCB(skb).fp && !other->sk_scm_rights) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+
+ unix_maybe_add_creds(skb, sk, other);
scm_stat_add(other, skb);
spin_lock(&other->sk_receive_queue.lock);
@@ -2310,6 +2293,8 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
other->sk_data_ready(other);
return 0;
+out_unlock:
+ unix_state_unlock(other);
out:
consume_skb(skb);
return err;
@@ -2413,7 +2398,13 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
(other->sk_shutdown & RCV_SHUTDOWN))
goto out_pipe_unlock;
- maybe_add_creds(skb, sock, other);
+ if (UNIXCB(skb).fp && !other->sk_scm_rights) {
+ unix_state_unlock(other);
+ err = -EPERM;
+ goto out_free;
+ }
+
+ unix_maybe_add_creds(skb, sk, other);
scm_stat_add(other, skb);
skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
@@ -2423,7 +2414,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if (msg->msg_flags & MSG_OOB) {
- err = queue_oob(sock, msg, other, &scm, fds_sent);
+ err = queue_oob(sk, msg, other, &scm, fds_sent);
if (err)
goto out_err;
sent++;
@@ -2945,8 +2936,7 @@ unlock:
/* Never glue messages from different writers */
if (!unix_skb_scm_eq(skb, &scm))
break;
- } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags)) {
+ } else if (unix_may_passcred(sk)) {
/* Copy credentials */
scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
unix_set_secdata(&scm, skb);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index fc6afbc8d680..2e7a3034e965 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1013,6 +1013,39 @@ out:
return err;
}
+/* Wait on @sk's wait queue until the transport's unsent_bytes() callback
+ * reports 0, the linger time is used up, or a signal is pending.
+ *
+ * Returns immediately when SOCK_LINGER is not set on @sk, when
+ * sk->sk_lingertime is 0, or when the transport has no unsent_bytes
+ * callback.
+ */
+void vsock_linger(struct sock *sk)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
+	struct vsock_sock *vsk = vsock_sk(sk);
+	long remaining;
+
+	if (!sock_flag(sk, SOCK_LINGER))
+		return;
+
+	remaining = sk->sk_lingertime;
+	if (remaining == 0)
+		return;
+
+	/* SOCK_LINGER support through this helper depends on the transport
+	 * providing `unsent_bytes`: it is the only way used here to tell
+	 * when the socket can be closed.
+	 */
+	unsent_bytes = vsk->transport->unsent_bytes;
+	if (unsent_bytes == NULL)
+		return;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+
+	for (;;) {
+		if (sk_wait_event(sk, &remaining, unsent_bytes(vsk) == 0, &wait))
+			break;
+		if (signal_pending(current) || !remaining)
+			break;
+	}
+
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+EXPORT_SYMBOL_GPL(vsock_linger);
+
static int vsock_shutdown(struct socket *sock, int mode)
{
int err;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d88096..1b5d9896edae 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
uarg = msg_zerocopy_realloc(sk_vsock(vsk),
iter->count,
- NULL);
+ NULL, false);
if (!uarg)
return -1;
@@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
{
if (zcopy)
return __zerocopy_sg_from_iter(info->msg, NULL, skb,
- &info->msg->msg_iter,
- len);
+ &info->msg->msg_iter, len, NULL);
return memcpy_from_msg(skb_put(skb, len), info->msg, len);
}
@@ -441,18 +440,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
u32 len)
{
- if (vvs->rx_bytes + len > vvs->buf_alloc)
+ if (vvs->buf_used + len > vvs->buf_alloc)
return false;
vvs->rx_bytes += len;
+ vvs->buf_used += len;
return true;
}
static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
- u32 len)
+ u32 bytes_read, u32 bytes_dequeued)
{
- vvs->rx_bytes -= len;
- vvs->fwd_cnt += len;
+ vvs->rx_bytes -= bytes_read;
+ vvs->buf_used -= bytes_dequeued;
+ vvs->fwd_cnt += bytes_dequeued;
}
void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb)
@@ -581,11 +582,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
size_t len)
{
struct virtio_vsock_sock *vvs = vsk->trans;
- size_t bytes, total = 0;
struct sk_buff *skb;
u32 fwd_cnt_delta;
bool low_rx_bytes;
int err = -EFAULT;
+ size_t total = 0;
u32 free_space;
spin_lock_bh(&vvs->rx_lock);
@@ -597,6 +598,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
+ size_t bytes, dequeued = 0;
+
skb = skb_peek(&vvs->rx_queue);
bytes = min_t(size_t, len - total,
@@ -620,12 +623,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes;
if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) {
- u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
-
- virtio_transport_dec_rx_pkt(vvs, pkt_len);
+ dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len);
__skb_unlink(skb, &vvs->rx_queue);
consume_skb(skb);
}
+
+ virtio_transport_dec_rx_pkt(vvs, bytes, dequeued);
}
fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
@@ -781,7 +784,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
msg->msg_flags |= MSG_EOR;
}
- virtio_transport_dec_rx_pkt(vvs, pkt_len);
+ virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
kfree_skb(skb);
}
@@ -1192,23 +1195,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
vsock_remove_sock(vsk);
}
-static void virtio_transport_wait_close(struct sock *sk, long timeout)
-{
- if (timeout) {
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- add_wait_queue(sk_sleep(sk), &wait);
-
- do {
- if (sk_wait_event(sk, &timeout,
- sock_flag(sk, SOCK_DONE), &wait))
- break;
- } while (!signal_pending(current) && timeout);
-
- remove_wait_queue(sk_sleep(sk), &wait);
- }
-}
-
static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
bool cancel_timeout)
{
@@ -1278,8 +1264,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk)
if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
(void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
- if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
- virtio_transport_wait_close(sk, sk->sk_lingertime);
+ if (!(current->flags & PF_EXITING))
+ vsock_linger(sk);
if (sock_flag(sk, SOCK_DONE)) {
return true;
@@ -1735,6 +1721,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
struct sock *sk = sk_vsock(vsk);
struct virtio_vsock_hdr *hdr;
struct sk_buff *skb;
+ u32 pkt_len;
int off = 0;
int err;
@@ -1752,7 +1739,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)
vvs->msg_count--;
- virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len));
+ pkt_len = le32_to_cpu(hdr->len);
+ virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
spin_unlock_bh(&vvs->rx_lock);
virtio_transport_send_credit_update(vsk);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f039a7d0d6f7..fd5f79266471 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -469,6 +469,8 @@ nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = {
[NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 },
[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 },
[NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG },
+ [NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID] =
+ NLA_POLICY_MAX(NLA_U8, IEEE80211_MLD_MAX_NUM_LINKS),
};
static const struct nla_policy
@@ -833,6 +835,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
[NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
+ [NL80211_ATTR_EML_CAPABILITY] = { .type = NLA_U16 },
[NL80211_ATTR_PUNCT_BITMAP] =
NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range),
@@ -5523,11 +5526,13 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
struct net_device *dev,
+ unsigned int link_id,
struct nlattr *attrs,
struct cfg80211_mbssid_config *config,
u8 num_elems)
{
struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1];
+ int tx_link_id = -1;
if (!wiphy->mbssid_max_interfaces)
return -EOPNOTSUPP;
@@ -5551,6 +5556,9 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
(!config->index && !num_elems))
return -EINVAL;
+ if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID])
+ tx_link_id = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]);
+
if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) {
u32 tx_ifindex =
nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]);
@@ -5572,10 +5580,25 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
}
config->tx_wdev = tx_netdev->ieee80211_ptr;
+ /* Caller must dev_put() the netdev behind config->tx_wdev from this point */
+
+ if (config->tx_wdev->valid_links) {
+ if (tx_link_id == -1 ||
+ !(config->tx_wdev->valid_links & BIT(tx_link_id)))
+ return -ENOLINK;
+
+ config->tx_link_id = tx_link_id;
+ }
} else {
+ if (tx_link_id >= 0 && tx_link_id != link_id)
+ return -EINVAL;
+
config->tx_wdev = dev->ieee80211_ptr;
}
} else if (!config->index) {
+ if (tx_link_id >= 0 && tx_link_id != link_id)
+ return -EINVAL;
+
config->tx_wdev = dev->ieee80211_ptr;
} else {
return -EINVAL;
@@ -6325,7 +6348,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
}
if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) {
- err = nl80211_parse_mbssid_config(&rdev->wiphy, dev,
+ err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, link_id,
info->attrs[NL80211_ATTR_MBSSID_CONFIG],
&params->mbssid_config,
params->beacon.mbssid_ies ?
@@ -7118,6 +7141,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EINVAL;
}
+ /* Accept EMLSR capabilities only for AP client before association */
+ if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
+ params->eml_cap_present)
+ return -EINVAL;
+
switch (statype) {
case CFG80211_STA_AP_MLME_CLIENT:
/* Use this only for authorizing/unauthorizing a station */
@@ -7473,6 +7501,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
params.link_sta_params.he_6ghz_capa =
nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
+ params.eml_cap_present = true;
+ params.eml_cap =
+ nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]);
+ }
+
if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
params.airtime_weight =
nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
@@ -7631,6 +7665,12 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
+ params.eml_cap_present = true;
+ params.eml_cap =
+ nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]);
+ }
+
if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
params.link_sta_params.he_6ghz_capa =
nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index c5181a9044ad..aa9788f20d0d 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -267,13 +267,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
void xp_clear_dev(struct xsk_buff_pool *pool)
{
+ struct net_device *netdev = pool->netdev;
+
if (!pool->netdev)
return;
+ netdev_lock_ops(netdev);
xp_disable_drv_zc(pool);
xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
- dev_put(pool->netdev);
pool->netdev = NULL;
+ netdev_unlock_ops(netdev);
+ dev_put(netdev);
}
static void xp_release_deferred(struct work_struct *work)
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index d62f76161d83..81fd486b5e56 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -145,10 +145,6 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
return NULL;
}
- /* This skb was already validated on the upper/virtual dev */
- if ((x->xso.dev != dev) && (x->xso.real_dev == dev))
- return skb;
-
local_irq_save(flags);
sd = this_cpu_ptr(&softnet_data);
err = !skb_queue_empty(&sd->xfrm_backlog);
@@ -159,8 +155,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
return skb;
}
- if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
- unlikely(xmit_xfrm_check_overflow(skb)))) {
+ if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) {
struct sk_buff *segs;
/* Packet got rerouted, fixup features and segment it. */
@@ -256,6 +251,11 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
return -EINVAL;
}
+ if (xuo->flags & XFRM_OFFLOAD_INBOUND && x->if_id) {
+ NL_SET_ERR_MSG(extack, "XFRM if_id is not supported in RX path");
+ return -EINVAL;
+ }
+
is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;
/* We don't yet support TFC padding. */
@@ -314,7 +314,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
xso->dev = dev;
netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC);
- xso->real_dev = dev;
if (xuo->flags & XFRM_OFFLOAD_INBOUND)
xso->dir = XFRM_DEV_OFFLOAD_IN;
@@ -326,11 +325,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
else
xso->type = XFRM_DEV_OFFLOAD_CRYPTO;
- err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack);
+ err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack);
if (err) {
xso->dev = NULL;
xso->dir = 0;
- xso->real_dev = NULL;
netdev_put(dev, &xso->dev_tracker);
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
@@ -378,7 +376,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
xdo->dev = dev;
netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
- xdo->real_dev = dev;
xdo->type = XFRM_DEV_OFFLOAD_PACKET;
switch (dir) {
case XFRM_POLICY_IN:
@@ -400,7 +397,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack);
if (err) {
xdo->dev = NULL;
- xdo->real_dev = NULL;
xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
xdo->dir = 0;
netdev_put(dev, &xdo->dev_tracker);
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 622445f041d3..cb1e12740c87 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -952,32 +952,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
.get_link_net = xfrmi_get_link_net,
};
-static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list,
- struct list_head *dev_to_kill)
+static void __net_exit xfrmi_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- struct net *net;
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *xi;
+ int i;
- ASSERT_RTNL();
- list_for_each_entry(net, net_exit_list, exit_list) {
- struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
- struct xfrm_if __rcu **xip;
- struct xfrm_if *xi;
- int i;
-
- for (i = 0; i < XFRMI_HASH_SIZE; i++) {
- for (xip = &xfrmn->xfrmi[i];
- (xi = rtnl_dereference(*xip)) != NULL;
- xip = &xi->next)
- unregister_netdevice_queue(xi->dev, dev_to_kill);
- }
- xi = rtnl_dereference(xfrmn->collect_md_xfrmi);
- if (xi)
+ for (i = 0; i < XFRMI_HASH_SIZE; i++) {
+ for (xip = &xfrmn->xfrmi[i];
+ (xi = rtnl_net_dereference(net, *xip)) != NULL;
+ xip = &xi->next)
unregister_netdevice_queue(xi->dev, dev_to_kill);
}
+
+ xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi);
+ if (xi)
+ unregister_netdevice_queue(xi->dev, dev_to_kill);
}
static struct pernet_operations xfrmi_net_ops = {
- .exit_batch_rtnl = xfrmi_exit_batch_rtnl,
+ .exit_rtnl = xfrmi_exit_rtnl,
.id = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
};
diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c
index 82f0a301683f..ebf95d48e86c 100644
--- a/net/xfrm/xfrm_nat_keepalive.c
+++ b/net/xfrm/xfrm_nat_keepalive.c
@@ -9,9 +9,13 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#if IS_ENABLED(CONFIG_IPV6)
-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#endif
struct nat_keepalive {
@@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb,
skb_dst_set(skb, &rt->dst);
- sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4);
+ local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
+ sk = this_cpu_read(nat_keepalive_sk_ipv4.sock);
sock_net_set(sk, net);
err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos);
sock_net_set(sk, &init_net);
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
return err;
}
@@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb,
fl6.fl6_sport = ka->encap_sport;
fl6.fl6_dport = ka->encap_dport;
- sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6);
+ local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
+ sk = this_cpu_read(nat_keepalive_sk_ipv6.sock);
sock_net_set(sk, net);
dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL);
- if (IS_ERR(dst))
+ if (IS_ERR(dst)) {
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
return PTR_ERR(dst);
+ }
skb_dst_set(skb, dst);
err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0);
sock_net_set(sk, &init_net);
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
return err;
}
#endif
@@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work)
(ctx.next_run - ctx.now) * HZ);
}
-static int nat_keepalive_sk_init(struct sock * __percpu *socks,
+static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks,
unsigned short family)
{
struct sock *sk;
@@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks,
if (err < 0)
goto err;
- *per_cpu_ptr(socks, i) = sk;
+ per_cpu_ptr(socks, i)->sock = sk;
}
return 0;
err:
for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+ inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
return err;
}
-static void nat_keepalive_sk_fini(struct sock * __percpu *socks)
+static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks)
{
int i;
for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+ inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
}
void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f4bad8c895d6..d4134a18c658 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4633,7 +4633,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
struct xfrm_migrate *m, int num_migrate,
struct xfrm_kmaddress *k, struct net *net,
struct xfrm_encap_tmpl *encap, u32 if_id,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo)
{
int i, err, nx_cur = 0, nx_new = 0;
struct xfrm_policy *pol = NULL;
@@ -4666,7 +4666,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
x_cur[nx_cur] = x;
nx_cur++;
- xc = xfrm_state_migrate(x, mp, encap);
+ xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack);
if (xc) {
x_new[nx_new] = xc;
nx_new++;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 07fe8e5daa32..203b585c2ae2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -599,9 +599,9 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
x->mode_cbs->destroy_state(x);
hrtimer_cancel(&x->mtimer);
timer_delete_sync(&x->rtimer);
- kfree(x->aead);
- kfree(x->aalg);
- kfree(x->ealg);
+ kfree_sensitive(x->aead);
+ kfree_sensitive(x->aalg);
+ kfree_sensitive(x->ealg);
kfree(x->calg);
kfree(x->encap);
kfree(x->coaddr);
@@ -767,7 +767,7 @@ void xfrm_dev_state_delete(struct xfrm_state *x)
struct net_device *dev = READ_ONCE(xso->dev);
if (dev) {
- dev->xfrmdev_ops->xdo_dev_state_delete(x);
+ dev->xfrmdev_ops->xdo_dev_state_delete(dev, x);
spin_lock_bh(&xfrm_state_dev_gc_lock);
hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list);
spin_unlock_bh(&xfrm_state_dev_gc_lock);
@@ -789,7 +789,7 @@ void xfrm_dev_state_free(struct xfrm_state *x)
spin_unlock_bh(&xfrm_state_dev_gc_lock);
if (dev->xfrmdev_ops->xdo_dev_state_free)
- dev->xfrmdev_ops->xdo_dev_state_free(x);
+ dev->xfrmdev_ops->xdo_dev_state_free(dev, x);
WRITE_ONCE(xso->dev, NULL);
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
netdev_put(dev, &xso->dev_tracker);
@@ -1548,19 +1548,19 @@ found:
if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
struct xfrm_dev_offload *xdo = &pol->xdo;
struct xfrm_dev_offload *xso = &x->xso;
+ struct net_device *dev = xdo->dev;
xso->type = XFRM_DEV_OFFLOAD_PACKET;
xso->dir = xdo->dir;
- xso->dev = xdo->dev;
- xso->real_dev = xdo->real_dev;
+ xso->dev = dev;
xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
- netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC);
- error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL);
+ netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC);
+ error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x,
+ NULL);
if (error) {
xso->dir = 0;
- netdev_put(xso->dev, &xso->dev_tracker);
+ netdev_put(dev, &xso->dev_tracker);
xso->dev = NULL;
- xso->real_dev = NULL;
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
x->km.state = XFRM_STATE_DEAD;
to_put = x;
@@ -1958,8 +1958,9 @@ static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *secu
return 0;
}
-static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
- struct xfrm_encap_tmpl *encap)
+static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig,
+ struct xfrm_encap_tmpl *encap,
+ struct xfrm_migrate *m)
{
struct net *net = xs_net(orig);
struct xfrm_state *x = xfrm_state_alloc(net);
@@ -2058,6 +2059,11 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
goto error;
}
+
+ x->props.family = m->new_family;
+ memcpy(&x->id.daddr, &m->new_daddr, sizeof(x->id.daddr));
+ memcpy(&x->props.saddr, &m->new_saddr, sizeof(x->props.saddr));
+
return x;
error:
@@ -2120,21 +2126,23 @@ EXPORT_SYMBOL(xfrm_migrate_state_find);
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
struct xfrm_migrate *m,
- struct xfrm_encap_tmpl *encap)
+ struct xfrm_encap_tmpl *encap,
+ struct net *net,
+ struct xfrm_user_offload *xuo,
+ struct netlink_ext_ack *extack)
{
struct xfrm_state *xc;
- xc = xfrm_state_clone(x, encap);
+ xc = xfrm_state_clone_and_setup(x, encap, m);
if (!xc)
return NULL;
- xc->props.family = m->new_family;
-
if (xfrm_init_state(xc) < 0)
goto error;
- memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr));
- memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr));
+ /* configure the hardware if offload is requested */
+ if (xuo && xfrm_dev_state_add(net, xc, xuo, extack))
+ goto error;
/* add state */
if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 784a2d124749..59f258daf830 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -178,11 +178,27 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
"Replay seq and seq_hi should be 0 for output SA");
return -EINVAL;
}
- if (rs->oseq_hi && !(p->flags & XFRM_STATE_ESN)) {
- NL_SET_ERR_MSG(
- extack,
- "Replay oseq_hi should be 0 in non-ESN mode for output SA");
- return -EINVAL;
+
+ if (!(p->flags & XFRM_STATE_ESN)) {
+ if (rs->oseq_hi) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay oseq_hi should be 0 in non-ESN mode for output SA");
+ return -EINVAL;
+ }
+ if (rs->oseq == U32_MAX) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay oseq should be less than 0xFFFFFFFF in non-ESN mode for output SA");
+ return -EINVAL;
+ }
+ } else {
+ if (rs->oseq == U32_MAX && rs->oseq_hi == U32_MAX) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay oseq and oseq_hi should be less than 0xFFFFFFFF for output SA");
+ return -EINVAL;
+ }
}
if (rs->bmp_len) {
NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA");
@@ -196,11 +212,27 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
"Replay oseq and oseq_hi should be 0 for input SA");
return -EINVAL;
}
- if (rs->seq_hi && !(p->flags & XFRM_STATE_ESN)) {
- NL_SET_ERR_MSG(
- extack,
- "Replay seq_hi should be 0 in non-ESN mode for input SA");
- return -EINVAL;
+ if (!(p->flags & XFRM_STATE_ESN)) {
+ if (rs->seq_hi) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay seq_hi should be 0 in non-ESN mode for input SA");
+ return -EINVAL;
+ }
+
+ if (rs->seq == U32_MAX) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay seq should be less than 0xFFFFFFFF in non-ESN mode for input SA");
+ return -EINVAL;
+ }
+ } else {
+ if (rs->seq == U32_MAX && rs->seq_hi == U32_MAX) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay seq and seq_hi should be less than 0xFFFFFFFF for input SA");
+ return -EINVAL;
+ }
}
}
@@ -1173,7 +1205,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
if (!nla)
return -EMSGSIZE;
algo = nla_data(nla);
- strscpy_pad(algo->alg_name, auth->alg_name, sizeof(algo->alg_name));
+ strscpy_pad(algo->alg_name, auth->alg_name);
if (redact_secret && auth->alg_key_len)
memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8);
@@ -1186,7 +1218,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
if (!nla)
return -EMSGSIZE;
ap = nla_data(nla);
- strscpy_pad(ap->alg_name, auth->alg_name, sizeof(ap->alg_name));
+ strscpy_pad(ap->alg_name, auth->alg_name);
ap->alg_key_len = auth->alg_key_len;
ap->alg_trunc_len = auth->alg_trunc_len;
if (redact_secret && auth->alg_key_len)
@@ -1207,7 +1239,7 @@ static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb)
return -EMSGSIZE;
ap = nla_data(nla);
- strscpy_pad(ap->alg_name, aead->alg_name, sizeof(ap->alg_name));
+ strscpy_pad(ap->alg_name, aead->alg_name);
ap->alg_key_len = aead->alg_key_len;
ap->alg_icv_len = aead->alg_icv_len;
@@ -1229,7 +1261,7 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb)
return -EMSGSIZE;
ap = nla_data(nla);
- strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name));
+ strscpy_pad(ap->alg_name, ealg->alg_name);
ap->alg_key_len = ealg->alg_key_len;
if (redact_secret && ealg->alg_key_len)
@@ -1250,7 +1282,7 @@ static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb)
return -EMSGSIZE;
ap = nla_data(nla);
- strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name));
+ strscpy_pad(ap->alg_name, calg->alg_name);
ap->alg_key_len = 0;
return 0;
@@ -3069,6 +3101,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
int n = 0;
struct net *net = sock_net(skb->sk);
struct xfrm_encap_tmpl *encap = NULL;
+ struct xfrm_user_offload *xuo = NULL;
u32 if_id = 0;
if (!attrs[XFRMA_MIGRATE]) {
@@ -3099,11 +3132,19 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
if (attrs[XFRMA_IF_ID])
if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+ if (attrs[XFRMA_OFFLOAD_DEV]) {
+ xuo = kmemdup(nla_data(attrs[XFRMA_OFFLOAD_DEV]),
+ sizeof(*xuo), GFP_KERNEL);
+ if (!xuo) {
+ err = -ENOMEM;
+ goto error;
+ }
+ }
err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap,
- if_id, extack);
-
+ if_id, extack, xuo);
+error:
kfree(encap);
-
+ kfree(xuo);
return err;
}
#else