aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c11
-rw-r--r--net/8021q/vlan.h3
-rw-r--r--net/8021q/vlan_dev.c4
-rw-r--r--net/8021q/vlan_netlink.c45
-rw-r--r--net/9p/mod.c2
-rw-r--r--net/Kconfig22
-rw-r--r--net/Makefile6
-rw-r--r--net/batman-adv/Kconfig6
-rw-r--r--net/batman-adv/bat_v_elp.c15
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/multicast.c29
-rw-r--r--net/batman-adv/soft-interface.c5
-rw-r--r--net/batman-adv/types.h23
-rw-r--r--net/bluetooth/hci_core.c54
-rw-r--r--net/bluetooth/hci_debugfs.c24
-rw-r--r--net/bluetooth/hci_event.c12
-rw-r--r--net/bluetooth/hci_request.c30
-rw-r--r--net/bluetooth/smp.c12
-rw-r--r--net/bpf/test_run.c3
-rw-r--r--net/bpfilter/Kconfig16
-rw-r--r--net/bpfilter/Makefile32
-rw-r--r--net/bpfilter/bpfilter_kern.c114
-rw-r--r--net/bpfilter/main.c63
-rw-r--r--net/bpfilter/msgfmt.h17
-rw-r--r--net/bridge/br.c16
-rw-r--r--net/bridge/br_fdb.c69
-rw-r--r--net/bridge/br_forward.c6
-rw-r--r--net/bridge/br_if.c11
-rw-r--r--net/bridge/br_input.c1
-rw-r--r--net/bridge/br_netlink.c9
-rw-r--r--net/bridge/br_private.h41
-rw-r--r--net/bridge/br_switchdev.c37
-rw-r--r--net/bridge/br_sysfs_if.c2
-rw-r--r--net/bridge/br_vlan.c144
-rw-r--r--net/bridge/netfilter/Kconfig7
-rw-r--r--net/bridge/netfilter/Makefile1
-rw-r--r--net/bridge/netfilter/ebtables.c63
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c135
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c148
-rw-r--r--net/core/devlink.c111
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c63
-rw-r--r--net/core/failover.c315
-rw-r--r--net/core/fib_rules.c495
-rw-r--r--net/core/filter.c1423
-rw-r--r--net/core/flow_dissector.c19
-rw-r--r--net/core/neighbour.c8
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/page_pool.c317
-rw-r--r--net/core/rtnetlink.c34
-rw-r--r--net/core/skbuff.c25
-rw-r--r--net/core/sock.c40
-rw-r--r--net/core/xdp.c299
-rw-r--r--net/dcb/dcbnl.c20
-rw-r--r--net/dccp/minisocks.c1
-rw-r--r--net/decnet/dn_rules.c7
-rw-r--r--net/dsa/Kconfig2
-rw-r--r--net/dsa/dsa2.c24
-rw-r--r--net/dsa/dsa_priv.h9
-rw-r--r--net/dsa/master.c62
-rw-r--r--net/dsa/port.c96
-rw-r--r--net/dsa/slave.c307
-rw-r--r--net/ethernet/eth.c6
-rw-r--r--net/ipv4/Makefile5
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/bpfilter/Makefile2
-rw-r--r--net/ipv4/bpfilter/sockopt.c43
-rw-r--r--net/ipv4/devinet.c15
-rw-r--r--net/ipv4/fib_frontend.c58
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/fib_semantics.c45
-rw-r--r--net/ipv4/fib_trie.c14
-rw-r--r--net/ipv4/inet_connection_sock.c5
-rw-r--r--net/ipv4/ip_gre.c12
-rw-r--r--net/ipv4/ip_output.c45
-rw-r--r--net/ipv4/ip_sockglue.c17
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ipconfig.c150
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/ipmr_base.c8
-rw-r--r--net/ipv4/metrics.c55
-rw-r--r--net/ipv4/netfilter/Kconfig10
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ip_tables.c7
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c2
-rw-r--r--net/ipv4/netfilter/iptable_nat.c88
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c255
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c143
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c147
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c53
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c2
-rw-r--r--net/ipv4/netlink.c23
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/route.c177
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/tcp.c204
-rw-r--r--net/ipv4/tcp_input.c271
-rw-r--r--net/ipv4/tcp_ipv4.c55
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c112
-rw-r--r--net/ipv4/tcp_recovery.c80
-rw-r--r--net/ipv4/tcp_timer.c27
-rw-r--r--net/ipv4/udp.c120
-rw-r--r--net/ipv4/udp_offload.c101
-rw-r--r--net/ipv6/Kconfig5
-rw-r--r--net/ipv6/addrconf.c527
-rw-r--r--net/ipv6/addrconf_core.c41
-rw-r--r--net/ipv6/af_inet6.c65
-rw-r--r--net/ipv6/anycast.c33
-rw-r--r--net/ipv6/exthdrs.c55
-rw-r--r--net/ipv6/exthdrs_core.c2
-rw-r--r--net/ipv6/fib6_rules.c145
-rw-r--r--net/ipv6/ip6_fib.c639
-rw-r--r--net/ipv6/ip6_gre.c51
-rw-r--r--net/ipv6/ip6_input.c2
-rw-r--r--net/ipv6/ip6_offload.c6
-rw-r--r--net/ipv6/ip6_output.c96
-rw-r--r--net/ipv6/ip6_vti.c2
-rw-r--r--net/ipv6/ip6mr.c24
-rw-r--r--net/ipv6/ndisc.c48
-rw-r--r--net/ipv6/netfilter/Kconfig10
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c6
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c2
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c2
-rw-r--r--net/ipv6/netfilter/ip6t_srh.c173
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c87
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c246
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c137
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c2
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c146
-rw-r--r--net/ipv6/netfilter/nft_chain_nat_ipv6.c51
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c2
-rw-r--r--net/ipv6/reassembly.c25
-rw-r--r--net/ipv6/route.c1895
-rw-r--r--net/ipv6/seg6.c1
-rw-r--r--net/ipv6/seg6_iptunnel.c24
-rw-r--r--net/ipv6/seg6_local.c190
-rw-r--r--net/ipv6/sysctl_net_ipv6.c8
-rw-r--r--net/ipv6/tcp_ipv6.c8
-rw-r--r--net/ipv6/udp.c72
-rw-r--r--net/ipv6/udp_offload.c5
-rw-r--r--net/ipv6/xfrm6_policy.c2
-rw-r--r--net/ipv6/xfrm6_state.c6
-rw-r--r--net/l2tp/l2tp_debugfs.c20
-rw-r--r--net/l2tp/l2tp_ppp.c56
-rw-r--r--net/mac80211/cfg.c103
-rw-r--r--net/mac80211/driver-ops.h8
-rw-r--r--net/mac80211/ethtool.c13
-rw-r--r--net/mac80211/ht.c44
-rw-r--r--net/mac80211/ieee80211_i.h3
-rw-r--r--net/mac80211/main.c3
-rw-r--r--net/mac80211/mlme.c17
-rw-r--r--net/mac80211/rx.c40
-rw-r--r--net/mac80211/sta_info.c38
-rw-r--r--net/mac80211/sta_info.h5
-rw-r--r--net/mac80211/status.c2
-rw-r--r--net/mac80211/trace.h25
-rw-r--r--net/mac80211/tx.c45
-rw-r--r--net/mac80211/util.c6
-rw-r--r--net/ncsi/internal.h34
-rw-r--r--net/ncsi/ncsi-manage.c226
-rw-r--r--net/ncsi/ncsi-netlink.c21
-rw-r--r--net/ncsi/ncsi-rsp.c179
-rw-r--r--net/netfilter/Kconfig51
-rw-r--r--net/netfilter/Makefile12
-rw-r--r--net/netfilter/core.c102
-rw-r--r--net/netfilter/ipvs/Kconfig37
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c24
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c467
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c540
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c5
-rw-r--r--net/netfilter/nf_conncount.c36
-rw-r--r--net/netfilter/nf_conntrack_core.c92
-rw-r--r--net/netfilter/nf_conntrack_ftp.c3
-rw-r--r--net/netfilter/nf_conntrack_irc.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c13
-rw-r--r--net/netfilter/nf_conntrack_sane.c3
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_conntrack_tftp.c2
-rw-r--r--net/netfilter/nf_flow_table_core.c (renamed from net/netfilter/nf_flow_table.c)309
-rw-r--r--net/netfilter/nf_flow_table_inet.c3
-rw-r--r--net/netfilter/nf_flow_table_ip.c489
-rw-r--r--net/netfilter/nf_internals.h5
-rw-r--r--net/netfilter/nf_nat_core.c321
-rw-r--r--net/netfilter/nf_nat_helper.c2
-rw-r--r--net/netfilter/nf_nat_proto_common.c9
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c2
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c2
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c2
-rw-r--r--net/netfilter/nf_nat_proto_udp.c4
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c2
-rw-r--r--net/netfilter/nf_nat_redirect.c10
-rw-r--r--net/netfilter/nf_nat_sip.c2
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c1325
-rw-r--r--net/netfilter/nf_tables_core.c72
-rw-r--r--net/netfilter/nfnetlink.c44
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/nfnetlink_queue.c28
-rw-r--r--net/netfilter/nft_compat.c29
-rw-r--r--net/netfilter/nft_connlimit.c297
-rw-r--r--net/netfilter/nft_counter.c4
-rw-r--r--net/netfilter/nft_ct.c3
-rw-r--r--net/netfilter/nft_dynset.c16
-rw-r--r--net/netfilter/nft_exthdr.c23
-rw-r--r--net/netfilter/nft_flow_offload.c5
-rw-r--r--net/netfilter/nft_fwd_netdev.c146
-rw-r--r--net/netfilter/nft_hash.c127
-rw-r--r--net/netfilter/nft_immediate.c27
-rw-r--r--net/netfilter/nft_log.c92
-rw-r--r--net/netfilter/nft_lookup.c47
-rw-r--r--net/netfilter/nft_meta.c112
-rw-r--r--net/netfilter/nft_nat.c2
-rw-r--r--net/netfilter/nft_numgen.c158
-rw-r--r--net/netfilter/nft_objref.c4
-rw-r--r--net/netfilter/nft_rt.c22
-rw-r--r--net/netfilter/nft_set_bitmap.c34
-rw-r--r--net/netfilter/nft_set_hash.c174
-rw-r--r--net/netfilter/nft_set_rbtree.c109
-rw-r--r--net/netfilter/nft_socket.c144
-rw-r--r--net/netfilter/xt_NETMAP.c8
-rw-r--r--net/netfilter/xt_NFLOG.c15
-rw-r--r--net/netfilter/xt_REDIRECT.c2
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_nat.c72
-rw-r--r--net/netfilter/xt_osf.c202
-rw-r--r--net/netfilter/xt_socket.c4
-rw-r--r--net/nfc/netlink.c17
-rw-r--r--net/openvswitch/Kconfig3
-rw-r--r--net/openvswitch/conntrack.c555
-rw-r--r--net/openvswitch/conntrack.h9
-rw-r--r--net/openvswitch/datapath.c7
-rw-r--r--net/openvswitch/datapath.h3
-rw-r--r--net/packet/af_packet.c44
-rw-r--r--net/qrtr/Kconfig7
-rw-r--r--net/qrtr/Makefile2
-rw-r--r--net/qrtr/tun.c161
-rw-r--r--net/rfkill/core.c66
-rw-r--r--net/rxrpc/ar-internal.h2
-rw-r--r--net/rxrpc/call_event.c8
-rw-r--r--net/rxrpc/conn_event.c2
-rw-r--r--net/rxrpc/input.c10
-rw-r--r--net/sched/act_api.c20
-rw-r--r--net/sched/act_csum.c6
-rw-r--r--net/sched/cls_api.c443
-rw-r--r--net/sched/cls_basic.c24
-rw-r--r--net/sched/cls_bpf.c22
-rw-r--r--net/sched/cls_cgroup.c23
-rw-r--r--net/sched/cls_flow.c24
-rw-r--r--net/sched/cls_flower.c317
-rw-r--r--net/sched/cls_fw.c24
-rw-r--r--net/sched/cls_matchall.c21
-rw-r--r--net/sched/cls_route.c23
-rw-r--r--net/sched/cls_rsvp.h20
-rw-r--r--net/sched/cls_tcindex.c41
-rw-r--r--net/sched/cls_u32.c37
-rw-r--r--net/sched/sch_generic.c49
-rw-r--r--net/sched/sch_mq.c37
-rw-r--r--net/sctp/associola.c85
-rw-r--r--net/sctp/chunk.c12
-rw-r--r--net/sctp/output.c28
-rw-r--r--net/sctp/outqueue.c660
-rw-r--r--net/sctp/sm_make_chunk.c143
-rw-r--r--net/sctp/socket.c43
-rw-r--r--net/sctp/transport.c39
-rw-r--r--net/smc/af_smc.c803
-rw-r--r--net/smc/smc.h68
-rw-r--r--net/smc/smc_cdc.c101
-rw-r--r--net/smc/smc_cdc.h15
-rw-r--r--net/smc/smc_clc.c6
-rw-r--r--net/smc/smc_clc.h2
-rw-r--r--net/smc/smc_core.c199
-rw-r--r--net/smc/smc_core.h29
-rw-r--r--net/smc/smc_diag.c44
-rw-r--r--net/smc/smc_ib.c13
-rw-r--r--net/smc/smc_llc.c242
-rw-r--r--net/smc/smc_llc.h8
-rw-r--r--net/smc/smc_rx.c308
-rw-r--r--net/smc/smc_rx.h11
-rw-r--r--net/smc/smc_tx.c111
-rw-r--r--net/smc/smc_tx.h5
-rw-r--r--net/smc/smc_wr.c1
-rw-r--r--net/strparser/strparser.c13
-rw-r--r--net/tipc/bearer.c29
-rw-r--r--net/tipc/bearer.h3
-rw-r--r--net/tipc/name_table.c103
-rw-r--r--net/tipc/node.c33
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/socket.c13
-rw-r--r--net/tipc/udp_media.c4
-rw-r--r--net/tipc/udp_media.h14
-rw-r--r--net/tls/Kconfig10
-rw-r--r--net/tls/Makefile2
-rw-r--r--net/tls/tls_device.c766
-rw-r--r--net/tls/tls_device_fallback.c450
-rw-r--r--net/tls/tls_main.c139
-rw-r--r--net/tls/tls_sw.c143
-rw-r--r--net/wireless/core.c4
-rw-r--r--net/wireless/nl80211.c304
-rw-r--r--net/wireless/rdev-ops.h12
-rw-r--r--net/wireless/reg.c39
-rw-r--r--net/wireless/sme.c88
-rw-r--r--net/wireless/trace.h14
-rw-r--r--net/wireless/util.c11
-rw-r--r--net/xdp/Kconfig7
-rw-r--r--net/xdp/Makefile1
-rw-r--r--net/xdp/xdp_umem.c361
-rw-r--r--net/xdp/xdp_umem.h30
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c788
-rw-r--r--net/xdp/xsk_queue.c63
-rw-r--r--net/xdp/xsk_queue.h265
-rw-r--r--net/xfrm/xfrm_state.c9
332 files changed, 20336 insertions, 8128 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5505ee6ebdbe..73a65789271b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
}
int vlan_check_real_dev(struct net_device *real_dev,
- __be16 protocol, u16 vlan_id)
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack)
{
const char *name = real_dev->name;
if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
pr_info("VLANs not supported on %s\n", name);
+ NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
return -EOPNOTSUPP;
}
- if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL)
+ if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
return -EEXIST;
+ }
return 0;
}
@@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
if (vlan_id >= VLAN_VID_MASK)
return -ERANGE;
- err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id);
+ err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
+ NULL);
if (err < 0)
return err;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index e23aac3e4d37..44df1c3df02d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
int vlan_check_real_dev(struct net_device *real_dev,
- __be16 protocol, u16 vlan_id);
+ __be16 protocol, u16 vlan_id,
+ struct netlink_ext_ack *extack);
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 236452ebbd9e..546af0e73ac3 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -215,7 +215,9 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
return 0;
}
-/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
+/* Flags are defined in the vlan_flags enum in
+ * include/uapi/linux/if_vlan.h file.
+ */
int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 6689c0b272a7..9b60c1e399e2 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
int err;
if (tb[IFLA_ADDRESS]) {
- if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
return -EINVAL;
- if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ }
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
return -EADDRNOTAVAIL;
+ }
}
- if (!data)
+ if (!data) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified");
return -EINVAL;
+ }
if (data[IFLA_VLAN_PROTOCOL]) {
switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
@@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
case htons(ETH_P_8021AD):
break;
default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol");
return -EPROTONOSUPPORT;
}
}
if (data[IFLA_VLAN_ID]) {
id = nla_get_u16(data[IFLA_VLAN_ID]);
- if (id >= VLAN_VID_MASK)
+ if (id >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id");
return -ERANGE;
+ }
}
if (data[IFLA_VLAN_FLAGS]) {
flags = nla_data(data[IFLA_VLAN_FLAGS]);
if ((flags->flags & flags->mask) &
~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
return -EINVAL;
+ }
}
err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map");
return err;
+ }
err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map");
return err;
+ }
return 0;
}
@@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
__be16 proto;
int err;
- if (!data[IFLA_VLAN_ID])
+ if (!data[IFLA_VLAN_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified");
return -EINVAL;
+ }
- if (!tb[IFLA_LINK])
+ if (!tb[IFLA_LINK]) {
+ NL_SET_ERR_MSG_MOD(extack, "link not specified");
return -EINVAL;
+ }
+
real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
- if (!real_dev)
+ if (!real_dev) {
+ NL_SET_ERR_MSG_MOD(extack, "link does not exist");
return -ENODEV;
+ }
if (data[IFLA_VLAN_PROTOCOL])
proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
@@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
vlan->flags = VLAN_FLAG_REORDER_HDR;
- err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);
+ err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id,
+ extack);
if (err < 0)
return err;
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 6ab36aea7727..eb9777f05755 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -104,7 +104,7 @@ EXPORT_SYMBOL(v9fs_unregister_trans);
/**
* v9fs_get_trans_by_name - get transport with the matching name
- * @name: string identifying transport
+ * @s: string identifying transport
*
*/
struct p9_trans_module *v9fs_get_trans_by_name(char *s)
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12c25c2..f738a6f27665 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
+source "net/xdp/Kconfig"
config INET
bool "TCP/IP networking"
@@ -201,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
endif
+source "net/bpfilter/Kconfig"
+
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/rds/Kconfig"
@@ -407,6 +410,9 @@ config GRO_CELLS
bool
default n
+config SOCK_VALIDATE_XMIT
+ bool
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
@@ -423,6 +429,22 @@ config MAY_USE_DEVLINK
on MAY_USE_DEVLINK to ensure they do not cause link errors when
devlink is a loadable module and the driver using it is built-in.
+config PAGE_POOL
+ bool
+
+config FAILOVER
+ tristate "Generic failover module"
+ help
+ The failover module provides a generic interface for paravirtual
+ drivers to register a netdev and a set of ops with a failover
+ instance. The ops are used as event handlers that get called to
+ handle netdev register/unregister/link change/name change events
+ on slave pci ethernet devices with the same mac address as the
+ failover netdev. This enables paravirtual drivers to use a
+ VF as an accelerated low latency datapath. It also allows live
+ migration of VMs with direct attached VFs by failing over to the
+ paravirtual datapath when the VF is unplugged.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..13ec0d5415c7 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,11 @@ obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
obj-$(CONFIG_NET) += ipv6/
+ifneq ($(CC_CAN_LINK),y)
+$(warning CC cannot link executables. Skipping bpfilter.)
+else
+obj-$(CONFIG_BPFILTER) += bpfilter/
+endif
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
@@ -85,3 +90,4 @@ obj-y += l3mdev/
endif
obj-$(CONFIG_QRTR) += qrtr/
obj-$(CONFIG_NET_NCSI) += ncsi/
+obj-$(CONFIG_XDP_SOCKETS) += xdp/
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index e4e2e02b7380..de8034d80623 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -35,7 +35,7 @@ config BATMAN_ADV
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol (experimental)"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
- default n
+ default y
help
This option enables the B.A.T.M.A.N. V protocol, the successor
of the currently used B.A.T.M.A.N. IV protocol. The main
@@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
- default y
+ default n
help
Enable this to export routing related debug tables via debugfs.
The information for each soft-interface and used hard-interface can be
found under batman_adv/
- If unsure, say Y.
+ If unsure, say N.
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 28687493599f..71c20c1d4002 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -127,7 +127,20 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
rtnl_lock();
ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
rtnl_unlock();
- if (ret == 0) {
+
+ /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc
+ * tend to initialize the interface throughput with some value for the
+ * sake of having a throughput number to export via ethtool. This
+ * exported throughput leaves batman-adv to conclude the interface
+ * throughput is genuine (reflecting reality), thus no measurements
+ * are necessary.
+ *
+ * Based on the observation that those interface types also tend to set
+ * the link auto-negotiation to 'off', batman-adv shall check this
+ * setting to differentiate between genuine link throughput information
+ * and placeholders installed by virtual interfaces.
+ */
+ if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) {
/* link characteristics might change over time */
if (link_settings.base.duplex == DUPLEX_FULL)
hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 057a28a9fe88..8da3c9336111 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.1"
+#define BATADV_SOURCE_VERSION "2018.2"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index a35f597e8c8b..86725d792e15 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -815,9 +815,6 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
if (!atomic_read(&bat_priv->multicast_mode))
return -EINVAL;
- if (atomic_read(&bat_priv->mcast.num_disabled))
- return -EINVAL;
-
switch (ntohs(ethhdr->h_proto)) {
case ETH_P_IP:
return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
@@ -1183,33 +1180,23 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
u8 mcast_flags = BATADV_NO_FLAGS;
- bool orig_initialized;
if (orig_mcast_enabled && tvlv_value &&
tvlv_value_len >= sizeof(mcast_flags))
mcast_flags = *(u8 *)tvlv_value;
+ if (!orig_mcast_enabled) {
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ }
+
spin_lock_bh(&orig->mcast_handler_lock);
- orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig->capa_initialized);
- /* If mcast support is turned on decrease the disabled mcast node
- * counter only if we had increased it for this node before. If this
- * is a completely new orig_node no need to decrease the counter.
- */
if (orig_mcast_enabled &&
!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
- if (orig_initialized)
- atomic_dec(&bat_priv->mcast.num_disabled);
set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
- /* If mcast support is being switched off or if this is an initial
- * OGM without mcast support then increase the disabled mcast
- * node counter.
- */
} else if (!orig_mcast_enabled &&
- (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) ||
- !orig_initialized)) {
- atomic_inc(&bat_priv->mcast.num_disabled);
+ test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
}
@@ -1595,10 +1582,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
spin_lock_bh(&orig->mcast_handler_lock);
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) &&
- test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized))
- atomic_dec(&bat_priv->mcast.num_disabled);
-
batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index edeffcb9f3a2..1485263a348b 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -188,8 +188,8 @@ static void batadv_interface_set_rx_mode(struct net_device *dev)
{
}
-static int batadv_interface_tx(struct sk_buff *skb,
- struct net_device *soft_iface)
+static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
+ struct net_device *soft_iface)
{
struct ethhdr *ethhdr;
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -796,7 +796,6 @@ static int batadv_softif_init_late(struct net_device *dev)
bat_priv->mcast.querier_ipv6.shadowing = false;
bat_priv->mcast.flags = BATADV_NO_FLAGS;
atomic_set(&bat_priv->multicast_mode, 1);
- atomic_set(&bat_priv->mcast.num_disabled, 0);
atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 476b052ad982..360357f83f20 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -215,10 +215,12 @@ struct batadv_hard_iface {
struct batadv_hard_iface_bat_v bat_v;
#endif
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
*/
struct dentry *debug_dir;
+#endif
/**
* @neigh_list: list of unique single hop neighbors via this interface
@@ -1160,13 +1162,13 @@ struct batadv_priv_dat {
*/
struct batadv_mcast_querier_state {
/** @exists: whether a querier exists in the mesh */
- bool exists;
+ unsigned char exists:1;
/**
* @shadowing: if a querier exists, whether it is potentially shadowing
* multicast listeners (i.e. querier is behind our own bridge segment)
*/
- bool shadowing;
+ unsigned char shadowing:1;
};
/**
@@ -1207,13 +1209,10 @@ struct batadv_priv_mcast {
u8 flags;
/** @enabled: whether the multicast tvlv is currently enabled */
- bool enabled;
+ unsigned char enabled:1;
/** @bridged: whether the soft interface has a bridge on top */
- bool bridged;
-
- /** @num_disabled: number of nodes that have no mcast tvlv */
- atomic_t num_disabled;
+ unsigned char bridged:1;
/**
* @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
@@ -1245,10 +1244,12 @@ struct batadv_priv_nc {
/** @work: work queue callback item for cleanup */
struct delayed_work work;
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
*/
struct dentry *debug_dir;
+#endif
/**
* @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
@@ -1392,7 +1393,7 @@ struct batadv_tp_vars {
atomic_t dup_acks;
/** @fast_recovery: true if in Fast Recovery mode */
- bool fast_recovery;
+ unsigned char fast_recovery:1;
/** @recover: last sent seqno when entering Fast Recovery */
u32 recover;
@@ -1601,8 +1602,10 @@ struct batadv_priv {
/** @mesh_obj: kobject for sysfs mesh subdirectory */
struct kobject *mesh_obj;
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/** @debug_dir: dentry for debugfs batman-adv subdirectory */
struct dentry *debug_dir;
+#endif
/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
struct hlist_head forw_bat_list;
@@ -2049,10 +2052,10 @@ struct batadv_skb_cb {
* @decoded: Marks a skb as decoded, which is checked when searching for
* coding opportunities in network-coding.c
*/
- bool decoded;
+ unsigned char decoded:1;
/** @num_bcasts: Counter for broadcast packet retransmissions */
- unsigned int num_bcasts;
+ unsigned char num_bcasts;
};
/**
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 40d260f2bea5..1dec33790198 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -76,19 +76,15 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
{
struct hci_dev *hdev = file->private_data;
struct sk_buff *skb;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
if (!test_bit(HCI_UP, &hdev->flags))
return -ENETDOWN;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
return -EALREADY;
@@ -135,17 +131,12 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
int err;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
/* When the diagnostic flags are not persistent and the transport
* is not active or in user channel operation, then there is no need
@@ -3422,6 +3413,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
return 0;
}
+int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param)
+{
+ struct sk_buff *skb;
+
+ if (hci_opcode_ogf(opcode) != 0x3f) {
+ /* A controller receiving a command shall respond with either
+ * a Command Status Event or a Command Complete Event.
+ * Therefore, all standard HCI commands must be sent via the
+ * standard API, using hci_send_cmd or hci_cmd_sync helpers.
+ * Some vendors do not comply with this rule for vendor-specific
+ * commands and do not return any event. We want to support
+ * unresponded commands for such cases only.
+ */
+ bt_dev_err(hdev, "unresponded command not supported");
+ return -EINVAL;
+ }
+
+ skb = hci_prepare_cmd(hdev, opcode, plen, param);
+ if (!skb) {
+ bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+ opcode);
+ return -ENOMEM;
+ }
+
+ hci_send_frame(hdev, skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(__hci_cmd_send);
+
/* Get data from the previously sent command */
void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
{
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 418b76e557b0..0d8ab5b3c177 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -47,19 +47,15 @@ static ssize_t __name ## _write(struct file *file, \
size_t count, loff_t *ppos) \
{ \
struct hci_dev *hdev = file->private_data; \
- char buf[32]; \
- size_t buf_size = min(count, (sizeof(buf) - 1)); \
bool enable; \
+ int err; \
\
if (test_bit(HCI_UP, &hdev->flags)) \
return -EBUSY; \
\
- if (copy_from_user(buf, user_buf, buf_size)) \
- return -EFAULT; \
- \
- buf[buf_size] = '\0'; \
- if (strtobool(buf, &enable)) \
- return -EINVAL; \
+ err = kstrtobool_from_user(user_buf, count, &enable); \
+ if (err) \
+ return err; \
\
if (enable == test_bit(__quirk, &hdev->quirks)) \
return -EALREADY; \
@@ -658,19 +654,15 @@ static ssize_t force_static_address_write(struct file *file,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
if (test_bit(HCI_UP, &hdev->flags))
return -EBUSY;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
return -EALREADY;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 139707cd9d35..235b5aaab23d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
struct hci_ev_le_advertising_info *ev = ptr;
s8 rssi;
- rssi = ev->data[ev->length];
- process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
- ev->bdaddr_type, NULL, 0, rssi,
- ev->data, ev->length);
+ if (ev->length <= HCI_MAX_AD_LENGTH) {
+ rssi = ev->data[ev->length];
+ process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
+ ev->bdaddr_type, NULL, 0, rssi,
+ ev->data, ev->length);
+ } else {
+ bt_dev_err(hdev, "Dropping invalid advertising data");
+ }
ptr += sizeof(*ev) + ev->length + 1;
}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 66c0781773df..e44d34734834 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err)
struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
const void *param, u8 event, u32 timeout)
{
- DECLARE_WAITQUEUE(wait, current);
struct hci_request req;
struct sk_buff *skb;
int err = 0;
@@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
hdev->req_status = HCI_REQ_PEND;
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0) {
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
+ if (err < 0)
return ERR_PTR(err);
- }
- schedule_timeout(timeout);
+ err = wait_event_interruptible_timeout(hdev->req_wait_q,
+ hdev->req_status != HCI_REQ_PEND, timeout);
- remove_wait_queue(&hdev->req_wait_q, &wait);
-
- if (signal_pending(current))
+ if (err == -ERESTARTSYS)
return ERR_PTR(-EINTR);
switch (hdev->req_status) {
@@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
unsigned long opt, u32 timeout, u8 *hci_status)
{
struct hci_request req;
- DECLARE_WAITQUEUE(wait, current);
int err = 0;
BT_DBG("%s start", hdev->name);
@@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
return err;
}
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
err = hci_req_run_skb(&req, hci_req_sync_complete);
if (err < 0) {
hdev->req_status = 0;
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
-
/* ENODATA means the HCI request command queue is empty.
* This can happen when a request with conditionals doesn't
* trigger any commands to be sent. This is normal behavior
@@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
return err;
}
- schedule_timeout(timeout);
-
- remove_wait_queue(&hdev->req_wait_q, &wait);
+ err = wait_event_interruptible_timeout(hdev->req_wait_q,
+ hdev->req_status != HCI_REQ_PEND, timeout);
- if (signal_pending(current))
+ if (err == -ERESTARTSYS)
return -EINTR;
switch (hdev->req_status) {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index a2ddae2f37d7..ae91e2d40056 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3315,16 +3315,12 @@ static ssize_t force_bredr_smp_write(struct file *file,
size_t count, loff_t *ppos)
{
struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf)-1));
bool enable;
+ int err;
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &enable))
- return -EINVAL;
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return -EALREADY;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ced48662c1f..68c3578343b4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
xdp.rxq = &rxqueue->xdp_rxq;
retval = bpf_test_run(prog, &xdp, repeat, &duration);
- if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+ if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
+ xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
kfree(data);
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..a948b072c28f
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@
+menuconfig BPFILTER
+ bool "BPF based packet filtering framework (BPFILTER)"
+ default n
+ depends on NET && BPF && INET
+ help
+ This builds the experimental bpfilter framework, which aims to
+ provide netfilter-compatible functionality via BPF
+
+if BPFILTER
+config BPFILTER_UMH
+ tristate "bpfilter kernel module with user mode helper"
+ default m
+ help
+ This builds bpfilter kernel module with embedded user mode helper
+endif
+
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..aafa72001fcd
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux BPFILTER layer.
+#
+
+hostprogs-y := bpfilter_umh
+bpfilter_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
+HOSTCC := $(CC)
+
+ifeq ($(CONFIG_BPFILTER_UMH), y)
+# builtin bpfilter_umh should be compiled with -static
+# since rootfs isn't mounted at the time the __init function
+# is called, so do_execv won't find the ELF interpreter
+HOSTLDFLAGS += -static
+endif
+
+# a bit of elf magic to convert bpfilter_umh binary into a binary blob
+# inside bpfilter_umh.o elf file referenced by
+# _binary_net_bpfilter_bpfilter_umh_start symbol
+# which bpfilter_kern.c passes further into umh blob loader at run-time
+quiet_cmd_copy_umh = GEN $@
+ cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
+ $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+ -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+ --rename-section .data=.init.rodata $< $@
+
+$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
+ $(call cmd,copy_umh)
+
+obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..b13d058f8c34
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/bpfilter.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include "msgfmt.h"
+
+#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
+#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+/* since ip_getsockopt() can run in parallel, serialize access to umh */
+static DEFINE_MUTEX(bpfilter_lock);
+/* Send SIGKILL to the helper task identified by info->pid (if that task
+ * can still be found) and drop the references on both pipe files.
+ */
+static void shutdown_umh(struct umh_info *info)
+{
+ struct task_struct *tsk;
+
+ tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
+ if (tsk)
+ force_sig(SIGKILL, tsk);
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+}
+/* Clear the bpfilter_process_sockopt hook and shut the helper down, but
+ * only when the hook is currently set; otherwise this does nothing.
+ * The callers in this file invoke it with bpfilter_lock held.
+ */
+static void __stop_umh(void)
+{
+ if (IS_ENABLED(CONFIG_INET) &&
+ bpfilter_process_sockopt) {
+ bpfilter_process_sockopt = NULL;
+ shutdown_umh(&info);
+ }
+}
+/* Locked wrapper: run __stop_umh() while holding bpfilter_lock. */
+static void stop_umh(void)
+{
+ mutex_lock(&bpfilter_lock);
+ __stop_umh();
+ mutex_unlock(&bpfilter_lock);
+}
+
+/* Forward one sockopt request to the user mode helper and return the
+ * status carried in its reply.
+ *
+ * The request is written to info.pipe_to_umh and the reply is read back
+ * from info.pipe_from_umh, both under bpfilter_lock so that concurrent
+ * callers cannot interleave their messages. A short write or read stops
+ * the helper through __stop_umh() and yields -EFAULT.
+ *
+ * @sk is not used here. @optval is forwarded to the helper as a raw
+ * address, together with @optlen, @optname, @is_set and the caller's pid.
+ */
+static int __bpfilter_process_sockopt(struct sock *sk, int optname,
+ char __user *optval,
+ unsigned int optlen, bool is_set)
+{
+ struct mbox_request req;
+ struct mbox_reply reply;
+ /* must be initialised: __kernel_write() is handed a pointer to it */
+ loff_t pos = 0;
+ ssize_t n;
+ int ret;
+
+ req.is_set = is_set;
+ req.pid = current->pid;
+ req.cmd = optname;
+ req.addr = (long)optval;
+ req.len = optlen;
+ mutex_lock(&bpfilter_lock);
+ n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
+ if (n != sizeof(req)) {
+ pr_err("write fail %zd\n", n);
+ __stop_umh();
+ ret = -EFAULT;
+ goto out;
+ }
+ /* start the read from a fresh offset as well */
+ pos = 0;
+ n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
+ if (n != sizeof(reply)) {
+ pr_err("read fail %zd\n", n);
+ __stop_umh();
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = reply.status;
+out:
+ mutex_unlock(&bpfilter_lock);
+ return ret;
+}
+
+/* Module init: start the embedded user mode helper, check that it
+ * answers a request, and only then install the sockopt hook.
+ *
+ * Returns 0 on success, the error from fork_usermode_blob(), or -EFAULT
+ * when the helper fails the health check.
+ */
+static int __init load_umh(void)
+{
+ int err;
+
+ /* fork usermode process */
+ err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+ if (err)
+ return err;
+ pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
+
+ /* health check that usermode process started correctly */
+ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
+ /* The hook is only installed below, so __stop_umh() (and
+ * therefore stop_umh()) would return without doing
+ * anything here. Shut the helper down directly so the
+ * process is killed and both pipe files are released.
+ */
+ shutdown_umh(&info);
+ return -EFAULT;
+ }
+ if (IS_ENABLED(CONFIG_INET))
+ bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+
+ return 0;
+}
+/* Module exit: stop the helper through stop_umh(). */
+static void __exit fini_umh(void)
+{
+ stop_umh();
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..1317f108df8a
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "include/uapi/linux/bpf.h"
+#include <asm/unistd.h>
+#include "msgfmt.h"
+
+int debug_fd;
+
+/* Answer a "get" request: command 0 succeeds, every other command is
+ * reported as -ENOPROTOOPT.
+ */
+static int handle_get_cmd(struct mbox_request *cmd)
+{
+ if (cmd->cmd == 0)
+ return 0;
+
+ return -ENOPROTOOPT;
+}
+/* Answer a "set" request: always -ENOPROTOOPT; @cmd is not examined. */
+static int handle_set_cmd(struct mbox_request *cmd)
+{
+ return -ENOPROTOOPT;
+}
+/* Serve requests until one cannot be read or answered in full: read a
+ * struct mbox_request from fd 0, dispatch it to the set or get handler
+ * according to is_set, and write the struct mbox_reply to fd 1.
+ */
+static void loop(void)
+{
+ while (1) {
+ struct mbox_request req;
+ struct mbox_reply reply;
+ int n;
+ /* anything other than a full-sized request ends the loop */
+ n = read(0, &req, sizeof(req));
+ if (n != sizeof(req)) {
+ dprintf(debug_fd, "invalid request %d\n", n);
+ return;
+ }
+ /* the handler's return value becomes the reply status */
+ reply.status = req.is_set ?
+ handle_set_cmd(&req) :
+ handle_get_cmd(&req);
+ /* a short write of the reply also ends the loop */
+ n = write(1, &reply, sizeof(reply));
+ if (n != sizeof(reply)) {
+ dprintf(debug_fd, "reply failed %d\n", n);
+ return;
+ }
+ }
+}
+
+/* Entry point of the user mode helper: open the console for debug
+ * output, announce startup, then serve requests until loop() returns.
+ * The result of open() is not checked; debug_fd is used as returned.
+ */
+int main(void)
+{
+ /* O_RDWR from <fcntl.h> rather than its raw octal value */
+ debug_fd = open("/dev/console", O_RDWR);
+ dprintf(debug_fd, "Started bpfilter\n");
+ loop();
+ close(debug_fd);
+ return 0;
+}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..98d121c62945
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_BPFILTER_MSGFMT_H
+#define _NET_BPFILTER_MSGFMT_H
+/* Request message the kernel side writes to the user mode helper. */
+struct mbox_request {
+ __u64 addr; /* caller's optval pointer, cast to an integer */
+ __u32 len; /* optlen */
+ __u32 is_set; /* non-zero for a set request, zero for a get */
+ __u32 cmd; /* optname */
+ __u32 pid; /* current->pid of the requesting task */
+};
+/* Reply message the helper writes back for each request. */
+struct mbox_reply {
+ __u32 status; /* handler result (0 or a negative errno); the kernel side returns it as an int */
+};
+
+#endif
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 671d13c10f6f..b0a0b82e2d91 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
+ bool notified = false;
bool changed_addr;
int err;
@@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
case NETDEV_CHANGE:
- br_port_carrier_check(p);
+ br_port_carrier_check(p, &notified);
break;
case NETDEV_FEAT_CHANGE:
@@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
case NETDEV_DOWN:
spin_lock_bh(&br->lock);
- if (br->dev->flags & IFF_UP)
+ if (br->dev->flags & IFF_UP) {
br_stp_disable_port(p);
+ notified = true;
+ }
spin_unlock_bh(&br->lock);
break;
@@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
if (netif_running(br->dev) && netif_oper_up(dev)) {
spin_lock_bh(&br->lock);
br_stp_enable_port(p);
+ notified = true;
spin_unlock_bh(&br->lock);
}
break;
@@ -110,8 +114,8 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
}
/* Events that may cause spanning tree to refresh */
- if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
- event == NETDEV_CHANGE || event == NETDEV_DOWN)
+ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+ event == NETDEV_CHANGE || event == NETDEV_DOWN))
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
return NOTIFY_DONE;
@@ -141,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_ADD_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err) {
err = notifier_from_errno(err);
break;
@@ -152,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_del(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, false);
if (err)
err = notifier_from_errno(err);
break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..b19e3104afd6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *, int);
+ const struct net_bridge_fdb_entry *, int, bool);
int __init br_fdb_init(void)
{
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
return fdb;
}
+/* Return the port device that the FDB entry for @addr/@vid on bridge
+ * @br_dev points at. The result is NULL when @br_dev is not a bridge
+ * master, when no entry is found, or when the entry has no destination
+ * port. RTNL must be held (asserted).
+ */
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *entry;
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (!netif_is_bridge_master(br_dev))
+ return NULL;
+
+ br = netdev_priv(br_dev);
+ entry = br_fdb_find(br, addr, vid);
+ if (!entry || !entry->dst)
+ return NULL;
+
+ return entry->dst->dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
const unsigned char *addr,
__u16 vid)
@@ -173,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
}
}
-static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
+ bool swdev_notify)
{
trace_fdb_delete(br, f);
@@ -183,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
hlist_del_init_rcu(&f->fdb_node);
rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
br_fdb_rht_params);
- fdb_notify(br, f, RTM_DELNEIGH);
+ fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
call_rcu(&f->rcu, fdb_rcu_free);
}
@@ -219,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br,
return;
}
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
void br_fdb_find_delete_local(struct net_bridge *br,
@@ -334,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work)
} else {
spin_lock_bh(&br->hash_lock);
if (!hlist_unhashed(&f->fdb_node))
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
spin_unlock_bh(&br->hash_lock);
}
}
@@ -354,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br)
spin_lock_bh(&br->hash_lock);
hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
if (!f->is_static)
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -383,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
if (f->is_local)
fdb_delete_local(br, p, f);
else
- fdb_delete(br, f);
+ fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
}
@@ -509,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return 0;
br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
source ? source->dev->name : br->dev->name, addr, vid);
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
}
fdb = fdb_create(br, source, addr, vid, 1, 1);
@@ -517,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return -ENOMEM;
fdb_add_hw_addr(br, addr);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
return 0;
}
@@ -572,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
if (unlikely(fdb_modified)) {
trace_br_fdb_update(br, source, addr, vid, added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
}
} else {
@@ -583,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
fdb->added_by_user = 1;
trace_br_fdb_update(br, source, addr, vid,
added_by_user);
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
/* else we lose race and someone else inserts
* it first, don't bother updating
@@ -665,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void)
}
static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb, int type)
+ const struct net_bridge_fdb_entry *fdb, int type,
+ bool swdev_notify)
{
struct net *net = dev_net(br->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- br_switchdev_fdb_notify(fdb, type);
+ if (swdev_notify)
+ br_switchdev_fdb_notify(fdb, type);
skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
@@ -810,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
fdb->used = jiffies;
if (modified) {
fdb->updated = jiffies;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
return 0;
@@ -834,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
rcu_read_unlock();
local_bh_enable();
} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
- err = br_fdb_external_learn_add(br, p, addr, vid);
+ err = br_fdb_external_learn_add(br, p, addr, vid, true);
} else {
spin_lock_bh(&br->hash_lock);
err = fdb_add_entry(br, p, addr, ndm->ndm_state,
@@ -923,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
if (!fdb || fdb->dst != p)
return -ENOENT;
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, true);
return 0;
}
@@ -1043,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -1061,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
goto err_unlock;
}
fdb->added_by_external_learn = 1;
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
fdb->updated = jiffies;
@@ -1080,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
}
if (modified)
- fdb_notify(br, fdb, RTM_NEWNEIGH);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
}
err_unlock:
@@ -1090,7 +1116,8 @@ err_unlock:
}
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
int err = 0;
@@ -1099,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
fdb = br_fdb_find(br, addr, vid);
if (fdb && fdb->added_by_external_learn)
- fdb_delete(br, fdb);
+ fdb_delete(br, fdb, swdev_notify);
else
err = -ENOENT;
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index b4eed113d2ec..9019f326fe81 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
- nbp_switchdev_allowed_egress(p, skb);
+ nbp_switchdev_allowed_egress(p, skb) &&
+ !br_skb_isolated(p, skb);
}
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -274,8 +275,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge_port *port, *lport, *rport;
lport = p ? p->port : NULL;
- rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
- NULL;
+ rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
if ((unsigned long)lport > (unsigned long)rport) {
port = lport;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 5bb6681fa91e..05e42d86882d 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev)
/* Check for port carrier transitions. */
-void br_port_carrier_check(struct net_bridge_port *p)
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
{
struct net_device *dev = p->dev;
struct net_bridge *br = p->br;
@@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p)
netif_running(dev) && netif_oper_up(dev))
p->path_cost = port_cost(dev);
+ *notified = false;
if (!netif_running(br->dev))
return;
spin_lock_bh(&br->lock);
if (netif_running(dev) && netif_oper_up(dev)) {
- if (p->state == BR_STATE_DISABLED)
+ if (p->state == BR_STATE_DISABLED) {
br_stp_enable_port(p);
+ *notified = true;
+ }
} else {
- if (p->state != BR_STATE_DISABLED)
+ if (p->state != BR_STATE_DISABLED) {
br_stp_disable_port(p);
+ *notified = true;
+ }
}
spin_unlock_bh(&br->lock);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f98a7d25866..72074276c088 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+ BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
if (IS_ENABLED(CONFIG_INET) &&
(skb->protocol == htons(ETH_P_ARP) ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 015f465c514b..9f5eb05b0373 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */
+ nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
+ nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
+ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
BR_VLAN_TUNNEL)) ||
nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
- !!(p->flags & BR_NEIGH_SUPPRESS)))
+ !!(p->flags & BR_NEIGH_SUPPRESS)) ||
+ nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
+ [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
};
/* Change the state of the port and notify spanning tree */
@@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (err)
return err;
+ err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+ if (err)
+ return err;
+
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..5216a524b537 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -423,6 +423,7 @@ struct br_input_skb_cb {
#endif
bool proxyarp_replied;
+ bool src_port_isolated;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool vlan_filtered;
@@ -553,9 +554,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid,
+ bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
@@ -572,8 +575,16 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
+/* Return true only when the skb's source port was marked isolated on
+ * input AND the destination port @to is isolated as well.
+ */
+static inline bool br_skb_isolated(const struct net_bridge_port *to,
+ const struct sk_buff *skb)
+{
+ if (!BR_INPUT_SKB_CB(skb)->src_port_isolated)
+ return false;
+
+ return !!(to->flags & BR_ISOLATED);
+}
+
/* br_if.c */
-void br_port_carrier_check(struct net_bridge_port *p);
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
int br_add_bridge(struct net *net, const char *name);
int br_del_bridge(struct net *net, const char *name);
int br_add_if(struct net_bridge *br, struct net_device *dev,
@@ -594,11 +605,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
return rcu_dereference(dev->rx_handler) == br_handle_frame;
}
+/* Return true when @dev's rx_handler, read with rcu_dereference_rtnl(),
+ * is br_handle_frame.
+ */
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+ if (rcu_dereference_rtnl(dev->rx_handler) != br_handle_frame)
+ return false;
+
+ return true;
+}
+
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
{
return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
}
+/* Return @dev's bridge port via br_port_get_rtnl_rcu(), or NULL when
+ * br_rx_handler_check_rtnl() says @dev's rx_handler is not the bridge's.
+ */
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+ if (!br_rx_handler_check_rtnl(dev))
+ return NULL;
+
+ return br_port_get_rtnl_rcu(dev);
+}
+
/* br_ioctl.c */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
@@ -1117,6 +1139,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long mask);
void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
int type);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
{
@@ -1146,6 +1170,17 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
return 0;
}
+static inline int br_switchdev_port_vlan_add(struct net_device *dev,
+ u16 vid, u16 flags)
+{
+ return -EOPNOTSUPP; /* stub variant: nothing is programmed, always "not supported" */
+}
+
+static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ return -EOPNOTSUPP; /* stub variant: nothing is removed, always "not supported" */
+}
+
static inline void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee775f4ff76c..d77f807420c4 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
static void
br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
- u16 vid, struct net_device *dev)
+ u16 vid, struct net_device *dev,
+ bool added_by_user)
{
struct switchdev_notifier_fdb_info info;
unsigned long notifier_type;
info.addr = mac;
info.vid = vid;
+ info.added_by_user = added_by_user;
notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
call_switchdev_notifiers(notifier_type, dev, &info.info);
}
@@ -116,19 +118,46 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
- if (!fdb->added_by_user || !fdb->dst)
+ if (!fdb->dst)
return;
switch (type) {
case RTM_DELNEIGH:
br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
case RTM_NEWNEIGH:
br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
fdb->key.vlan_id,
- fdb->dst->dev);
+ fdb->dst->dev,
+ fdb->added_by_user);
break;
}
}
+/* Add VLAN @vid with @flags on @dev through switchdev: a single-VID
+ * switchdev_obj_port_vlan (vid_begin == vid_end == @vid) is passed to
+ * switchdev_port_obj_add() and its result is returned unchanged.
+ */
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = flags,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
+
+ return switchdev_port_obj_add(dev, &v.obj);
+}
+/* Remove VLAN @vid from @dev through switchdev: a single-VID
+ * switchdev_obj_port_vlan (no flags set) is passed to
+ * switchdev_port_obj_del() and its result is returned unchanged.
+ */
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
+
+ return switchdev_port_obj_del(dev, &v.obj);
+}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index fd31ad83ec7b..f99c5bf5c906 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
+BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = {
&brport_attr_broadcast_flood,
&brport_attr_group_fwd_mask,
&brport_attr_neigh_suppress,
+ &brport_attr_isolated,
NULL
};
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..7df269092103 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -82,19 +82,12 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
u16 vid, u16 flags)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q add.
*/
- err = switchdev_port_obj_add(dev, &v.obj);
+ err = br_switchdev_port_vlan_add(dev, vid, flags);
if (err == -EOPNOTSUPP)
return vlan_vid_add(dev, br->vlan_proto, vid);
return err;
@@ -130,18 +123,12 @@ static void __vlan_del_list(struct net_bridge_vlan *v)
static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
u16 vid)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .vid_begin = vid,
- .vid_end = vid,
- };
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q del.
*/
- err = switchdev_port_obj_del(dev, &v.obj);
+ err = br_switchdev_port_vlan_del(dev, vid);
if (err == -EOPNOTSUPP) {
vlan_vid_del(dev, br->vlan_proto, vid);
return 0;
@@ -259,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
goto out_filt;
v->brvlan = masterv;
v->stats = masterv->stats;
+ } else {
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
}
/* Add the dev mac and count the vlan only if it's usable */
@@ -294,6 +285,8 @@ out_filt:
br_vlan_put_master(masterv);
v->brvlan = NULL;
}
+ } else {
+ br_switchdev_port_vlan_del(dev, v->vid);
}
goto out;
@@ -319,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v)
err = __vlan_vid_del(p->dev, p->br, v->vid);
if (err)
goto out;
+ } else {
+ err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ err = 0;
}
if (br_vlan_should_use(v)) {
@@ -564,6 +562,48 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
return false;
}
+static int br_vlan_add_existing(struct net_bridge *br,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *vlan,
+ u16 flags, bool *changed)
+{
+ int err;
+
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ if (!br_vlan_is_brentry(vlan)) {
+ /* Trying to change flags of non-existent bridge vlan */
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
+ err = -EINVAL;
+ goto err_flags;
+ }
+ /* It was only kept for port vlans, now make it real */
+ err = br_fdb_insert(br, NULL, br->dev->dev_addr,
+ vlan->vid);
+ if (err) {
+ br_err(br, "failed to insert local address into bridge forwarding table\n");
+ goto err_fdb_insert;
+ }
+
+ refcount_inc(&vlan->refcnt);
+ vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+ vg->num_vlans++;
+ *changed = true;
+ }
+
+ if (__vlan_add_flags(vlan, flags))
+ *changed = true;
+
+ return 0;
+
+err_fdb_insert:
+err_flags:
+ br_switchdev_port_vlan_del(br->dev, vlan->vid);
+ return err;
+}
+
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
* changed must be true only if the vlan was created or updated
@@ -579,28 +619,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
*changed = false;
vg = br_vlan_group(br);
vlan = br_vlan_find(vg, vid);
- if (vlan) {
- if (!br_vlan_is_brentry(vlan)) {
- /* Trying to change flags of non-existent bridge vlan */
- if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
- return -EINVAL;
- /* It was only kept for port vlans, now make it real */
- ret = br_fdb_insert(br, NULL, br->dev->dev_addr,
- vlan->vid);
- if (ret) {
- br_err(br, "failed insert local address into bridge forwarding table\n");
- return ret;
- }
- refcount_inc(&vlan->refcnt);
- vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
- vg->num_vlans++;
- *changed = true;
- }
- if (__vlan_add_flags(vlan, flags))
- *changed = true;
-
- return 0;
- }
+ if (vlan)
+ return br_vlan_add_existing(br, vg, vlan, flags, changed);
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
if (!vlan)
@@ -1053,13 +1073,6 @@ err_vlan_enabled:
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
bool *changed)
{
- struct switchdev_obj_port_vlan v = {
- .obj.orig_dev = port->dev,
- .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
- };
struct net_bridge_vlan *vlan;
int ret;
@@ -1069,7 +1082,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
/* Pass the flags to the hardware bridge */
- ret = switchdev_port_obj_add(port->dev, &v.obj);
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
if (ret && ret != -EOPNOTSUPP)
return ret;
*changed = __vlan_add_flags(vlan, flags);
@@ -1149,3 +1162,44 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
stats->tx_packets += txpackets;
}
}
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+ struct net_bridge_vlan_group *vg;
+
+ ASSERT_RTNL();
+ if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ *p_pvid = br_get_pvid(vg);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+ struct bridge_vlan_info *p_vinfo)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
+ vg = br_vlan_group(netdev_priv(dev));
+ else
+ return -EINVAL;
+
+ v = br_vlan_find(vg, vid);
+ if (!v)
+ return -ENOENT;
+
+ p_vinfo->vid = vid;
+ p_vinfo->flags = v->flags;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index f212447794bd..9a0159aebe1a 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE
bool "Ethernet Bridge nf_tables support"
if NF_TABLES_BRIDGE
-
-config NFT_BRIDGE_META
- tristate "Netfilter nf_table bridge meta support"
- depends on NFT_META
- help
- Add support for bridge dedicated meta key.
-
config NFT_BRIDGE_REJECT
tristate "Netfilter nf_tables bridge reject support"
depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 4bc758dd4a8c..9b868861f21a 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,7 +3,6 @@
# Makefile for the netfilter modules for Link Layer filtering on a bridge.
#
-obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
# packet logging
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e27c51331fb..28f68a2ec911 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
{
par->match = m->u.match;
par->matchinfo = m->data;
- return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+ return !m->u.match->match(skb, par);
}
static inline int
@@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
return (void *)entry + entry->next_offset;
}
+static inline const struct ebt_entry_target *
+ebt_get_target_c(const struct ebt_entry *e)
+{
+ return ebt_get_target((struct ebt_entry *)e);
+}
+
/* Do some firewalling */
unsigned int ebt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
@@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
*/
EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
- t = (struct ebt_entry_target *)
- (((char *)point) + point->target_offset);
+ t = ebt_get_target_c(point);
/* standard target */
if (!t->u.target->target)
verdict = ((struct ebt_standard_target *)t)->verdict;
@@ -343,6 +348,16 @@ find_table_lock(struct net *net, const char *name, int *error,
"ebtable_", error, mutex);
}
+static inline void ebt_free_table_info(struct ebt_table_info *info)
+{
+ int i;
+
+ if (info->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(info->chainstack[i]);
+ vfree(info->chainstack);
+ }
+}
static inline int
ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
unsigned int *cnt)
@@ -627,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
return 1;
EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target(e);
par.net = net;
par.target = t->u.target;
@@ -706,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
if (ret != 0)
goto cleanup_watchers;
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target(e);
gap = e->next_offset - e->target_offset;
target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
@@ -779,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
if (pos == nentries)
continue;
}
- t = (struct ebt_entry_target *)
- (((char *)e) + e->target_offset);
+ t = ebt_get_target_c(e);
if (strcmp(t->u.name, EBT_STANDARD_TARGET))
goto letscontinue;
if (e->target_offset + sizeof(struct ebt_standard_target) >
@@ -975,7 +989,7 @@ static void get_counters(const struct ebt_counter *oldcounters,
static int do_replace_finish(struct net *net, struct ebt_replace *repl,
struct ebt_table_info *newinfo)
{
- int ret, i;
+ int ret;
struct ebt_counter *counterstmp = NULL;
/* used to be able to unlock earlier */
struct ebt_table_info *table;
@@ -1051,13 +1065,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
ebt_cleanup_entry, net, NULL);
vfree(table->entries);
- if (table->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->chainstack[i]);
- vfree(table->chainstack);
- }
+ ebt_free_table_info(table);
vfree(table);
-
vfree(counterstmp);
#ifdef CONFIG_AUDIT
@@ -1078,11 +1087,7 @@ free_iterate:
free_counterstmp:
vfree(counterstmp);
/* can be initialized in translate_table() */
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
+ ebt_free_table_info(newinfo);
return ret;
}
@@ -1147,8 +1152,6 @@ free_newinfo:
static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
{
- int i;
-
mutex_lock(&ebt_mutex);
list_del(&table->list);
mutex_unlock(&ebt_mutex);
@@ -1157,11 +1160,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
if (table->private->nentries)
module_put(table->me);
vfree(table->private->entries);
- if (table->private->chainstack) {
- for_each_possible_cpu(i)
- vfree(table->private->chainstack[i]);
- vfree(table->private->chainstack);
- }
+ ebt_free_table_info(table->private);
vfree(table->private);
kfree(table);
}
@@ -1263,11 +1262,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
free_unlock:
mutex_unlock(&ebt_mutex);
free_chainstack:
- if (newinfo->chainstack) {
- for_each_possible_cpu(i)
- vfree(newinfo->chainstack[i]);
- vfree(newinfo->chainstack);
- }
+ ebt_free_table_info(newinfo);
vfree(newinfo->entries);
free_newinfo:
vfree(newinfo);
@@ -1405,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
return -EFAULT;
hlp = ubase + (((char *)e + e->target_offset) - base);
- t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ t = ebt_get_target_c(e);
ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
if (ret != 0)
@@ -1746,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
return ret;
target_offset = e->target_offset - (origsize - *size);
- t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+ t = ebt_get_target(e);
ret = compat_target_to_user(t, dstptr, size);
if (ret)
@@ -1794,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
EBT_MATCH_ITERATE(e, compat_calc_match, &off);
EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
- t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+ t = ebt_get_target_c(e);
off += xt_compat_target_offset(t->u.target);
off += ebt_compat_entry_padsize();
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
deleted file mode 100644
index bb63c9aed55d..000000000000
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2014 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_meta.h>
-
-#include "../br_private.h"
-
-static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- const struct nft_meta *priv = nft_expr_priv(expr);
- const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
- u32 *dest = &regs->data[priv->dreg];
- const struct net_bridge_port *p;
-
- switch (priv->key) {
- case NFT_META_BRI_IIFNAME:
- if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
- goto err;
- break;
- case NFT_META_BRI_OIFNAME:
- if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
- goto err;
- break;
- default:
- goto out;
- }
-
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
-out:
- return nft_meta_get_eval(expr, regs, pkt);
-err:
- regs->verdict.code = NFT_BREAK;
-}
-
-static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_meta *priv = nft_expr_priv(expr);
- unsigned int len;
-
- priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
- switch (priv->key) {
- case NFT_META_BRI_IIFNAME:
- case NFT_META_BRI_OIFNAME:
- len = IFNAMSIZ;
- break;
- default:
- return nft_meta_get_init(ctx, expr, tb);
- }
-
- priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
-}
-
-static struct nft_expr_type nft_meta_bridge_type;
-static const struct nft_expr_ops nft_meta_bridge_get_ops = {
- .type = &nft_meta_bridge_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
- .eval = nft_meta_bridge_get_eval,
- .init = nft_meta_bridge_get_init,
- .dump = nft_meta_get_dump,
-};
-
-static const struct nft_expr_ops nft_meta_bridge_set_ops = {
- .type = &nft_meta_bridge_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
- .eval = nft_meta_set_eval,
- .init = nft_meta_set_init,
- .destroy = nft_meta_set_destroy,
- .dump = nft_meta_set_dump,
- .validate = nft_meta_set_validate,
-};
-
-static const struct nft_expr_ops *
-nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
- const struct nlattr * const tb[])
-{
- if (tb[NFTA_META_KEY] == NULL)
- return ERR_PTR(-EINVAL);
-
- if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
- return ERR_PTR(-EINVAL);
-
- if (tb[NFTA_META_DREG])
- return &nft_meta_bridge_get_ops;
-
- if (tb[NFTA_META_SREG])
- return &nft_meta_bridge_set_ops;
-
- return ERR_PTR(-EINVAL);
-}
-
-static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
- .family = NFPROTO_BRIDGE,
- .name = "meta",
- .select_ops = nft_meta_bridge_select_ops,
- .policy = nft_meta_policy,
- .maxattr = NFTA_META_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_meta_bridge_module_init(void)
-{
- return nft_register_expr(&nft_meta_bridge_type);
-}
-
-static void __exit nft_meta_bridge_module_exit(void)
-{
- nft_unregister_expr(&nft_meta_bridge_type);
-}
-
-module_init(nft_meta_bridge_module_init);
-module_exit(nft_meta_bridge_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
fib_notifier.o xdp.o
obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
@@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c149238a4ce..6e18242a1cae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
+EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- };
+ }
#undef N
return "UNKNOWN_NETDEV_EVENT";
}
@@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
-static struct static_key ingress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
void net_inc_ingress_queue(void)
{
- static_key_slow_inc(&ingress_needed);
+ static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
void net_dec_ingress_queue(void)
{
- static_key_slow_dec(&ingress_needed);
+ static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
#ifdef CONFIG_NET_EGRESS
-static struct static_key egress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
void net_inc_egress_queue(void)
{
- static_key_slow_inc(&egress_needed);
+ static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
void net_dec_egress_queue(void)
{
- static_key_slow_dec(&egress_needed);
+ static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static struct static_key netstamp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)
wanted = atomic_add_return(deferred, &netstamp_wanted);
if (wanted > 0)
- static_key_enable(&netstamp_needed);
+ static_branch_enable(&netstamp_needed_key);
else
- static_key_disable(&netstamp_needed);
+ static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
@@ -1818,7 +1819,7 @@ void net_enable_timestamp(void)
atomic_inc(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_inc(&netstamp_needed);
+ static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1838,7 +1839,7 @@ void net_disable_timestamp(void)
atomic_dec(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_dec(&netstamp_needed);
+ static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
@@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
- if (static_key_false(&netstamp_needed))
+ if (static_branch_unlikely(&netstamp_needed_key))
__net_timestamp(skb);
}
-#define net_timestamp_check(COND, SKB) \
- if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
- } \
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch_unlikely(&netstamp_needed_key)) { \
+ if ((COND) && !(SKB)->tstamp) \
+ __net_timestamp(SKB); \
+ } \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
- u16 qcount = num_tx_queues;
+ u16 qcount = dev->real_num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
return hash;
}
@@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
@@ -3095,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
@@ -3223,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_DROP;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- __qdisc_run(q);
+ qdisc_run(q);
}
if (unlikely(to_free))
@@ -3511,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
- if (static_key_false(&egress_needed)) {
+ if (static_branch_unlikely(&egress_needed_key)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
@@ -3606,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *orig_skb = skb;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ skb = validate_xmit_skb_list(skb, dev, &again);
+ if (skb != orig_skb)
+ goto drop;
+
+ skb_set_queue_mapping(skb, queue_id);
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -3975,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct netdev_rx_queue *rxqueue;
+ void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
- struct xdp_buff xdp;
- void *orig_data;
int hlen, off;
u32 mac_len;
@@ -4015,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
*/
mac_len = skb->data - skb_mac_header(skb);
hlen = skb_headlen(skb) + mac_len;
- xdp.data = skb->data - mac_len;
- xdp.data_meta = xdp.data;
- xdp.data_end = xdp.data + hlen;
- xdp.data_hard_start = skb->data - skb_headroom(skb);
- orig_data = xdp.data;
+ xdp->data = skb->data - mac_len;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + hlen;
+ xdp->data_hard_start = skb->data - skb_headroom(skb);
+ orig_data_end = xdp->data_end;
+ orig_data = xdp->data;
rxqueue = netif_get_rxqueue(skb);
- xdp.rxq = &rxqueue->xdp_rxq;
+ xdp->rxq = &rxqueue->xdp_rxq;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
- off = xdp.data - orig_data;
+ off = xdp->data - orig_data;
if (off > 0)
__skb_pull(skb, off);
else if (off < 0)
__skb_push(skb, -off);
skb->mac_header += off;
+ /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+ * pckt.
+ */
+ off = orig_data_end - xdp->data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
+ skb->len -= off;
+
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
__skb_push(skb, mac_len);
break;
case XDP_PASS:
- metalen = xdp.data - xdp.data_meta;
+ metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
break;
@@ -4084,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
}
EXPORT_SYMBOL_GPL(generic_xdp_tx);
-static struct static_key generic_xdp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
if (xdp_prog) {
- u32 act = netif_receive_generic_xdp(skb, xdp_prog);
+ struct xdp_buff xdp;
+ u32 act;
int err;
+ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
err = xdp_do_generic_redirect(skb->dev, skb,
- xdp_prog);
+ &xdp, xdp_prog);
if (err)
goto out_redir;
- /* fallthru to submit skb */
+ break;
case XDP_TX:
generic_xdp_tx(skb, xdp_prog);
break;
@@ -4122,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -4494,7 +4548,7 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
- if (static_key_false(&ingress_needed)) {
+ if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
@@ -4654,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
bpf_prog_put(old);
if (old && !new) {
- static_key_slow_dec(&generic_xdp_needed);
+ static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
- static_key_slow_inc(&generic_xdp_needed);
+ static_branch_inc(&generic_xdp_needed_key);
dev_disable_lro(dev);
dev_disable_gro_hw(dev);
}
@@ -4684,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -7852,6 +7906,8 @@ int register_netdevice(struct net_device *dev)
int ret;
struct net *net = dev_net(dev);
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..22099705cc41 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!attrs->set)
+ return 0;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->split_subport_number))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_port *devlink_port,
enum devlink_command cmd, u32 portid,
@@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
ibdev->name))
goto nla_put_failure;
}
- if (devlink_port->split &&
- nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
- devlink_port->split_group))
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
return 0;
}
-static int devlink_port_split(struct devlink *devlink,
- u32 port_index, u32 count)
+static int devlink_port_split(struct devlink *devlink, u32 port_index,
+ u32 count, struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_split)
- return devlink->ops->port_split(devlink, port_index, count);
+ return devlink->ops->port_split(devlink, port_index, count,
+ extack);
return -EOPNOTSUPP;
}
@@ -705,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
- return devlink_port_split(devlink, port_index, count);
+ return devlink_port_split(devlink, port_index, count, info->extack);
}
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
+ struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_unsplit)
- return devlink->ops->port_unsplit(devlink, port_index);
+ return devlink->ops->port_unsplit(devlink, port_index, extack);
return -EOPNOTSUPP;
}
@@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
return -EINVAL;
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- return devlink_port_unsplit(devlink, port_index);
+ return devlink_port_unsplit(devlink, port_index, info->extack);
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -1807,7 +1828,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
return 0;
nla_put_failure:
- genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);
nlmsg_free(dump_ctx->skb);
return -EMSGSIZE;
}
@@ -2230,7 +2249,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2532,7 +2550,6 @@ nla_put_failure:
err = -EMSGSIZE;
err_resource_put:
err_skb_send_alloc:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
return err;
}
- return devlink->ops->reload(devlink);
+ return devlink->ops->reload(devlink, info->extack);
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_eswitch_set_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
@@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
/**
- * devlink_port_split_set - Set port is split
+ * devlink_port_attrs_set - Set port attributes
*
* @devlink_port: devlink port
- * @split_group: split group - identifies group split port is part of
+ * @flavour: flavour of the port
+ * @port_number: number of the port that is facing user, for example
+ * the front panel port number
+ * @split: indicates if this is split port
+ * @split_subport_number: if the port is split, this is the number
+ * of subport.
*/
-void devlink_port_split_set(struct devlink_port *devlink_port,
- u32 split_group)
-{
- devlink_port->split = true;
- devlink_port->split_group = split_group;
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ u32 port_number, bool split,
+ u32 split_subport_number)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ attrs->set = true;
+ attrs->flavour = flavour;
+ attrs->port_number = port_number;
+ attrs->split = split;
+ attrs->split_subport_number = split_subport_number;
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-EXPORT_SYMBOL_GPL(devlink_port_split_set);
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!attrs->set)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ if (!attrs->split)
+ n = snprintf(name, len, "p%u", attrs->port_number);
+ else
+ n = snprintf(name, len, "p%us%u", attrs->port_number,
+ attrs->split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ /* As CPU and DSA ports do not have a netdevice associated
+ * case should not ever happen.
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
diff --git a/net/core/dst.c b/net/core/dst.c
index 007aa0b08291..2d9b37f8944a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {
*/
.refcnt = REFCOUNT_INIT(1),
};
+EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ba02f0dfe85c..c15075dc7572 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
};
static const char
@@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int phy_get_sset_count(struct phy_device *phydev)
-{
- int ret;
-
- if (phydev->drv->get_sset_count &&
- phydev->drv->get_strings &&
- phydev->drv->get_stats) {
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_sset_count(phydev);
- mutex_unlock(&phydev->lock);
-
- return ret;
- }
-
- return -EOPNOTSUPP;
-}
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_TUNABLES)
return ARRAY_SIZE(phy_tunable_strings);
- if (sset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- return phy_get_sset_count(dev->phydev);
- else
- return -EOPNOTSUPP;
- }
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,
memcpy(data, tunable_strings, sizeof(tunable_strings));
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
- else if (stringset == ETH_SS_PHY_STATS) {
- struct phy_device *phydev = dev->phydev;
-
- if (phydev) {
- mutex_lock(&phydev->lock);
- phydev->drv->get_strings(phydev, data);
- mutex_unlock(&phydev->lock);
- } else {
- return;
- }
- } else
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
u64 *data;
int ret, n_stats;
- if (!phydev)
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
return -EOPNOTSUPP;
- n_stats = phy_get_sset_count(phydev);
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- phydev->drv->get_stats(phydev, &stats, data);
- mutex_unlock(&phydev->lock);
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..4a92a98ccce9
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+/* Find the registered failover instance whose netdev has permanent
+ * address @mac.
+ *
+ * Walks failover_list while holding failover_lock. On a match the instance's
+ * ops pointer is stored in *@ops and its failover netdev is returned; if no
+ * instance matches, NULL is returned and *@ops is left untouched.
+ *
+ * The pointers are read with rtnl_dereference(); the callers in this file
+ * call ASSERT_RTNL() before invoking this helper.
+ */
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ /* Drop the lock before handing the match back. */
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported. The failover instance is looked up by the slave's
+ * permanent MAC address.
+ *
+ * Return: NOTIFY_OK when the slave was linked to the failover netdev and the
+ * slave_register callback succeeded, NOTIFY_DONE in every other case.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+	struct netdev_lag_upper_info lag_upper_info;
+	struct net_device *failover_dev;
+	struct failover_ops *fops;
+	int err;
+
+	if (slave_dev->type != ARPHRD_ETHER)
+		goto done;
+
+	ASSERT_RTNL();
+
+	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+	if (!failover_dev)
+		goto done;
+
+	/* An instance registered without ops provides neither an rx handler
+	 * nor a slave_register callback that could accept the slave, so the
+	 * registration can never succeed. Bail out here instead of
+	 * dereferencing a NULL fops below.
+	 */
+	if (!fops)
+		goto done;
+
+	if (fops->slave_pre_register &&
+	    fops->slave_pre_register(slave_dev, failover_dev))
+		goto done;
+
+	err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+					 failover_dev);
+	if (err) {
+		netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+			   err);
+		goto done;
+	}
+
+	lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+	err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+					   &lag_upper_info, NULL);
+	if (err) {
+		netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+			   failover_dev->name, err);
+		goto err_upper_link;
+	}
+
+	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+	if (fops->slave_register &&
+	    !fops->slave_register(slave_dev, failover_dev))
+		return NOTIFY_OK;
+
+	/* No slave_register callback, or it rejected the slave: undo the
+	 * upper link, the slave flag and the rx handler set up above.
+	 */
+	netdev_upper_dev_unlink(slave_dev, failover_dev);
+	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+err_upper_link:
+	netdev_rx_handler_unregister(slave_dev);
+done:
+	return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance. Devices that are not
+ * flagged as failover slaves, or whose permanent MAC address matches no
+ * registered failover instance, are ignored.
+ *
+ * Return: NOTIFY_OK when the slave was detached and the slave_unregister
+ * callback succeeded, NOTIFY_DONE otherwise.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ /* A non-zero return from the optional pre-unregister hook aborts the
+ * teardown and leaves the slave attached.
+ */
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+/* Forward a link-state event on a failover slave to the owning failover
+ * instance's slave_link_change callback.
+ *
+ * Returns NOTIFY_OK only when the callback exists and returns 0. Returns
+ * NOTIFY_DONE when @slave_dev is not a failover slave, no failover instance
+ * matches its permanent MAC address, the failover netdev is not running, or
+ * the callback is missing or fails.
+ */
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+	struct net_device *master_dev;
+	struct failover_ops *master_ops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		return NOTIFY_DONE;
+
+	ASSERT_RTNL();
+
+	master_dev = failover_get_bymac(slave_dev->perm_addr, &master_ops);
+	if (!master_dev)
+		return NOTIFY_DONE;
+
+	if (!netif_running(master_dev))
+		return NOTIFY_DONE;
+
+	if (!master_ops || !master_ops->slave_link_change)
+		return NOTIFY_DONE;
+
+	if (master_ops->slave_link_change(slave_dev, master_dev))
+		return NOTIFY_DONE;
+
+	return NOTIFY_OK;
+}
+
+/* Forward a rename event on a failover slave to the owning failover
+ * instance's slave_name_change callback.
+ *
+ * Returns NOTIFY_OK only when the callback exists and returns 0. Returns
+ * NOTIFY_DONE when @slave_dev is not a failover slave, no failover instance
+ * matches its permanent MAC address, the failover netdev is not running, or
+ * the callback is missing or fails.
+ */
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+	struct net_device *master_dev;
+	struct failover_ops *master_ops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		return NOTIFY_DONE;
+
+	ASSERT_RTNL();
+
+	master_dev = failover_get_bymac(slave_dev->perm_addr, &master_ops);
+	if (!master_dev)
+		return NOTIFY_DONE;
+
+	if (!netif_running(master_dev))
+		return NOTIFY_DONE;
+
+	if (!master_ops || !master_ops->slave_name_change)
+		return NOTIFY_DONE;
+
+	if (master_ops->slave_name_change(slave_dev, master_dev))
+		return NOTIFY_DONE;
+
+	return NOTIFY_OK;
+}
+
+/* Netdevice notifier callback for the failover module.
+ *
+ * Events raised on a failover netdev itself are ignored. For any other
+ * device, register/unregister, up/down/change and rename events are
+ * dispatched to the matching failover_slave_*() handler and its result is
+ * returned; all other events yield NOTIFY_DONE.
+ */
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ /* UP, DOWN and CHANGE all share the link-change handler. */
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+/* Notifier block that routes netdevice events to failover_event(). */
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+/* Try to enslave netdevs that already exist when a failover instance is
+ * registered.
+ *
+ * Walks every netdev in @failover_dev's network namespace under the RTNL
+ * lock and calls failover_slave_register() for each device that is not
+ * itself a failover netdev and whose permanent MAC address equals that of
+ * @failover_dev. The return value of failover_slave_register() is ignored.
+ */
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events. A reference is taken on @dev and IFF_FAILOVER is set
+ * in its priv_flags. Once the instance is on failover_list, already existing
+ * netdevs with the same permanent MAC address are offered for enslavement.
+ *
+ * Return: pointer to failover instance on success, ERR_PTR(-EINVAL) if @dev
+ * is not an ethernet device, ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ /* Hold the netdev for as long as the instance exists; the matching
+ * dev_put() is in failover_unregister().
+ */
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ /* The instance is on the list now, so lookups by MAC address made
+ * while enslaving existing devices can find it.
+ */
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance: clears IFF_FAILOVER on the
+ * failover netdev, drops the netdev reference taken in failover_register(),
+ * removes the instance from failover_list and frees it.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ /* NOTE(review): rcu_dereference() is used with no rcu_read_lock() visible
+ * in this function - confirm the locking context callers provide.
+ */
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ /* NOTE(review): the netdev reference is dropped before the instance is
+ * taken off failover_list - confirm no lookup can run in between.
+ */
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+/* Module init: hook failover_notifier into netdevice notifications.
+ *
+ * The result of register_netdevice_notifier() is returned so that a failed
+ * registration fails module initialisation instead of leaving the module
+ * loaded without its notifier.
+ */
+static __init int
+failover_init(void)
+{
+	return register_netdevice_notifier(&failover_notifier);
+}
+module_init(failover_init);
+
+/* Module exit: remove failover_notifier from netdevice notifications. */
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c173..126ffc5bc630 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
}
EXPORT_SYMBOL_GPL(fib_rules_seq_read);
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
- struct fib_rules_ops *ops)
-{
- int err = -EINVAL;
-
- if (frh->src_len)
- if (tb[FRA_SRC] == NULL ||
- frh->src_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_SRC]) != ops->addr_size)
- goto errout;
-
- if (frh->dst_len)
- if (tb[FRA_DST] == NULL ||
- frh->dst_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_DST]) != ops->addr_size)
- goto errout;
-
- err = 0;
-errout:
- return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
- struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
{
struct fib_rule *r;
list_for_each_entry(r, &ops->rules_list, list) {
- if (r->action != rule->action)
+ if (rule->action && r->action != rule->action)
continue;
- if (r->table != rule->table)
+ if (rule->table && r->table != rule->table)
continue;
- if (r->pref != rule->pref)
+ if (user_priority && r->pref != rule->pref)
continue;
- if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
continue;
- if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
continue;
- if (r->mark != rule->mark)
+ if (rule->mark && r->mark != rule->mark)
continue;
- if (r->mark_mask != rule->mark_mask)
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
continue;
- if (r->tun_id != rule->tun_id)
+ if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
if (r->fr_net != rule->fr_net)
continue;
- if (r->l3mdev != rule->l3mdev)
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
- if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
- !uid_eq(r->uid_range.end, rule->uid_range.end))
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
continue;
- if (r->ip_proto != rule->ip_proto)
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
continue;
- if (!fib_rule_port_range_compare(&r->sport_range,
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
&rule->sport_range))
continue;
- if (!fib_rule_port_range_compare(&r->dport_range,
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
if (!ops->compare(r, frh, tb))
continue;
- return 1;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
}
+
return 0;
}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL, unresolved = 0;
-
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
- goto errout;
+ struct fib_rule *nlrule = NULL;
+ int err = -EINVAL;
- ops = lookup_rules_ops(net, frh->family);
- if (ops == NULL) {
- err = -EAFNOSUPPORT;
- goto errout;
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
- goto errout;
-
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
- goto errout;
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
- if (rule == NULL) {
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (!nlrule) {
err = -ENOMEM;
goto errout;
}
- refcount_set(&rule->refcnt, 1);
- rule->fr_net = net;
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
- rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
- : fib_default_rule_pref(ops);
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ } else {
+ nlrule->pref = fib_default_rule_pref(ops);
+ }
- rule->proto = tb[FRA_PROTOCOL] ?
+ nlrule->proto = tb[FRA_PROTOCOL] ?
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
- rule->iifindex = -1;
- nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->iifname);
+ nlrule->iifindex = -1;
+ nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
- rule->iifindex = dev->ifindex;
+ nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
struct net_device *dev;
- rule->oifindex = -1;
- nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->oifname);
+ nlrule->oifindex = -1;
+ nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
- rule->oifindex = dev->ifindex;
+ nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
- rule->mark = nla_get_u32(tb[FRA_FWMARK]);
- if (rule->mark)
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
- rule->mark_mask = 0xFFFFFFFF;
+ nlrule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
- rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
if (tb[FRA_TUN_ID])
- rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
err = -EINVAL;
- if (tb[FRA_L3MDEV]) {
-#ifdef CONFIG_NET_L3_MASTER_DEV
- rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
- if (rule->l3mdev != 1)
-#endif
- goto errout_free;
- }
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
- rule->action = frh->action;
- rule->flags = frh->flags;
- rule->table = frh_get_table(frh, tb);
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
if (tb[FRA_SUPPRESS_PREFIXLEN])
- rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
else
- rule->suppress_prefixlen = -1;
+ nlrule->suppress_prefixlen = -1;
if (tb[FRA_SUPPRESS_IFGROUP])
- rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
else
- rule->suppress_ifgroup = -1;
+ nlrule->suppress_ifgroup = -1;
if (tb[FRA_GOTO]) {
- if (rule->action != FR_ACT_GOTO)
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
goto errout_free;
+ }
- rule->target = nla_get_u32(tb[FRA_GOTO]);
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
/* Backward jumps are prohibited to avoid endless loops */
- if (rule->target <= rule->pref)
+ if (nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
goto errout_free;
-
- list_for_each_entry(r, &ops->rules_list, list) {
- if (r->pref == rule->target) {
- RCU_INIT_POINTER(rule->ctarget, r);
- break;
- }
}
-
- if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
- unresolved = 1;
- } else if (rule->action == FR_ACT_GOTO)
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
+ }
- if (rule->l3mdev && rule->table)
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
goto errout_free;
+ }
if (tb[FRA_UID_RANGE]) {
if (current_user_ns() != net->user_ns) {
err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
goto errout_free;
}
- rule->uid_range = nla_get_kuid_range(tb);
+ nlrule->uid_range = nla_get_kuid_range(tb);
- if (!uid_range_set(&rule->uid_range) ||
- !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
goto errout_free;
+ }
} else {
- rule->uid_range = fib_kuid_range_unset;
+ nlrule->uid_range = fib_kuid_range_unset;
}
if (tb[FRA_IP_PROTO])
- rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
if (tb[FRA_SPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &rule->sport_range);
- if (err)
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
+ }
}
if (tb[FRA_DPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &rule->dport_range);
- if (err)
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
+ }
}
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ int err = -EINVAL, unresolved = 0;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
- rule_exists(ops, frh, tb, rule)) {
+ rule_find(ops, frh, tb, rule, user_priority)) {
err = -EEXIST;
goto errout_free;
}
- err = ops->configure(rule, skb, frh, tb);
+ err = ops->configure(rule, skb, frh, tb, extack);
if (err < 0)
goto errout_free;
@@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
@@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rule_port_range sprange = {0, 0};
- struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r;
+ struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- struct fib_kuid_range range;
int err = -EINVAL;
+ bool user_priority = false;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
+ }
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
+ }
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
goto errout;
- if (tb[FRA_UID_RANGE]) {
- range = nla_get_kuid_range(tb);
- if (!uid_range_set(&range)) {
- err = -EINVAL;
- goto errout;
- }
- } else {
- range = fib_kuid_range_unset;
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout;
}
- if (tb[FRA_SPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &sprange);
- if (err)
- goto errout;
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
}
- if (tb[FRA_DPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &dprange);
+ if (ops->delete) {
+ err = ops->delete(rule);
if (err)
goto errout;
}
- list_for_each_entry(rule, &ops->rules_list, list) {
- if (tb[FRA_PROTOCOL] &&
- (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
- continue;
-
- if (frh->action && (frh->action != rule->action))
- continue;
-
- if (frh_get_table(frh, tb) &&
- (frh_get_table(frh, tb) != rule->table))
- continue;
-
- if (tb[FRA_PRIORITY] &&
- (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
- continue;
-
- if (tb[FRA_IIFNAME] &&
- nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
- continue;
-
- if (tb[FRA_OIFNAME] &&
- nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
- continue;
-
- if (tb[FRA_FWMARK] &&
- (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
- continue;
-
- if (tb[FRA_FWMASK] &&
- (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
- continue;
-
- if (tb[FRA_TUN_ID] &&
- (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
- continue;
-
- if (tb[FRA_L3MDEV] &&
- (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
- continue;
-
- if (uid_range_set(&range) &&
- (!uid_eq(rule->uid_range.start, range.start) ||
- !uid_eq(rule->uid_range.end, range.end)))
- continue;
-
- if (tb[FRA_IP_PROTO] &&
- (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
- continue;
-
- if (fib_rule_port_range_set(&sprange) &&
- !fib_rule_port_range_compare(&rule->sport_range, &sprange))
- continue;
-
- if (fib_rule_port_range_set(&dprange) &&
- !fib_rule_port_range_compare(&rule->dport_range, &dprange))
- continue;
-
- if (!ops->compare(rule, frh, tb))
- continue;
-
- if (rule->flags & FIB_RULE_PERMANENT) {
- err = -EPERM;
- goto errout;
- }
-
- if (ops->delete) {
- err = ops->delete(rule);
- if (err)
- goto errout;
- }
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
- if (rule->tun_id)
- ip_tunnel_unneed_metadata();
+ list_del_rcu(&rule->list);
- list_del_rcu(&rule->list);
-
- if (rule->action == FR_ACT_GOTO) {
- ops->nr_goto_rules--;
- if (rtnl_dereference(rule->ctarget) == NULL)
- ops->unresolved_rules--;
- }
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
- /*
- * Check if this rule is a target to any of them. If so,
- * adjust to the next one with the same preference or
- * disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules, except
- * current if it is goto rule, have actually been added.
- */
- if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
-
- n = list_next_entry(rule, list);
- if (&n->list == &ops->rules_list || n->pref != rule->pref)
- n = NULL;
- list_for_each_entry(r, &ops->rules_list, list) {
- if (rtnl_dereference(r->ctarget) != rule)
- continue;
- rcu_assign_pointer(r->ctarget, n);
- if (!n)
- ops->unresolved_rules++;
- }
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
}
-
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
- fib_rule_put(rule);
- flush_route_cache(ops);
- rules_ops_put(ops);
- return 0;
}
- err = -ENOENT;
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+ NULL);
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
errout:
+ if (nlrule)
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 201ff36b17a8..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,7 +57,17 @@
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
+#include <net/xfrm.h>
#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
return skb_get_poff(skb);
}
-BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_0(__get_raw_cpu_id)
+/* Load one byte of packet data at @offset.
+ *
+ * For a non-negative @offset the byte is read directly from @data when it
+ * lies within the first @headlen bytes, otherwise it is copied out with
+ * skb_copy_bits(). A negative @offset is resolved through
+ * bpf_internal_load_pointer_neg_helper(). Returns the byte, or -EFAULT if
+ * none of these paths produced a value.
+ */
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u8 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return *(u8 *)ptr;
+ }
+
+ return -EFAULT;
+}
+
+/* Byte load that takes the data pointer and head length from the skb itself:
+ * skb->data and skb->len - skb->data_len are passed on to
+ * ____bpf_skb_load_helper_8().
+ */
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+/* Load a 16-bit big-endian value of packet data at @offset and return it in
+ * CPU byte order.
+ *
+ * For a non-negative @offset the value is read directly from @data (as an
+ * unaligned access) when both bytes lie within the first @headlen bytes,
+ * otherwise it is copied out with skb_copy_bits(). A negative @offset is
+ * resolved through bpf_internal_load_pointer_neg_helper(). Returns the
+ * value, or -EFAULT if none of these paths produced one.
+ */
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u16 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be16(ptr);
+ }
+
+ return -EFAULT;
+}
+
+/* 16-bit load that takes the data pointer and head length from the skb
+ * itself: skb->data and skb->len - skb->data_len are passed on to
+ * ____bpf_skb_load_helper_16().
+ */
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+/* Load a 32-bit big-endian value of packet data at @offset and return it in
+ * CPU byte order.
+ *
+ * For a non-negative @offset the value is read directly from @data (as an
+ * unaligned access) when all four bytes lie within the first @headlen bytes,
+ * otherwise it is copied out with skb_copy_bits(). A negative @offset is
+ * resolved through bpf_internal_load_pointer_neg_helper(). Returns the
+ * value, or -EFAULT if none of these paths produced one.
+ */
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u32 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (likely(offset >= 0)) {
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be32(ptr);
+ }
+
+ return -EFAULT;
+}
+
+/* 32-bit load that takes the data pointer and head length from the skb
+ * itself: skb->data and skb->len - skb->data_len are passed on to
+ * ____bpf_skb_load_helper_32().
+ */
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_0(bpf_get_raw_cpu_id)
{
return raw_smp_processor_id();
}
static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = __get_raw_cpu_id,
+ .func = bpf_get_raw_cpu_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
/* Emit call(arg1=CTX, arg2=A, arg3=X) */
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
- *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
break;
case SKF_AD_OFF + SKF_AD_NLATTR:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
break;
case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
break;
case SKF_AD_OFF + SKF_AD_CPU:
- *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
break;
case SKF_AD_OFF + SKF_AD_RANDOM:
*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
@@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
return true;
}
+/* Emit the eBPF sequence that replaces a classic BPF LD|ABS / LD|IND load.
+ *
+ * @fp:    classic instruction being converted; its size bits select the
+ *         load width and its mode bits select whether X is added to K.
+ * @insnp: in: first free output slot; out: the LAST instruction written,
+ *         so the caller has to step past it itself.
+ *
+ * For absolute loads with a non-negative constant offset a fast path is
+ * emitted that reads directly through BPF_REG_D when at least @size bytes
+ * remain in BPF_REG_H past the offset. On architectures without efficient
+ * unaligned access the fast path is only emitted when the resulting access
+ * is naturally aligned. Everything else calls the matching
+ * bpf_skb_load_helper_{8,16,32}(); a negative helper result makes the
+ * program clear A and exit.
+ *
+ * Returns false, without updating @insnp, for unsupported load sizes.
+ */
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+	const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+	int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+	bool endian = BPF_SIZE(fp->code) == BPF_H ||
+		      BPF_SIZE(fp->code) == BPF_W;
+	bool indirect = BPF_MODE(fp->code) == BPF_IND;
+	const int ip_align = NET_IP_ALIGN;
+	struct bpf_insn *insn = *insnp;
+	int offset = fp->k;
+
+	/* The alignment test must apply to the whole sum; without the
+	 * parentheses '%' binds tighter than '+' and only ip_align is
+	 * reduced modulo size.
+	 */
+	if (!indirect &&
+	    ((unaligned_ok && offset >= 0) ||
+	     (!unaligned_ok && offset >= 0 &&
+	      offset + ip_align >= 0 &&
+	      (offset + ip_align) % size == 0))) {
+		/* The LDX displacement is a signed 16-bit field; larger
+		 * offsets have to be added to a scratch pointer instead.
+		 */
+		bool ldx_off_ok = offset <= S16_MAX;
+
+		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+		*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+		/* Not enough bytes left: skip the rest of the fast path. */
+		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+				      size, 2 + endian + (!ldx_off_ok * 2));
+		if (ldx_off_ok) {
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+					      BPF_REG_D, offset);
+		} else {
+			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+					      BPF_REG_TMP, 0);
+		}
+		if (endian)
+			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+		/* Jump over the 8-instruction helper call sequence below. */
+		*insn++ = BPF_JMP_A(8);
+	}
+
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+	if (!indirect) {
+		*insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+	} else {
+		*insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+		if (fp->k)
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+	}
+
+	switch (BPF_SIZE(fp->code)) {
+	case BPF_B:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+		break;
+	case BPF_H:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+		break;
+	case BPF_W:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+		break;
+	default:
+		return false;
+	}
+
+	/* A negative helper result means the load failed: return 0. */
+	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+	*insn = BPF_EXIT_INSN();
+
+	*insnp = insn;
+	return true;
+}
+
/**
* bpf_convert_filter - convert filter program
* @prog: the user passed filter program
* @len: the length of the user passed filter program
* @new_prog: allocated 'struct bpf_prog' or NULL
* @new_len: pointer to store length of converted program
+ * @seen_ld_abs: bool whether we've seen ld_abs/ind
*
* Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
* style extended BPF (eBPF).
* Conversion workflow:
*
* 1) First pass for calculating the new program length:
- * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
*
* 2) 2nd pass to remap in two passes: 1st pass finds new
* jump offsets, 2nd pass remapping:
- * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_prog *new_prog, int *new_len)
+ struct bpf_prog *new_prog, int *new_len,
+ bool *seen_ld_abs)
{
int new_flen = 0, pass = 0, target, i, stack_off;
struct bpf_insn *new_insn, *first_insn = NULL;
@@ -410,12 +562,27 @@ do_pass:
* do this ourself. Initial CTX is present in BPF_REG_ARG1.
*/
*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ if (*seen_ld_abs) {
+ /* For packet access in classic BPF, cache skb->data
+ * in callee-saved BPF R8 and skb->len - skb->data_len
+ * (headlen) in BPF R9. Since classic BPF is read-only
+ * on CTX, we only need to cache it once.
+ */
+ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ BPF_REG_D, BPF_REG_CTX,
+ offsetof(struct sk_buff, data));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, data_len));
+ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+ }
} else {
new_insn += 3;
}
for (i = 0; i < len; fp++, i++) {
- struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn tmp_insns[32] = { };
struct bpf_insn *insn = tmp_insns;
if (addrs)
@@ -458,6 +625,11 @@ do_pass:
BPF_MODE(fp->code) == BPF_ABS &&
convert_bpf_extensions(fp, &insn))
break;
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ convert_bpf_ld_abs(fp, &insn)) {
+ *seen_ld_abs = true;
+ break;
+ }
if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -567,21 +739,31 @@ jmp_rest:
break;
/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
- case BPF_LDX | BPF_MSH | BPF_B:
- /* tmp = A */
- *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ case BPF_LDX | BPF_MSH | BPF_B: {
+ struct sock_filter tmp = {
+ .code = BPF_LD | BPF_ABS | BPF_B,
+ .k = fp->k,
+ };
+
+ *seen_ld_abs = true;
+
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = BPF_R0 = *(u8 *) (skb->data + K) */
- *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ convert_bpf_ld_abs(&tmp, &insn);
+ insn++;
/* A &= 0xf */
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
/* A <<= 2 */
*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* tmp = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
/* X = A */
*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = tmp */
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
-
+ }
/* RET_K is remaped into 2 insns. RET_A case doesn't need an
* extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
*/
@@ -663,6 +845,8 @@ jmp_rest:
if (!new_prog) {
/* Only calculating new length. */
*new_len = new_insn - first_insn;
+ if (*seen_ld_abs)
+ *new_len += 4; /* Prologue bits. */
return 0;
}
@@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
struct sock_filter *old_prog;
struct bpf_prog *old_fp;
int err, new_len, old_len = fp->len;
+ bool seen_ld_abs = false;
/* We are free to overwrite insns et al right here as it
* won't be used at this point in time anymore internally
@@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
}
/* 1st pass: calculate the new program length. */
- err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+ &seen_ld_abs);
if (err)
goto out_err_free;
@@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
fp->len = new_len;
/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
- err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+ &seen_ld_abs);
if (err)
/* 2nd bpf_convert_filter() can fail only if it fails
* to allocate memory, remapping must succeed. Note,
@@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+/* Copy @len bytes, starting @offset bytes past the header selected by
+ * @start_header (BPF_HDR_START_MAC or BPF_HDR_START_NET), into @to.
+ *
+ * The source range must lie entirely between the MAC header and the tail
+ * pointer of @skb and @offset must not exceed 0xffff.
+ *
+ * Returns 0 on success. On any failure, including an unknown
+ * @start_header, @to is zero-filled for @len bytes and -EFAULT is returned.
+ */
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+	   u32, offset, void *, to, u32, len, u32, start_header)
+{
+	u8 *end = skb_tail_pointer(skb);
+	u8 *mac = skb_mac_header(skb);
+	u8 *ptr;
+
+	/* Bound @len by the same window that is validated below. Using
+	 * skb_headlen() here would measure from skb->data and could reject
+	 * reads that fit between the MAC header and the tail pointer.
+	 */
+	if (unlikely(offset > 0xffff || len > (end - mac)))
+		goto err_clear;
+
+	switch (start_header) {
+	case BPF_HDR_START_MAC:
+		ptr = mac + offset;
+		break;
+	case BPF_HDR_START_NET:
+		ptr = skb_network_header(skb) + offset;
+		break;
+	default:
+		goto err_clear;
+	}
+
+	if (likely(ptr >= mac && ptr + len <= end)) {
+		memcpy(to, ptr, len);
+		return 0;
+	}
+
+err_clear:
+	/* Never leave the caller's buffer uninitialised. */
+	memset(to, 0, len);
+	return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { /* type description for bpf_skb_load_bytes_relative() */
+ .func = bpf_skb_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* offset */
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM, /* to */
+ .arg4_type = ARG_CONST_SIZE, /* len */
+ .arg5_type = ARG_ANYTHING, /* start_header */
+};
+
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
/* Idea is the following: should the needed direct read/write
@@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
+/* Record a redirect target for @skb, looked up in @map by @key via
+ * __sock_hash_lookup_elem().
+ *
+ * Only BPF_F_INGRESS is accepted in @flags. Returns SK_PASS when a target
+ * was found and stored in the skb's control block, SK_DROP otherwise.
+ */
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
+
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	cb->bpf.flags = flags;
+	cb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+
+	return cb->bpf.sk_redir ? SK_PASS : SK_DROP;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { /* type description for bpf_sk_redirect_hash() */
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_CONST_MAP_PTR, /* map */
+ .arg3_type = ARG_PTR_TO_MAP_KEY, /* key */
+ .arg4_type = ARG_ANYTHING, /* flags */
+};
+
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
@@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.key = key;
tcb->bpf.flags = flags;
- tcb->bpf.map = map;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
return SK_PASS;
}
@@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- struct sock *sk = NULL;
-
- if (tcb->bpf.map) {
- sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
- tcb->bpf.key = 0;
- tcb->bpf.map = NULL;
- }
-
- return sk;
+ return tcb->bpf.sk_redir;
}
static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
{
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->key = key;
msg->flags = flags;
- msg->map = map;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
return SK_PASS;
}
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
{
- struct sock *sk = NULL;
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
- if (msg->map) {
- sk = __sock_map_lookup_elem(msg->map, msg->key);
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
- msg->key = 0;
- msg->map = NULL;
- }
+ return SK_PASS;
+}
- return sk;
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+ return msg->sk_redir;
}
static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
@@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.func = bpf_skb_vlan_push,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
@@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.func = bpf_skb_vlan_pop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
@@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
- void *data_start = xdp->data_hard_start + metalen;
+ void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
if (unlikely(data < data_start ||
@@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+/* Move the end of the XDP packet data by @offset bytes.
+ *
+ * @offset must be negative, and the packet must keep at least ETH_HLEN
+ * bytes after the adjustment. Returns 0 on success, -EINVAL otherwise.
+ */
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+	void *new_end;
+
+	/* only shrinking is allowed for now. */
+	if (unlikely(offset >= 0))
+		return -EINVAL;
+
+	new_end = xdp->data_end + offset;
+	if (unlikely(new_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data_end = new_end;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { /* type description for bpf_xdp_adjust_tail() */
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* xdp */
+ .arg2_type = ARG_ANYTHING, /* offset */
+};
+
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
- if (unlikely(meta < xdp->data_hard_start ||
+ if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
@@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev,
struct xdp_buff *xdp,
u32 index)
{
- int err;
+ struct xdp_frame *xdpf;
+ int sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
- if (err)
- return err;
- dev->netdev_ops->ndo_xdp_flush(dev);
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
+ if (sent <= 0)
+ return sent;
return 0;
}
@@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
{
int err;
- if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
- struct net_device *dev = fwd;
-
- if (!dev->netdev_ops->ndo_xdp_xmit)
- return -EOPNOTSUPP;
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP: {
+ struct bpf_dtab_netdev *dst = fwd;
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+ err = dev_map_enqueue(dst, xdp, dev_rx);
if (err)
return err;
__dev_map_insert_ctx(map, index);
-
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ break;
+ }
+ case BPF_MAP_TYPE_CPUMAP: {
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (err)
return err;
__cpu_map_insert_ctx(map, index);
+ break;
+ }
+ case BPF_MAP_TYPE_XSKMAP: {
+ struct xdp_sock *xs = fwd;
+
+ err = __xsk_map_redirect(map, xdp, xs);
+ return err;
+ }
+ default:
+ break;
}
return 0;
}
@@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void)
case BPF_MAP_TYPE_CPUMAP:
__cpu_map_flush(map);
break;
+ case BPF_MAP_TYPE_XSKMAP:
+ __xsk_map_flush(map);
+ break;
default:
break;
}
@@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
return __dev_map_lookup_elem(map, index);
case BPF_MAP_TYPE_CPUMAP:
return __cpu_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_XSKMAP:
+ return __xsk_map_lookup_elem(map, index);
default:
return NULL;
}
@@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned long map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
- struct net_device *fwd = NULL;
u32 index = ri->ifindex;
+ void *fwd = NULL;
int err = 0;
ri->ifindex = 0;
@@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
goto err;
skb->dev = fwd;
+ generic_xdp_tx(skb, xdp_prog);
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ struct xdp_sock *xs = fwd;
+
+ err = xsk_generic_rcv(xs, xdp);
+ if (err)
+ goto err;
+ consume_skb(skb);
} else {
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
@@ -2965,7 +3281,7 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
u32 index = ri->ifindex;
@@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
int err = 0;
if (ri->map)
- return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
ri->ifindex = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
skb->dev = fwd;
_trace_xdp_redirect(dev, xdp_prog, index);
+ generic_xdp_tx(skb, xdp_prog);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
@@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data)
- return true;
-
- return false;
-}
-
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
unsigned long off, unsigned long len)
{
@@ -3148,6 +3444,7 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3155,6 +3452,8 @@ set_compat:
to->tunnel_label = be32_to_cpu(info->key.label);
} else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->tunnel_label = 0;
}
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_SOCK_CGROUP_DATA
+/* Return the id of the cgroup recorded in the socket that owns @skb, or 0
+ * when the skb has no socket or the socket is not a full socket.
+ */
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+	struct sock *owner = skb_to_full_sk(skb);
+	struct cgroup *cg;
+
+	if (owner && sk_fullsock(owner)) {
+		cg = sock_cgroup_ptr(&owner->sk_cgrp_data);
+		return cg->kn->id.id;
+	}
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { /* type description for bpf_skb_cgroup_id() */
+ .func = bpf_skb_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+};
+#endif
+
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
unsigned long off, unsigned long len)
{
@@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = {
.arg3_type = ARG_CONST_SIZE,
};
+#ifdef CONFIG_XFRM
+/* Copy selected fields of the XFRM state at position @index of the skb's
+ * security path into @to.
+ *
+ * @size must equal sizeof(struct bpf_xfrm_state) and @flags must be zero.
+ * Returns 0 on success. On failure @to is zero-filled for @size bytes and
+ * -EINVAL is returned.
+ */
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+	const struct sec_path *path = skb_sec_path(skb);
+	const struct xfrm_state *state;
+
+	if (!path || unlikely(index >= path->len || flags))
+		goto err_clear;
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	state = path->xvec[index];
+
+	to->reqid = state->props.reqid;
+	to->spi = state->id.spi;
+	to->family = state->props.family;
+	to->ext = 0;
+
+	if (to->family != AF_INET6) {
+		to->remote_ipv4 = state->props.saddr.a4;
+		/* Clear the remaining three words of the address field. */
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+	} else {
+		memcpy(to->remote_ipv6, state->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	}
+
+	return 0;
+err_clear:
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { /* type description for bpf_skb_get_xfrm_state() */
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* index */
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM, /* to */
+ .arg4_type = ARG_CONST_SIZE, /* size */
+ .arg5_type = ARG_ANYTHING, /* flags */
+};
+#endif
+
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+/* Fill @params with the link-layer data for a resolved next hop:
+ * destination MAC from @neigh, source MAC from @dev, VLAN fields cleared.
+ *
+ * Returns the ifindex of @dev.
+ */
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	params->h_vlan_proto = 0;
+	params->h_vlan_TCI = 0;
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+/* IPv4 leg of the FIB lookup helpers.
+ *
+ * Builds a flow from @params and looks up the route: directly in the
+ * device's table when BPF_FIB_LOOKUP_DIRECT is set in @flags, through
+ * fib_lookup() otherwise. If a neighbour entry exists for the next hop the
+ * forwarding fields of @params are filled in. params->ipv4_dst is rewritten
+ * to the gateway when the next hop has one, and params->rt_metric is set
+ * once a usable next hop device is found.
+ *
+ * @check_mtu: when true, give up if params->tot_len exceeds the route MTU.
+ *
+ * Returns the egress ifindex when forwarding data was filled in, 0 when
+ * the lookup did not produce a result this helper handles, and -ENODEV
+ * when params->ifindex does not name a device.
+ */
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags, bool check_mtu)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	struct neighbour *nbr;
+	struct fib_result res;
+	struct fib_nh *nexthop;
+	struct flowi4 flow;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	idev = __in_dev_get_rcu(dev);
+	if (unlikely(!idev || !IN_DEV_FORWARD(idev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		flow.flowi4_iif = 1;
+		flow.flowi4_oif = params->ifindex;
+	} else {
+		flow.flowi4_iif = params->ifindex;
+		flow.flowi4_oif = 0;
+	}
+	flow.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+	flow.flowi4_flags = 0;
+
+	flow.flowi4_proto = params->l4_protocol;
+	flow.daddr = params->ipv4_dst;
+	flow.saddr = params->ipv4_src;
+	flow.fl4_sport = params->sport;
+	flow.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &flow, &res, FIB_LOOKUP_NOREF);
+	} else {
+		flow.flowi4_mark = 0;
+		flow.flowi4_secid = 0;
+		flow.flowi4_tun_key.tun_id = 0;
+		flow.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &flow, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &flow, NULL);
+
+	if (check_mtu) {
+		u32 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+
+		if (params->tot_len > mtu)
+			return 0;
+	}
+
+	nexthop = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nexthop->nh_lwtstate)
+		return 0;
+
+	dev = nexthop->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nexthop->nh_gw)
+		params->ipv4_dst = nexthop->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	nbr = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (!nbr)
+		return 0;
+
+	return bpf_fib_set_fwd_params(params, nbr, dev);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 leg of the FIB lookup helpers.
+ *
+ * Builds a flow from @params and looks up the route through the ipv6_stub
+ * operations: directly in the device's table when BPF_FIB_LOOKUP_DIRECT is
+ * set in @flags, through fib6_lookup() otherwise. If a neighbour entry
+ * exists for the next hop the forwarding fields of @params are filled in.
+ * params->ipv6_dst is rewritten to the gateway for RTF_GATEWAY routes, and
+ * params->rt_metric is set once a usable route is found.
+ *
+ * @check_mtu: when true, give up if params->tot_len exceeds the route MTU.
+ *
+ * Returns the egress ifindex when forwarding data was filled in, 0 when
+ * the lookup did not produce a result this helper handles, and -ENODEV
+ * when params->ifindex does not name a device.
+ */
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags, bool check_mtu)
+{
+	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct neighbour *nbr;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(dst) || rt6_need_strict(src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	idev = __in6_dev_get_safely(dev);
+	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		fl6.flowi6_oif = params->ifindex;
+		oif = fl6.flowi6_oif;
+	} else {
+		fl6.flowi6_iif = params->ifindex;
+		oif = fl6.flowi6_iif;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowinfo;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = *dst;
+	fl6.saddr = *src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+		     f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (check_mtu) {
+		u32 mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+
+		if (params->tot_len > mtu)
+			return 0;
+	}
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	/* @dst aliases params->ipv6_dst, so this updates @params too. */
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		*dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	nbr = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				    ndisc_hashfn, dst, dev);
+	if (!nbr)
+		return 0;
+
+	return bpf_fib_set_fwd_params(params, nbr, dev);
+}
+#endif
+
+/* FIB lookup entry point for XDP programs.
+ *
+ * Rejects a @plen smaller than *@params and any @flags bit other than
+ * BPF_FIB_LOOKUP_DIRECT / BPF_FIB_LOOKUP_OUTPUT with -EINVAL, then hands
+ * over to the per-family lookup (MTU check enabled) in the network
+ * namespace of the receiving device. Families that are not compiled in or
+ * not recognised yield -EAFNOSUPPORT.
+ */
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params) ||
+	    (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags, true);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags, true);
+#endif
+	default:
+		break;
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { /* type description for bpf_xdp_fib_lookup() */
+ .func = bpf_xdp_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* ctx */
+ .arg2_type = ARG_PTR_TO_MEM, /* params */
+ .arg3_type = ARG_CONST_SIZE, /* plen */
+ .arg4_type = ARG_ANYTHING, /* flags */
+};
+
+/* FIB lookup entry point for skb-based programs.
+ *
+ * Rejects a @plen smaller than *@params and any @flags bit other than
+ * BPF_FIB_LOOKUP_DIRECT / BPF_FIB_LOOKUP_OUTPUT with -EINVAL, then runs the
+ * per-family lookup (without the MTU check) in the namespace of skb->dev.
+ * A positive result is kept only if is_skb_forwardable() accepts the
+ * resulting device for @skb; otherwise 0 is returned. Families that are
+ * not compiled in or not recognised yield -EAFNOSUPPORT.
+ */
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	struct net *net = dev_net(skb->dev);
+	int fwd_ifindex = -EAFNOSUPPORT;
+
+	if (plen < sizeof(*params) ||
+	    (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		fwd_ifindex = bpf_ipv4_fib_lookup(net, params, flags, false);
+		break;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		fwd_ifindex = bpf_ipv6_fib_lookup(net, params, flags, false);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	if (fwd_ifindex > 0 &&
+	    !is_skb_forwardable(dev_get_by_index_rcu(net, fwd_ifindex), skb))
+		fwd_ifindex = 0;
+
+	return fwd_ifindex;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { /* type description for bpf_skb_fib_lookup() */
+ .func = bpf_skb_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_PTR_TO_MEM, /* params */
+ .arg3_type = ARG_CONST_SIZE, /* plen */
+ .arg4_type = ARG_ANYTHING, /* flags */
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+/* Validate the segment routing header in @hdr (@len bytes) and apply it to
+ * @skb: through seg6_do_srh_inline() for BPF_LWT_ENCAP_SEG6_INLINE (only
+ * when skb->protocol is IPv6) or through seg6_do_srh_encap() for
+ * BPF_LWT_ENCAP_SEG6. Afterwards the IPv6 payload length and transport
+ * header offset are refreshed and the next hop is looked up.
+ *
+ * Returns -EINVAL for an invalid SRH or unknown @type, -EBADMSG for inline
+ * mode on a non-IPv6 skb, the error of the seg6 operation, or the result
+ * of seg6_lookup_nexthop().
+ */
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+	struct ipv6_sr_hdr *new_srh = (struct ipv6_sr_hdr *)hdr;
+	int err;
+
+	if (!seg6_validate_srh(new_srh, len))
+		return -EINVAL;
+
+	if (type == BPF_LWT_ENCAP_SEG6_INLINE) {
+		if (skb->protocol != htons(ETH_P_IPV6))
+			return -EBADMSG;
+
+		err = seg6_do_srh_inline(skb, new_srh);
+	} else if (type == BPF_LWT_ENCAP_SEG6) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+		err = seg6_do_srh_encap(skb, new_srh, IPPROTO_IPV6);
+	} else {
+		return -EINVAL;
+	}
+
+	/* Data pointers are recomputed whether or not the operation failed. */
+	bpf_compute_data_pointers(skb);
+	if (err)
+		return err;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+/* Push the encapsulation header @hdr (@len bytes) of kind @type onto @skb.
+ *
+ * The two seg6 kinds are forwarded to bpf_push_seg6_encap() when
+ * CONFIG_IPV6_SEG6_BPF is enabled; every other @type returns -EINVAL.
+ */
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+	   u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	if (type == BPF_LWT_ENCAP_SEG6 || type == BPF_LWT_ENCAP_SEG6_INLINE)
+		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = { /* type description for bpf_lwt_push_encap() */
+ .func = bpf_lwt_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* type */
+ .arg3_type = ARG_PTR_TO_MEM, /* hdr */
+ .arg4_type = ARG_CONST_SIZE /* len */
+};
+
+/* Overwrite @len bytes at @offset (relative to skb->data) inside the
+ * packet's routing header with the bytes at @from.
+ *
+ * A write is accepted when it lies entirely inside the range computed as
+ * [tlvs_start, srh_limit), in which case the per-CPU SRH state is also
+ * marked invalid, or entirely between the flags field and the segments
+ * array of the header. Anything else returns -EFAULT.
+ *
+ * Returns 0 on success, -EINVAL when no routing header is found, -EFAULT
+ * for a rejected range or when the skb cannot be made writable, and
+ * -EOPNOTSUPP when CONFIG_IPV6_SEG6_BPF is disabled.
+ */
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	void *tlvs_start, *srh_limit, *dst;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	bool in_tlvs;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	tlvs_start = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+	srh_limit = (void *)((char *)srh + sizeof(*srh) + state->hdrlen);
+	dst = skb->data + offset;
+
+	in_tlvs = dst >= tlvs_start && dst + len <= srh_limit;
+	if (in_tlvs)
+		state->valid = 0;
+	else if (dst < (void *)&srh->flags ||
+		 dst + len > (void *)&srh->segments)
+		return -EFAULT;
+
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
+		return -EFAULT;
+
+	/* Recompute the destination from skb->data rather than reusing
+	 * @dst, which was taken before the skb was made writable.
+	 */
+	memcpy(skb->data + offset, from, len);
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { /* type description for bpf_lwt_seg6_store_bytes() */
+ .func = bpf_lwt_seg6_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* offset */
+ .arg3_type = ARG_PTR_TO_MEM, /* from */
+ .arg4_type = ARG_CONST_SIZE /* len */
+};
+
+/* Apply the seg6local @action to @skb using @param (@param_len bytes).
+ *
+ * If the per-CPU SRH state is flagged invalid, the header length is first
+ * written back from that state and the header is revalidated; a length
+ * that is not a multiple of 8 or a failed validation returns -EBADMSG.
+ *
+ * END_X expects a struct in6_addr and END_T an int in @param; both end in
+ * seg6_lookup_nexthop(). END_B6 and END_B6_ENCAP push @param as a new SRH
+ * through bpf_push_seg6_encap() and, on success, record its length in the
+ * per-CPU state.
+ *
+ * Returns -EINVAL when no routing header is found, for an unknown @action
+ * or a wrong @param_len, and -EOPNOTSUPP when CONFIG_IPV6_SEG6_BPF is
+ * disabled; otherwise the result of the action.
+ */
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+	   u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	u32 encap;
+	int err;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	if (!state->valid) {
+		if (unlikely((state->hdrlen & 7) != 0))
+			return -EBADMSG;
+
+		srh->hdrlen = (u8)(state->hdrlen >> 3);
+		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+			return -EBADMSG;
+
+		state->valid = 1;
+	}
+
+	switch (action) {
+	case SEG6_LOCAL_ACTION_END_X:
+		if (param_len != sizeof(struct in6_addr))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+	case SEG6_LOCAL_ACTION_END_T:
+		if (param_len != sizeof(int))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+	case SEG6_LOCAL_ACTION_END_B6:
+	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+		/* Same handling for both; only the encap mode differs. */
+		if (action == SEG6_LOCAL_ACTION_END_B6)
+			encap = BPF_LWT_ENCAP_SEG6_INLINE;
+		else
+			encap = BPF_LWT_ENCAP_SEG6;
+
+		err = bpf_push_seg6_encap(skb, encap, param, param_len);
+		if (!err)
+			state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { /* type description for bpf_lwt_seg6_action() */
+ .func = bpf_lwt_seg6_action,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* action */
+ .arg3_type = ARG_PTR_TO_MEM, /* param */
+ .arg4_type = ARG_CONST_SIZE /* param_len */
+};
+
+/* Grow (@len > 0) or shrink (@len <= 0) the packet by |@len| bytes at
+ * @offset (relative to skb->data). @offset must fall inside the range
+ * computed below as [tlvs_start, srh_limit], and a shrink must not reach
+ * past srh_limit.
+ *
+ * On success the IPv6 payload length is refreshed, the per-CPU SRH length
+ * is adjusted by @len and the state is marked invalid.
+ *
+ * Returns 0 on success, -EINVAL when no routing header is found, -EFAULT
+ * for an out-of-range request, the error of the resize operation, or
+ * -EOPNOTSUPP when CONFIG_IPV6_SEG6_BPF is disabled.
+ */
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+	   s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_limit, *tlvs_start, *pos;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *ip6h;
+	int srhoff = 0;
+	int ret;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	tlvs_start = (void *)((unsigned char *)srh + sizeof(*srh) +
+			      ((srh->first_segment + 1) << 4));
+	srh_limit = (void *)((unsigned char *)srh + sizeof(*srh) +
+			     state->hdrlen);
+	pos = skb->data + offset;
+
+	if (unlikely(pos < tlvs_start || pos > srh_limit))
+		return -EFAULT;
+	if (unlikely(len < 0 && (void *)((char *)pos - len) > srh_limit))
+		return -EFAULT;
+
+	if (len > 0) {
+		ret = skb_cow_head(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		ret = bpf_skb_net_hdr_push(skb, offset, len);
+	} else {
+		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+	}
+
+	/* Data pointers are recomputed whether or not the resize failed. */
+	bpf_compute_data_pointers(skb);
+	if (unlikely(ret < 0))
+		return ret;
+
+	ip6h = (struct ipv6hdr *)skb->data;
+	ip6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	state->hdrlen += len;
+	state->valid = 0;
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { /* type description for bpf_lwt_seg6_adjust_srh() */
+ .func = bpf_lwt_seg6_adjust_srh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* offset */
+ .arg3_type = ARG_ANYTHING, /* len */
+};
+
+/* Report whether @func is one of the helpers listed below, i.e. the set
+ * this file treats as changing packet data.
+ */
+bool bpf_helper_changes_pkt_data(void *func)
+{
+	return func == bpf_skb_vlan_push ||
+	       func == bpf_skb_vlan_pop ||
+	       func == bpf_skb_store_bytes ||
+	       func == bpf_skb_change_proto ||
+	       func == bpf_skb_change_head ||
+	       func == bpf_skb_change_tail ||
+	       func == bpf_skb_adjust_room ||
+	       func == bpf_skb_pull_data ||
+	       func == bpf_clone_redirect ||
+	       func == bpf_l3_csum_replace ||
+	       func == bpf_l4_csum_replace ||
+	       func == bpf_xdp_adjust_head ||
+	       func == bpf_xdp_adjust_meta ||
+	       func == bpf_msg_pull_data ||
+	       func == bpf_xdp_adjust_tail ||
+	       func == bpf_lwt_push_encap ||
+	       func == bpf_lwt_seg6_store_bytes ||
+	       func == bpf_lwt_seg6_adjust_srh ||
+	       func == bpf_lwt_seg6_action;
+}
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_skb_pull_data:
return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
@@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
return &bpf_xdp_redirect_map_proto;
- default:
- return bpf_base_func_proto(func_id);
- }
-}
-
-static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_skb_load_bytes:
- return &bpf_skb_load_bytes_proto;
- case BPF_FUNC_skb_pull_data:
- return &bpf_skb_pull_data_proto;
- case BPF_FUNC_csum_diff:
- return &bpf_csum_diff_proto;
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_proto;
- case BPF_FUNC_get_route_realm:
- return &bpf_get_route_realm_proto;
- case BPF_FUNC_get_hash_recalc:
- return &bpf_get_hash_recalc_proto;
- case BPF_FUNC_perf_event_output:
- return &bpf_skb_event_output_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_skb_under_cgroup:
- return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_xdp_fib_lookup_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_msg_redirect_map:
return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
case BPF_FUNC_msg_apply_bytes:
return &bpf_msg_apply_bytes_proto;
case BPF_FUNC_msg_cork_bytes:
@@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_sk_redirect_map:
return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
default:
return bpf_base_func_proto(func_id);
}
}
static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ /* Map a helper id to its prototype. Ids not listed here are resolved
+ * by bpf_base_func_proto(); @prog is not consulted.
+ */
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+/* Resolve a helper id to its prototype: BPF_FUNC_lwt_push_encap is handled
+ * here, every other id is delegated to lwt_out_func_proto().
+ */
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	if (func_id == BPF_FUNC_lwt_push_encap)
+		return &bpf_lwt_push_encap_proto;
+
+	return lwt_out_func_proto(func_id, prog);
+}
+
+static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
default:
- return lwt_inout_func_proto(func_id, prog);
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ /* The three lwt_seg6_* helpers are resolved here; any other id is
+ * delegated to lwt_out_func_proto().
+ */
+ switch (func_id) {
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ return &bpf_lwt_seg6_store_bytes_proto;
+ case BPF_FUNC_lwt_seg6_action:
+ return &bpf_lwt_seg6_action_proto;
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ return &bpf_lwt_seg6_adjust_srh_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
}
}
@@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
-
/* Attach type specific accesses */
static bool __sock_filter_check_attach_type(int off,
enum bpf_access_type access_type,
@@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
return insn - insn_buf;
}
+/* Emit into @insn_buf the instruction sequence that replaces the load
+ * instruction @orig (treated as indirect when its mode is BPF_IND) and
+ * return the number of instructions written.
+ */
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+ struct bpf_insn *insn_buf)
+{
+ bool indirect = BPF_MODE(orig->code) == BPF_IND;
+ struct bpf_insn *insn = insn_buf;
+
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+ if (!indirect) {
+ /* R2 = immediate taken from the original instruction */
+ *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+ } else {
+ /* R2 = source register, plus the immediate when it is non-zero */
+ *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+ if (orig->imm)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+ }
+
+ /* Call the load helper matching the access size; no call is emitted
+ * for any other size.
+ */
+ switch (BPF_SIZE(orig->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+ break;
+ }
+
+ /* If R0 >= 0 (signed) skip the next two instructions; otherwise
+ * zero R0 and exit.
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
@@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (type == BPF_WRITE)
+ if (type == BPF_WRITE) {
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ switch (off) {
+ case offsetof(struct xdp_md, rx_queue_index):
+ return __is_valid_xdp_access(off, size);
+ }
+ }
return false;
+ }
switch (off) {
case offsetof(struct xdp_md, data):
@@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
break;
default:
return false;
@@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP6_SENDMSG:
break;
default:
return false;
@@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
/* Only narrow read access allowed for now. */
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size,
switch (off) {
case offsetof(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
+ if (size != sizeof(__u64))
+ return false;
break;
case offsetof(struct sk_msg_md, data_end):
info->reg_type = PTR_TO_PACKET_END;
+ if (size != sizeof(__u64))
+ return false;
break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
}
if (off < 0 || off >= sizeof(struct sk_msg_md))
return false;
if (off % size != 0)
return false;
- if (size != sizeof(__u64))
- return false;
return true;
}
@@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
SK_FL_PROTO_SHIFT);
break;
+
+ case offsetof(struct bpf_sock_addr, msg_src_ip4):
+ /* Treat t_ctx as struct in_addr for msg_src_ip4. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+ s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+ /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+ s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+ break;
}
return insn - insn_buf;
@@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
+ int off;
+#endif
switch (si->off) {
case offsetof(struct sk_msg_md, data):
@@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, data_end));
break;
+ case offsetof(struct sk_msg_md, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip6[0]) ...
+ offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
}
return insn - insn_buf;
@@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.is_valid_access = tc_cls_act_is_valid_access,
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
-const struct bpf_verifier_ops lwt_inout_verifier_ops = {
- .get_func_proto = lwt_inout_func_proto,
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+ .get_func_proto = lwt_in_func_proto,
.is_valid_access = lwt_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
-const struct bpf_prog_ops lwt_inout_prog_ops = {
+const struct bpf_prog_ops lwt_in_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+ .get_func_proto = lwt_out_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_out_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
@@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
+/* Verifier callbacks: helper lookup through lwt_seg6local_func_proto(),
+ * context access checks through lwt_is_valid_access(), and context access
+ * rewriting through bpf_convert_ctx_access().
+ */
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+ .get_func_proto = lwt_seg6local_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+/* Program ops: test runs go through bpf_prog_test_run_skb(). */
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
const struct bpf_verifier_ops cg_sock_verifier_ops = {
.get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..53f96e4f7bf5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
EXPORT_SYMBOL(skb_get_hash_perturb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
- const struct flow_keys *keys, int hlen)
+ const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
*/
u32 skb_get_poff(const struct sk_buff *skb)
{
- struct flow_keys keys;
+ struct flow_keys_basic keys;
- if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
+ if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
keys->ports.src = fl6->fl6_sport;
keys->ports.dst = fl6->fl6_dport;
keys->keyid.keyid = fl6->fl6_gre_key;
- keys->tags.flow_label = (__force u32)fl6->flowlabel;
+ keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
keys->basic.ip_proto = fl6->flowi6_proto;
return flow_hash_from_keys(keys);
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
},
};
-static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
{
.key_id = FLOW_DISSECTOR_KEY_CONTROL,
.offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);
-struct flow_dissector flow_keys_buf_dissector __read_mostly;
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
static int __init init_default_flow_dissectors(void)
{
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
skb_flow_dissector_init(&flow_keys_dissector_symmetric,
flow_keys_dissector_symmetric_keys,
ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
- skb_flow_dissector_init(&flow_keys_buf_dissector,
- flow_keys_buf_dissector_keys,
- ARRAY_SIZE(flow_keys_buf_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_basic_dissector,
+ flow_keys_basic_dissector_keys,
+ ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1fb43bff417d..a7a9c3d738ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
write_lock(&n->lock);
state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
}
@@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (neigh->dead)
goto out;
+ neigh_update_ext_learned(neigh, flags, &notify);
+
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
@@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
+ if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
#include <trace/events/tcp.h>
#include <trace/events/fib.h>
#include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
#if IS_ENABLED(CONFIG_BRIDGE)
#include <trace/events/bridge.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000000..68bf07206744
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+/* Copy @params into @pool, validate them and set up the recycle ring.
+ *
+ * Returns 0 on success, -EINVAL for unknown flags or an unsupported DMA
+ * direction, -E2BIG when the requested ring is larger than 32768 entries,
+ * and -ENOMEM when the ring cannot be initialised.
+ */
+static int page_pool_init(struct page_pool *pool,
+			  const struct page_pool_params *params)
+{
+	unsigned int qsize;
+
+	memcpy(&pool->p, params, sizeof(pool->p));
+
+	/* Reject any flag bit outside PP_FLAG_ALL */
+	if (pool->p.flags & ~(PP_FLAG_ALL))
+		return -EINVAL;
+
+	/* A pool_size of zero selects the default ring size of 1024 */
+	qsize = pool->p.pool_size ? pool->p.pool_size : 1024;
+
+	/* Sanity limit on the memory that can be pinned down */
+	if (qsize > 32768)
+		return -E2BIG;
+
+	/* Only DMA_FROM_DEVICE and DMA_BIDIRECTIONAL are accepted;
+	 * DMA_BIDIRECTIONAL allows the page to be used for DMA sending,
+	 * which is the XDP_TX use-case.
+	 */
+	switch (pool->p.dma_dir) {
+	case DMA_FROM_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ptr_ring_init(&pool->ring, qsize, GFP_KERNEL) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Allocate a zeroed page_pool on node params->nid and initialise it from
+ * @params. Returns the pool, or an ERR_PTR() carrying -ENOMEM or the error
+ * reported by page_pool_init().
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+	struct page_pool *pool;
+	int err;
+
+	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	err = page_pool_init(pool, params);
+	if (err >= 0)
+		return pool;
+
+	/* Initialisation failed: report it and release the pool again */
+	pr_warn("%s() gave up with errno %d\n", __func__, err);
+	kfree(pool);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path
+ *
+ * Hand out a recycled page without going to the page allocator.
+ *
+ * In softirq context the per-pool alloc cache is tried first. Only when it
+ * is empty is the ring consulted: one page is consumed for the caller and
+ * up to PP_ALLOC_CACHE_REFILL further pages are moved into the alloc cache.
+ * Outside softirq context a single page is consumed from the ring and the
+ * alloc cache is left untouched.
+ *
+ * Returns a page, or NULL when neither the alloc cache (softirq only) nor
+ * the ring holds one.
+ */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+	struct ptr_ring *r = &pool->ring;
+	bool refill = false;
+	struct page *page;
+
+	/* Test for safe-context, caller should provide this guarantee */
+	if (likely(in_serving_softirq())) {
+		if (likely(pool->alloc.count)) {
+			/* Fast-path: must be tried before looking at the
+			 * ring, otherwise an empty ring would hide pages
+			 * that are still sitting in the alloc cache.
+			 */
+			return pool->alloc.cache[--pool->alloc.count];
+		}
+		/* Alloc array empty, time to refill from the ring */
+		refill = true;
+	}
+
+	/* Quicker fallback, avoid locks when ring is empty */
+	if (__ptr_ring_empty(r))
+		return NULL;
+
+	/* Discussion: the ring consumer lock is not really needed due to
+	 * the softirq/NAPI protection, but later need the ability to
+	 * reclaim pages on the ring. Thus, keeping the locks.
+	 */
+	spin_lock(&r->consumer_lock);
+
+	/* Take the caller's page first, so that a ring holding fewer than
+	 * PP_ALLOC_CACHE_REFILL pages still yields a page instead of NULL.
+	 */
+	page = __ptr_ring_consume(r);
+	if (refill) {
+		struct page *extra;
+
+		while (pool->alloc.count < PP_ALLOC_CACHE_REFILL &&
+		       (extra = __ptr_ring_consume(r)))
+			pool->alloc.cache[pool->alloc.count++] = extra;
+	}
+
+	spin_unlock(&r->consumer_lock);
+	return page;
+}
+
+/* slow path
+ *
+ * Allocate a fresh page of order pool->p.order on node pool->p.nid and,
+ * when PP_FLAG_DMA_MAP is set, DMA-map it and remember the address in
+ * page->private. Returns NULL when the allocation or the mapping fails.
+ */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+						 gfp_t _gfp)
+{
+	/* We could always set __GFP_COMP, and avoid this conditional, as
+	 * prep_new_page() can handle order-0 with __GFP_COMP.
+	 */
+	gfp_t gfp = pool->p.order ? (_gfp | __GFP_COMP) : _gfp;
+	struct page *page;
+	dma_addr_t dma;
+
+	/* FUTURE development:
+	 *
+	 * Current slow-path essentially falls back to single page
+	 * allocations, which doesn't improve performance. This code
+	 * needs bulk allocation support from the page allocator code.
+	 */
+
+	/* Cache was empty, do real allocation */
+	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+	if (!page)
+		return NULL;
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP) {
+		/* Setup DMA mapping: use page->private for DMA-addr.
+		 * This mapping is kept for lifetime of page, until
+		 * leaving pool.
+		 */
+		dma = dma_map_page(pool->p.dev, page, 0,
+				   (PAGE_SIZE << pool->p.order),
+				   pool->p.dma_dir);
+		if (dma_mapping_error(pool->p.dev, dma)) {
+			put_page(page);
+			return NULL;
+		}
+		set_page_private(page, dma); /* page->private = dma; */
+	}
+
+	/* When page is just alloc'ed it should/must have refcnt 1. */
+	return page;
+}
+
+/* Replacement for alloc_pages() API calls when using page_pool, providing
+ * a synchronization guarantee for the allocation side.
+ *
+ * A cached page is returned when one is available; otherwise a new page is
+ * allocated with @gfp through the slow path.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+	struct page *cached = __page_pool_get_cached(pool);
+
+	/* Fast-path hit: hand out the cached page */
+	if (cached)
+		return cached;
+
+	/* Slow-path: cache empty, do real allocation */
+	return __page_pool_alloc_pages_slow(pool, gfp);
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Undo the page_pool state attached to @page: when the pool maintains DMA
+ * mappings (PP_FLAG_DMA_MAP), unmap the page and clear page->private.
+ * Does nothing for pools without that flag.
+ */
+static void __page_pool_clean_page(struct page_pool *pool,
+				   struct page *page)
+{
+	if (pool->p.flags & PP_FLAG_DMA_MAP) {
+		/* DMA unmap, using the address kept in page->private */
+		dma_unmap_page(pool->p.dev, page_private(page),
+			       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+		set_page_private(page, 0);
+	}
+}
+
+/* Give @page back to the page allocator: drop the pool's state on it
+ * first, then release the reference with put_page().
+ */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+	__page_pool_clean_page(pool, page);
+
+	/* A possible optimization is __free_pages(page, pool->p.order),
+	 * knowing the page is not part of page-cache (thus avoiding a
+	 * __page_cache_release() call).
+	 */
+	put_page(page);
+}
+
+/* Queue @page on the pool's recycle ring. Returns true when the ring
+ * accepted the page, false when the produce call reported an error.
+ */
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+					  struct page *page)
+{
+	int err;
+
+	/* BH protection not needed if current is serving softirq */
+	err = in_serving_softirq() ?
+		ptr_ring_produce(&pool->ring, page) :
+		ptr_ring_produce_bh(&pool->ring, page);
+
+	return err == 0;
+}
+
+/* Direct recycling into the alloc side cache is only allowed in special
+ * circumstances, e.g. during RX-NAPI processing for the XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ *
+ * Returns true when @page was stored in the alloc cache, false when the
+ * cache already holds PP_ALLOC_CACHE_SIZE pages.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+				       struct page_pool *pool)
+{
+	if (likely(pool->alloc.count != PP_ALLOC_CACHE_SIZE)) {
+		/* Caller MUST have verified/know (page_ref_count(page) == 1) */
+		pool->alloc.cache[pool->alloc.count++] = page;
+		return true;
+	}
+
+	return false;
+}
+
+/* Return @page to @pool.
+ *
+ * This allocator is optimized for the XDP mode that uses
+ * one-frame-per-page, but has fallbacks that act like the regular page
+ * allocator APIs.
+ *
+ * With a refcnt of 1 the pool owns the page and recycles it: into the
+ * alloc cache when @allow_direct is set and we are serving a softirq,
+ * otherwise into the ring, and back to the page allocator when the ring
+ * refuses it. With any other refcnt the page is cleaned and released.
+ */
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct)
+{
+	/* Read barrier done in page_ref_count / READ_ONCE */
+	if (unlikely(page_ref_count(page) != 1)) {
+		/* Fallback/non-XDP mode: API user has elevated refcnt.
+		 *
+		 * Many drivers split up the page into fragments, and some
+		 * want to keep doing this to save memory and do refcnt
+		 * based recycling. Support this use case too, to ease
+		 * drivers switching between XDP/non-XDP.
+		 *
+		 * In case page_pool maintains the DMA mapping, API user
+		 * must call page_pool_put_page once. In this elevated
+		 * refcnt case, the DMA is unmapped/released, as driver is
+		 * likely doing refcnt based recycle tricks, meaning another
+		 * process will be invoking put_page.
+		 */
+		__page_pool_clean_page(pool, page);
+		put_page(page);
+		return;
+	}
+
+	/* refcnt == 1 means page_pool owns page, and can recycle it. */
+	if (allow_direct && in_serving_softirq() &&
+	    __page_pool_recycle_direct(page, pool))
+		return;
+
+	/* Ring full, fallback to freeing the page */
+	if (!__page_pool_recycle_into_ring(pool, page))
+		__page_pool_return_page(pool, page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+/* Drain the recycle ring, returning every page to the page allocator.
+ * A page whose refcnt is not 1 is reported with pr_crit() and still
+ * returned.
+ */
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+	struct page *page;
+
+	for (;;) {
+		page = ptr_ring_consume(&pool->ring);
+		if (!page)
+			break;
+
+		/* Verify the refcnt invariant of cached pages */
+		if (page_ref_count(page) != 1)
+			pr_crit("%s() page_pool refcnt %d violation\n",
+				__func__, page_ref_count(page));
+
+		__page_pool_return_page(pool, page);
+	}
+}
+
+/* RCU callback that finishes page_pool teardown: warn if the alloc cache
+ * still holds pages, drain and clean up the ring, then free the pool.
+ */
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+	struct page_pool *pool = container_of(rcu, struct page_pool, rcu);
+
+	WARN(pool->alloc.count, "API usage violation");
+
+	__page_pool_empty_ring(pool);
+	ptr_ring_cleanup(&pool->ring, NULL);
+	kfree(pool);
+}
+
+/* Cleanup and release resources: return all cached and ring pages to the
+ * page allocator and schedule the final free of @pool through call_rcu().
+ */
+void page_pool_destroy(struct page_pool *pool)
+{
+	/* Empty alloc cache, assume caller made sure this is no longer
+	 * in use, and page_pool_alloc_pages() cannot be called
+	 * concurrently.
+	 */
+	while (pool->alloc.count) {
+		pool->alloc.count--;
+		__page_pool_return_page(pool,
+					pool->alloc.cache[pool->alloc.count]);
+	}
+
+	/* No more consumers should exist, but producers could still
+	 * be in-flight.
+	 */
+	__page_pool_empty_ring(pool);
+
+	/* An xdp_mem_allocator can still ref page_pool pointer */
+	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 45936922d7e2..5ef61222fdef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,6 +59,9 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#define RTNL_MAX_TYPE 48
+#define RTNL_SLAVE_MAX_TYPE 36
+
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
@@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
{
int err;
+ /* Sanity-check max sizes to avoid stack buffer overflow. */
+ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+ ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+ return -EINVAL;
+
rtnl_lock();
err = __rtnl_link_register(ops);
rtnl_unlock();
@@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
- .rta_used = dst->__use,
- .rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
};
+ if (dst) {
+ ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+ ci.rta_used = dst->__use;
+ ci.rta_clntref = atomic_read(&dst->__refcnt);
+ }
if (expires) {
unsigned long clock;
@@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
const struct net_device_ops *ops = dev->netdev_ops;
int err;
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
@@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = validate_linkmsg(dev, tb);
- if (err < 0)
- goto errout;
-
err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
errout:
return err;
@@ -2900,13 +2910,16 @@ replay:
}
if (1) {
- struct nlattr *attr[ops ? ops->maxtype + 1 : 1];
- struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
struct nlattr **data = NULL;
struct nlattr **slave_data = NULL;
struct net *dest_net, *link_net = NULL;
if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE)
+ return -EINVAL;
+
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
err = nla_parse_nested(attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
@@ -2923,6 +2936,9 @@ replay:
}
if (m_ops) {
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
if (m_ops->slave_maxtype &&
linkinfo[IFLA_INFO_SLAVE_DATA]) {
err = nla_parse_nested(slave_attr,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..c642304f178c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
skb->inner_mac_header += off;
}
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
__copy_skb_header(new, old);
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+EXPORT_SYMBOL(skb_copy_header);
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
return n;
}
EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_clone_fraglist(n);
}
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
out:
return n;
}
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
skb->len + head_copy_len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
skb_headers_offset_update(n, newheadroom - oldheadroom);
@@ -1839,6 +1840,20 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+/* Note: use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0));
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
/**
* __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate
@@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/core/sock.c b/net/core/sock.c
index 2aed99a541d5..f333d75ef1a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
- x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
"rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
"rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
- "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX"
+ "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
+ "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
"wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
- "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX"
+ "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
+ "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
"elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
- "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX"
+ "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
+ "elock-AF_MAX"
};
/*
@@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);
int sysctl_tstamp_allow_data __read_mostly = 1;
-struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL_GPL(memalloc_socks);
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
* sk_set_memalloc - sets %SOCK_MEMALLOC
@@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)
{
sock_set_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation |= __GFP_MEMALLOC;
- static_key_slow_inc(&memalloc_socks);
+ static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
@@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)
{
sock_reset_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation &= ~__GFP_MEMALLOC;
- static_key_slow_dec(&memalloc_socks);
+ static_branch_dec(&memalloc_socks_key);
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
@@ -724,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
- sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuse != val)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
+ sk->sk_reuse = val;
break;
case SO_REUSEPORT:
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuseport != valbool)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
sk->sk_reuseport = valbool;
break;
case SO_TYPE:
@@ -905,7 +922,10 @@ set_rcvbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
+ if (sock->ops->set_rcvlowat)
+ ret = sock->ops->set_rcvlowat(sk, val);
+ else
+ sk->sk_rcvlowat = val ? : 1;
break;
case SO_RCVTIMEO:
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..9d1f22072d5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -5,6 +5,10 @@
*/
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/page_pool.h>
#include <net/xdp.h>
@@ -13,6 +17,105 @@
#define REG_STATE_UNREGISTERED 0x2
#define REG_STATE_UNUSED 0x3
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+ struct xdp_mem_info mem;
+ union {
+ void *allocator;
+ struct page_pool *page_pool;
+ struct zero_copy_allocator *zc_alloc;
+ };
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+ const u32 *k = data;
+ const u32 key = *k;
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ != sizeof(u32));
+
+ /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
+ return key << RHT_HASH_RESERVED_SPACE;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xdp_mem_allocator *xa = ptr;
+ u32 mem_id = *(u32 *)arg->key;
+
+ return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+ .nelem_hint = 64,
+ .head_offset = offsetof(struct xdp_mem_allocator, node),
+ .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
+ .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .max_size = MEM_ID_MAX,
+ .min_size = 8,
+ .automatic_shrinking = true,
+ .hashfn = xdp_mem_id_hashfn,
+ .obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+ struct xdp_mem_allocator *xa;
+
+ xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+ /* Allow this ID to be reused */
+ ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+ /* Notice, driver is expected to free the *allocator,
+ * e.g. page_pool, and MUST also use RCU free.
+ */
+
+ /* Poison memory */
+ xa->mem.id = 0xFFFF;
+ xa->mem.type = 0xF0F0;
+ xa->allocator = (void *)0xDEAD9001;
+
+ kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ struct xdp_mem_allocator *xa;
+ int id = xdp_rxq->mem.id;
+ int err;
+
+ if (id == 0)
+ return;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+
+ err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
+ WARN_ON(err);
+
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+}
+
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+ __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
+
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+ struct rhashtable *rht;
+ int ret;
+
+ if (unlikely(mem_id_init))
+ return 0;
+
+ rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+ if (!rht)
+ return -ENOMEM;
+
+ ret = rhashtable_init(rht, &mem_id_rht_params);
+ if (ret < 0) {
+ kfree(rht);
+ return ret;
+ }
+ mem_id_ht = rht;
+ smp_mb(); /* mutex lock should provide enough pairing */
+ mem_id_init = true;
+
+ return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+ int retries = 1;
+ int id;
+
+again:
+ id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ /* Cyclic allocator, reset next id */
+ if (retries--) {
+ mem_id_next = MEM_ID_MIN;
+ goto again;
+ }
+ }
+ return id; /* errno */
+ }
+ mem_id_next = id + 1;
+
+ return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+ if (type == MEM_TYPE_PAGE_POOL)
+ return is_page_pool_compiled_in();
+
+ if (type >= MEM_TYPE_MAX)
+ return false;
+
+ return true;
+}
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+ gfp_t gfp = GFP_KERNEL;
+ int id, errno, ret;
+ void *ptr;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ if (!__is_supported_mem_type(type))
+ return -EOPNOTSUPP;
+
+ xdp_rxq->mem.type = type;
+
+ if (!allocator) {
+ if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ return -EINVAL; /* Setup time check page_pool req */
+ return 0;
+ }
+
+ /* Delay init of rhashtable to save memory if feature isn't used */
+ if (!mem_id_init) {
+ mutex_lock(&mem_id_lock);
+ ret = __mem_id_init_hash_table();
+ mutex_unlock(&mem_id_lock);
+ if (ret < 0) {
+ WARN_ON(1);
+ return ret;
+ }
+ }
+
+ xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+ if (!xdp_alloc)
+ return -ENOMEM;
+
+ mutex_lock(&mem_id_lock);
+ id = __mem_id_cyclic_get(gfp);
+ if (id < 0) {
+ errno = id;
+ goto err;
+ }
+ xdp_rxq->mem.id = id;
+ xdp_alloc->mem = xdp_rxq->mem;
+ xdp_alloc->allocator = allocator;
+
+ /* Insert allocator into ID lookup table */
+ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+ if (IS_ERR(ptr)) {
+ errno = PTR_ERR(ptr);
+ goto err;
+ }
+
+ mutex_unlock(&mem_id_lock);
+
+ return 0;
+err:
+ mutex_unlock(&mem_id_lock);
+ kfree(xdp_alloc);
+ return errno;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolean
+ * is used for those call sites, thus allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ unsigned long handle)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ switch (mem->type) {
+ case MEM_TYPE_PAGE_POOL:
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_put_page(xa->page_pool, page, napi_direct);
+ else
+ put_page(page);
+ rcu_read_unlock();
+ break;
+ case MEM_TYPE_PAGE_SHARED:
+ page_frag_free(data);
+ break;
+ case MEM_TYPE_PAGE_ORDER0:
+ page = virt_to_page(data); /* Assumes order0 page*/
+ put_page(page);
+ break;
+ case MEM_TYPE_ZERO_COPY:
+ /* NB! Only valid from an xdp_buff! */
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ xa->zc_alloc->free(xa->zc_alloc, handle);
+ rcu_read_unlock();
+ default:
+ /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ break;
+ }
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index bae7d78aa068..d2f4e0c1faaf 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -176,6 +176,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
[DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)},
[DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
[DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
+ [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
};
/* DCB number of traffic classes nested attributes. */
@@ -1094,6 +1095,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
return -EMSGSIZE;
}
+ if (ops->dcbnl_getbuffer) {
+ struct dcbnl_buffer buffer;
+
+ memset(&buffer, 0, sizeof(buffer));
+ err = ops->dcbnl_getbuffer(netdev, &buffer);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
+ return -EMSGSIZE;
+ }
+
app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
if (!app)
return -EMSGSIZE;
@@ -1453,6 +1464,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
goto err;
}
+ if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
+ struct dcbnl_buffer *buffer =
+ nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+
+ err = ops->dcbnl_setbuffer(netdev, buffer);
+ if (err)
+ goto err;
+ }
+
if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
struct nlattr *attr;
int rem;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 37ccbe62eb1a..ba6fc3c1186b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -53,7 +53,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (timeo < rto)
timeo = rto;
- tw->tw_timeout = DCCP_TIMEWAIT_LEN;
if (state == DCCP_TIME_WAIT)
timeo = DCCP_TIMEWAIT_LEN;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f509c9..72236695db3d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- if (frh->tos)
+ if (frh->tos) {
+ NL_SET_ERR_MSG(extack, "Invalid tos value");
goto errout;
+ }
if (rule->table == RT_TABLE_UNSPEC) {
if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index bbf2c82cf7b2..4183e4ba27a5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -9,7 +9,7 @@ config NET_DSA
depends on HAVE_NET_DSA && MAY_USE_DEVLINK
depends on BRIDGE || BRIDGE=n
select NET_SWITCHDEV
- select PHYLIB
+ select PHYLINK
---help---
Say Y if you want to enable support for the hardware switches supported
by the Distributed Switch Architecture.
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 47725250b4ca..dc5d9af3dc80 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -272,7 +272,28 @@ static int dsa_port_setup(struct dsa_port *dp)
case DSA_PORT_TYPE_UNUSED:
break;
case DSA_PORT_TYPE_CPU:
+ /* dp->index is used now as port_number. However
+ * CPU ports should have separate numbering
+ * independent from front panel port numbers.
+ */
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_CPU,
+ dp->index, false, 0);
+ err = dsa_port_link_register_of(dp);
+ if (err) {
+ dev_err(ds->dev, "failed to setup link for port %d.%d\n",
+ ds->index, dp->index);
+ return err;
+ }
+ break;
case DSA_PORT_TYPE_DSA:
+ /* dp->index is used now as port_number. However
+ * DSA ports should have separate numbering
+ * independent from front panel port numbers.
+ */
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_DSA,
+ dp->index, false, 0);
err = dsa_port_link_register_of(dp);
if (err) {
dev_err(ds->dev, "failed to setup link for port %d.%d\n",
@@ -281,6 +302,9 @@ static int dsa_port_setup(struct dsa_port *dp)
}
break;
case DSA_PORT_TYPE_USER:
+ devlink_port_attrs_set(&dp->devlink_port,
+ DEVLINK_PORT_FLAVOUR_PHYSICAL,
+ dp->index, false, 0);
err = dsa_slave_create(dp);
if (err)
dev_err(ds->dev, "failed to create slave for port %d.%d\n",
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 053731473c99..3964c6f7a7c0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -75,15 +75,6 @@ struct dsa_slave_priv {
/* DSA port data, such as switch, port index, etc. */
struct dsa_port *dp;
- /*
- * The phylib phy_device pointer for the PHY connected
- * to this port.
- */
- phy_interface_t phy_interface;
- int old_link;
- int old_pause;
- int old_duplex;
-
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *netpoll;
#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
int port = cpu_dp->index;
int count = 0;
- if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+ if (ops->get_sset_count && ops->get_ethtool_stats) {
count = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_ethtool_stats(dev, stats, data);
}
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
ds->ops->get_ethtool_stats(ds, port, data + count);
}
+static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ }
+
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
static int dsa_master_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops && ops->get_sset_count)
- count += ops->get_sset_count(dev, sset);
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops->get_sset_count)
+ count = ops->get_sset_count(dev, sset);
+
+ if (count < 0)
+ count = 0;
- if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, cpu_dp->index);
+ if (ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
return count;
}
@@ -64,19 +96,28 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
- if (ops && ops->get_sset_count && ops->get_strings) {
- mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats) {
+ mcount = phy_ethtool_get_sset_count(dev->phydev);
+ if (mcount < 0)
+ mcount = 0;
+ else
+ phy_ethtool_get_strings(dev->phydev, data);
+ } else if (ops->get_sset_count && ops->get_strings) {
+ mcount = ops->get_sset_count(dev, stringset);
+ if (mcount < 0)
+ mcount = 0;
ops->get_strings(dev, stringset, data);
}
- if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+ if (ds->ops->get_strings) {
ndata = data + mcount * len;
/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
* the output after to prepend our CPU port prefix we
* constructed earlier
*/
- ds->ops->get_strings(ds, port, ndata);
- count = ds->ops->get_sset_count(ds, port);
+ ds->ops->get_strings(ds, port, stringset, ndata);
+ count = ds->ops->get_sset_count(ds, port, stringset);
for (i = 0; i < count; i++) {
memmove(ndata + (i * len + sizeof(pfx)),
ndata + i * len, len - sizeof(pfx));
@@ -102,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
ops->get_sset_count = dsa_master_get_sset_count;
ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
ops->get_strings = dsa_master_get_strings;
+ ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
dev->ethtool_ops = ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..ed0595459df1 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
+ if (netif_is_bridge_master(vlan->obj.orig_dev))
+ return -EOPNOTSUPP;
+
if (br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
@@ -267,31 +270,47 @@ int dsa_port_vlan_del(struct dsa_port *dp,
.vlan = vlan,
};
+ if (netif_is_bridge_master(vlan->obj.orig_dev))
+ return -EOPNOTSUPP;
+
if (br_vlan_enabled(dp->bridge_dev))
return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
return 0;
}
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
- struct device_node *port_dn = dp->dn;
struct device_node *phy_dn;
- struct dsa_switch *ds = dp->ds;
struct phy_device *phydev;
- int port = dp->index;
- int err = 0;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+ phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
if (!phy_dn)
- return 0;
+ return NULL;
phydev = of_phy_find_device(phy_dn);
if (!phydev) {
- err = -EPROBE_DEFER;
- goto err_put_of;
+ of_node_put(phy_dn);
+ return ERR_PTR(-EPROBE_DEFER);
}
+ return phydev;
+}
+
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct phy_device *phydev;
+ int port = dp->index;
+ int err = 0;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (!phydev)
+ return 0;
+
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
if (enable) {
err = genphy_config_init(phydev);
if (err < 0)
@@ -317,8 +336,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
err_put_dev:
put_device(&phydev->mdio.dev);
-err_put_of:
- of_node_put(phy_dn);
return err;
}
@@ -372,3 +389,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
else
dsa_port_setup_phy_of(dp, false);
}
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_strings(phydev, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
+
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_stats(phydev, NULL, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
+
+int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_sset_count(phydev);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..1e3b6a6d8a40 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -13,6 +13,7 @@
#include <linux/netdevice.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
#include <linux/of_net.h>
#include <linux/of_mdio.h>
#include <linux/mdio.h>
@@ -97,8 +98,7 @@ static int dsa_slave_open(struct net_device *dev)
if (err)
goto clear_promisc;
- if (dev->phydev)
- phy_start(dev->phydev);
+ phylink_start(dp->pl);
return 0;
@@ -120,8 +120,7 @@ static int dsa_slave_close(struct net_device *dev)
struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- if (dev->phydev)
- phy_stop(dev->phydev);
+ phylink_stop(dp->pl);
dsa_port_disable(dp, dev->phydev);
@@ -272,10 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
- if (!dev->phydev)
- return -ENODEV;
-
- return phy_mii_ioctl(dev->phydev, ifr, cmd);
+ return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
}
static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -498,14 +494,11 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
ds->ops->get_regs(ds, dp->index, regs, _p);
}
-static u32 dsa_slave_get_link(struct net_device *dev)
+static int dsa_slave_nway_reset(struct net_device *dev)
{
- if (!dev->phydev)
- return -ENODEV;
-
- genphy_update_link(dev->phydev);
+ struct dsa_port *dp = dsa_slave_to_port(dev);
- return dev->phydev->link;
+ return phylink_ethtool_nway_reset(dp->pl);
}
static int dsa_slave_get_eeprom_len(struct net_device *dev)
@@ -560,7 +553,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
if (ds->ops->get_strings)
- ds->ops->get_strings(ds, dp->index, data + 4 * len);
+ ds->ops->get_strings(ds, dp->index, stringset,
+ data + 4 * len);
}
}
@@ -605,7 +599,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
count = 4;
if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, dp->index);
+ count += ds->ops->get_sset_count(ds, dp->index, sset);
return count;
}
@@ -618,6 +612,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_switch *ds = dp->ds;
+ phylink_ethtool_get_wol(dp->pl, w);
+
if (ds->ops->get_wol)
ds->ops->get_wol(ds, dp->index, w);
}
@@ -628,6 +624,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
struct dsa_switch *ds = dp->ds;
int ret = -EOPNOTSUPP;
+ phylink_ethtool_set_wol(dp->pl, w);
+
if (ds->ops->set_wol)
ret = ds->ops->set_wol(ds, dp->index, w);
@@ -651,13 +649,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
if (ret)
return ret;
- if (e->eee_enabled) {
- ret = phy_init_eee(dev->phydev, 0);
- if (ret)
- return ret;
- }
-
- return phy_ethtool_set_eee(dev->phydev, e);
+ return phylink_ethtool_set_eee(dp->pl, e);
}
static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -677,7 +669,23 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
if (ret)
return ret;
- return phy_ethtool_get_eee(dev->phydev, e);
+ return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_slave_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_slave_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return phylink_ethtool_ksettings_set(dp->pl, cmd);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -980,8 +988,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
.get_regs = dsa_slave_get_regs,
- .nway_reset = phy_ethtool_nway_reset,
- .get_link = dsa_slave_get_link,
+ .nway_reset = dsa_slave_nway_reset,
+ .get_link = ethtool_op_get_link,
.get_eeprom_len = dsa_slave_get_eeprom_len,
.get_eeprom = dsa_slave_get_eeprom,
.set_eeprom = dsa_slave_set_eeprom,
@@ -992,8 +1000,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_wol = dsa_slave_get_wol,
.set_eee = dsa_slave_set_eee,
.get_eee = dsa_slave_get_eee,
- .get_link_ksettings = phy_ethtool_get_link_ksettings,
- .set_link_ksettings = phy_ethtool_set_link_ksettings,
+ .get_link_ksettings = dsa_slave_get_link_ksettings,
+ .set_link_ksettings = dsa_slave_set_link_ksettings,
.get_rxnfc = dsa_slave_get_rxnfc,
.set_rxnfc = dsa_slave_set_rxnfc,
.get_ts_info = dsa_slave_get_ts_info,
@@ -1052,56 +1060,122 @@ static struct device_type dsa_type = {
.name = "dsa",
};
-static void dsa_slave_adjust_link(struct net_device *dev)
+static void dsa_slave_phylink_validate(struct net_device *dev,
+ unsigned long *supported,
+ struct phylink_link_state *state)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = dp->ds;
- unsigned int status_changed = 0;
- if (p->old_link != dev->phydev->link) {
- status_changed = 1;
- p->old_link = dev->phydev->link;
- }
+ if (!ds->ops->phylink_validate)
+ return;
- if (p->old_duplex != dev->phydev->duplex) {
- status_changed = 1;
- p->old_duplex = dev->phydev->duplex;
- }
+ ds->ops->phylink_validate(ds, dp->index, supported, state);
+}
- if (p->old_pause != dev->phydev->pause) {
- status_changed = 1;
- p->old_pause = dev->phydev->pause;
- }
+static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ /* Only called for SGMII and 802.3z */
+ if (!ds->ops->phylink_mac_link_state)
+ return -EOPNOTSUPP;
+
+ return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+}
+
+static void dsa_slave_phylink_mac_config(struct net_device *dev,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_config)
+ return;
+
+ ds->ops->phylink_mac_config(ds, dp->index, mode, state);
+}
- if (ds->ops->adjust_link && status_changed)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
+static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
- if (status_changed)
- phy_print_status(dev->phydev);
+ if (!ds->ops->phylink_mac_an_restart)
+ return;
+
+ ds->ops->phylink_mac_an_restart(ds, dp->index);
}
-static int dsa_slave_fixed_link_update(struct net_device *dev,
- struct fixed_phy_status *status)
+static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
+ unsigned int mode,
+ phy_interface_t interface)
{
- struct dsa_switch *ds;
- struct dsa_port *dp;
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
- if (dev) {
- dp = dsa_slave_to_port(dev);
- ds = dp->ds;
- if (ds->ops->fixed_link_update)
- ds->ops->fixed_link_update(ds, dp->index, status);
+ if (!ds->ops->phylink_mac_link_down) {
+ if (ds->ops->adjust_link && dev->phydev)
+ ds->ops->adjust_link(ds, dp->index, dev->phydev);
+ return;
}
- return 0;
+ ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
+}
+
+static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->phylink_mac_link_up) {
+ if (ds->ops->adjust_link && dev->phydev)
+ ds->ops->adjust_link(ds, dp->index, dev->phydev);
+ return;
+ }
+
+ ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+}
+
+static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
+ .validate = dsa_slave_phylink_validate,
+ .mac_link_state = dsa_slave_phylink_mac_link_state,
+ .mac_config = dsa_slave_phylink_mac_config,
+ .mac_an_restart = dsa_slave_phylink_mac_an_restart,
+ .mac_link_down = dsa_slave_phylink_mac_link_down,
+ .mac_link_up = dsa_slave_phylink_mac_link_up,
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+ const struct dsa_port *dp = dsa_to_port(ds, port);
+
+ phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_slave_phylink_fixed_state(struct net_device *dev,
+ struct phylink_link_state *state)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ /* No need to check that this operation is valid, the callback would
+ * not be called if it was not.
+ */
+ ds->ops->phylink_fixed_state(ds, dp->index, state);
}
/* slave device setup *******************************************************/
static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
struct dsa_switch *ds = dp->ds;
slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
@@ -1110,75 +1184,54 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
return -ENODEV;
}
- /* Use already configured phy mode */
- if (p->phy_interface == PHY_INTERFACE_MODE_NA)
- p->phy_interface = slave_dev->phydev->interface;
-
- return phy_connect_direct(slave_dev, slave_dev->phydev,
- dsa_slave_adjust_link, p->phy_interface);
+ return phylink_connect_phy(dp->pl, slave_dev->phydev);
}
static int dsa_slave_phy_setup(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
struct device_node *port_dn = dp->dn;
struct dsa_switch *ds = dp->ds;
- struct device_node *phy_dn;
- bool phy_is_fixed = false;
u32 phy_flags = 0;
int mode, ret;
mode = of_get_phy_mode(port_dn);
if (mode < 0)
mode = PHY_INTERFACE_MODE_NA;
- p->phy_interface = mode;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
- if (!phy_dn && of_phy_is_fixed_link(port_dn)) {
- /* In the case of a fixed PHY, the DT node associated
- * to the fixed PHY is the Port DT node
- */
- ret = of_phy_register_fixed_link(port_dn);
- if (ret) {
- netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret);
- return ret;
- }
- phy_is_fixed = true;
- phy_dn = of_node_get(port_dn);
+ dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
+ &dsa_slave_phylink_mac_ops);
+ if (IS_ERR(dp->pl)) {
+ netdev_err(slave_dev,
+ "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
+ return PTR_ERR(dp->pl);
}
+ /* Register only if the switch provides such a callback, since this
+ * callback takes precedence over polling the link GPIO in PHYLINK
+ * (see phylink_get_fixed_state).
+ */
+ if (ds->ops->phylink_fixed_state)
+ phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state);
+
if (ds->ops->get_phy_flags)
phy_flags = ds->ops->get_phy_flags(ds, dp->index);
- if (phy_dn) {
- slave_dev->phydev = of_phy_connect(slave_dev, phy_dn,
- dsa_slave_adjust_link,
- phy_flags,
- p->phy_interface);
- of_node_put(phy_dn);
- }
-
- if (slave_dev->phydev && phy_is_fixed)
- fixed_phy_set_link_update(slave_dev->phydev,
- dsa_slave_fixed_link_update);
-
- /* We could not connect to a designated PHY, so use the switch internal
- * MDIO bus instead
- */
- if (!slave_dev->phydev) {
+ ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+ if (ret == -ENODEV) {
+ /* We could not connect to a designated PHY or SFP, so use the
+ * switch internal MDIO bus instead
+ */
ret = dsa_slave_phy_connect(slave_dev, dp->index);
if (ret) {
- netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+ netdev_err(slave_dev,
+ "failed to connect to port %d: %d\n",
dp->index, ret);
- if (phy_is_fixed)
- of_phy_deregister_fixed_link(port_dn);
+ phylink_destroy(dp->pl);
return ret;
}
}
- phy_attached_info(slave_dev->phydev);
-
return 0;
}
@@ -1193,29 +1246,26 @@ static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
int dsa_slave_suspend(struct net_device *slave_dev)
{
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ struct dsa_port *dp = dsa_slave_to_port(slave_dev);
netif_device_detach(slave_dev);
- if (slave_dev->phydev) {
- phy_stop(slave_dev->phydev);
- p->old_pause = -1;
- p->old_link = -1;
- p->old_duplex = -1;
- phy_suspend(slave_dev->phydev);
- }
+ rtnl_lock();
+ phylink_stop(dp->pl);
+ rtnl_unlock();
return 0;
}
int dsa_slave_resume(struct net_device *slave_dev)
{
+ struct dsa_port *dp = dsa_slave_to_port(slave_dev);
+
netif_device_attach(slave_dev);
- if (slave_dev->phydev) {
- phy_resume(slave_dev->phydev);
- phy_start(slave_dev->phydev);
- }
+ rtnl_lock();
+ phylink_start(dp->pl);
+ rtnl_unlock();
return 0;
}
@@ -1280,11 +1330,6 @@ int dsa_slave_create(struct dsa_port *port)
p->dp = port;
INIT_LIST_HEAD(&p->mall_tc_list);
p->xmit = cpu_dp->tag_ops->xmit;
-
- p->old_pause = -1;
- p->old_link = -1;
- p->old_duplex = -1;
-
port->slave = slave_dev;
netif_carrier_off(slave_dev);
@@ -1307,9 +1352,10 @@ int dsa_slave_create(struct dsa_port *port)
return 0;
out_phy:
- phy_disconnect(slave_dev->phydev);
- if (of_phy_is_fixed_link(port->dn))
- of_phy_deregister_fixed_link(port->dn);
+ rtnl_lock();
+ phylink_disconnect_phy(p->dp->pl);
+ rtnl_unlock();
+ phylink_destroy(p->dp->pl);
out_free:
free_percpu(p->stats64);
free_netdev(slave_dev);
@@ -1321,17 +1367,15 @@ void dsa_slave_destroy(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
struct dsa_slave_priv *p = netdev_priv(slave_dev);
- struct device_node *port_dn = dp->dn;
netif_carrier_off(slave_dev);
- if (slave_dev->phydev) {
- phy_disconnect(slave_dev->phydev);
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
- if (of_phy_is_fixed_link(port_dn))
- of_phy_deregister_fixed_link(port_dn);
- }
dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
unregister_netdev(slave_dev);
+ phylink_destroy(dp->pl);
free_percpu(p->stats64);
free_netdev(slave_dev);
}
@@ -1394,6 +1438,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
switch (switchdev_work->event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
fdb_info = &switchdev_work->fdb_info;
+ if (!fdb_info->added_by_user)
+ break;
+
err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
if (err) {
netdev_dbg(dev, "fdb add failed err=%d\n", err);
@@ -1405,6 +1452,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
case SWITCHDEV_FDB_DEL_TO_DEVICE:
fdb_info = &switchdev_work->fdb_info;
+ if (!fdb_info->added_by_user)
+ break;
+
err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
if (err) {
netdev_dbg(dev, "fdb del failed err=%d\n", err);
@@ -1457,8 +1507,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
case SWITCHDEV_FDB_DEL_TO_DEVICE:
- if (dsa_slave_switchdev_fdb_work_init(switchdev_work,
- ptr))
+ if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
goto err_fdb_work_init;
dev_hold(dev);
break;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index eaeba9b99a73..ee28440f57c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len)
{
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
- struct flow_keys keys;
+ struct flow_keys_basic keys;
/* this should never happen, but better safe than sorry */
if (unlikely(len < sizeof(*eth)))
return len;
/* parse any remaining L2/L3 headers, check for L4 */
- if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
- sizeof(*eth), len, flags))
+ if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
+ sizeof(*eth), len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a07b7dd06def..eec9569ffa5c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,7 +13,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
- inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
+ metrics.o netlink.o
+
+obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a59428e63ab..15e125558c76 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
@@ -1006,6 +1008,7 @@ const struct proto_ops inet_stream_ops = {
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BPFILTER) += sockopt.o
+
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..5e04ed25bc0e
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+
+/* Handler that services bpfilter socket options. It is NULL until something
+ * assigns it; bpfilter_mbox_request() tries request_module("bpfilter") when
+ * it finds the pointer unset.
+ * NOTE(review): presumably the bpfilter module installs the handler on
+ * load -- confirm against net/bpfilter.
+ */
+int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+ char __user *optval,
+ unsigned int optlen, bool is_set);
+EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+
+/* Dispatch a get/set sockopt request to the bpfilter handler.
+ *
+ * If no handler is installed yet, attempt to load the "bpfilter" module
+ * first. Returns the request_module() error if loading fails, -ECHILD if
+ * the handler is still unset after a successful load, and otherwise
+ * whatever the handler returns. @is_set is passed through to the handler
+ * (callers in this file use true for set, false for get).
+ */
+static int bpfilter_mbox_request(struct sock *sk, int optname,
+ char __user *optval,
+ unsigned int optlen, bool is_set)
+{
+ if (!bpfilter_process_sockopt) {
+ int err = request_module("bpfilter");
+
+ if (err)
+ return err;
+ if (!bpfilter_process_sockopt)
+ return -ECHILD;
+ }
+ return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+}
+
+/* setsockopt entry point for bpfilter: forwards the request to
+ * bpfilter_mbox_request() with is_set == true and returns its result.
+ */
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+ unsigned int optlen)
+{
+ return bpfilter_mbox_request(sk, optname, optval, optlen, true);
+}
+
+/* getsockopt entry point for bpfilter.
+ *
+ * Reads the caller-supplied buffer length from the user pointer @optlen
+ * (-EFAULT if that read fails) and forwards the request to
+ * bpfilter_mbox_request() with is_set == false.
+ * NOTE(review): len is a signed int but is passed on as unsigned int
+ * without a range check, so a negative user value becomes a very large
+ * length -- confirm the handler validates it.
+ */
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+ int __user *optlen)
+{
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ return bpfilter_mbox_request(sk, optname, optval, len, false);
+}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..d7585ab1a77a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
+ [IFA_RT_PRIORITY] = { .type = NLA_U32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_RT_PRIORITY])
+ ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
extack);
} else {
+ u32 new_metric = ifa->ifa_rt_priority;
+
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
ifa = ifa_existing;
+
+ if (ifa->ifa_rt_priority != new_metric) {
+ fib_modify_prefix_metric(ifa, new_metric);
+ ifa->ifa_rt_priority = new_metric;
+ }
+
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ (ifa->ifa_rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
preferred, valid))
goto nla_put_failure;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e66172aaf241..63aa39b3af03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.fl4_dport = 0;
}
- trace_fib_validate_source(dev, &fl4);
-
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
@@ -650,6 +648,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_UID] = { .type = NLA_U32 },
[RTA_MARK] = { .type = NLA_U32 },
[RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_IP_PROTO] = { .type = NLA_U8 },
+ [RTA_SPORT] = { .type = NLA_U16 },
+ [RTA_DPORT] = { .type = NLA_U16 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -846,7 +847,8 @@ out_err:
* to fib engine. It is legal, because all events occur
* only when netlink is already locked.
*/
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+ struct in_ifaddr *ifa, u32 rt_priority)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -856,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_type = type,
.fc_dst = dst,
.fc_dst_len = dst_len,
+ .fc_priority = rt_priority,
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -901,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
}
}
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_NEWROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ prefix, ifa->ifa_prefixlen, prim,
+ ifa->ifa_rt_priority);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
+ prim, 0);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
- 32, prim);
+ 32, prim, 0);
}
}
}
+/* Move the connected (prefix) route of @ifa to metric @new_metric.
+ *
+ * Nothing is done when no prefix route exists for the address, i.e. under
+ * the same conditions in which fib_add_ifaddr() does not install one: the
+ * device is down, the address is secondary or flagged IFA_F_NOPREFIXROUTE,
+ * the prefix is in the zero network, or the address is a host address
+ * (prefix equal to the local address AND a prefix length of 32).
+ *
+ * The route with the new metric is added before the one with the old
+ * metric (ifa->ifa_rt_priority) is deleted. This function does not update
+ * ifa->ifa_rt_priority; the caller does that afterwards.
+ */
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+ __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+
+ /* The last test must be the exact negation of the
+ * "prefix != addr || prefixlen < 32" test in fib_add_ifaddr(); using
+ * "||" here would skip addresses that do have a prefix route.
+ */
+ if (!(dev->flags & IFF_UP) ||
+ ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+ ipv4_is_zeronet(prefix) ||
+ (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
+ return;
+
+ /* add the new */
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, new_metric);
+
+ /* delete the old */
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
+}
+
/* Delete primary or secondary address.
* Optionally, on secondary address promotion consider the addresses
* from subnet iprim as deleted, even if they are in device list.
@@ -967,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_DELROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ any, ifa->ifa_prefixlen, prim, 0);
subnet = 1;
}
@@ -1051,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
no_promotions:
if (!(ok & BRD_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (subnet && ifa->ifa_prefixlen < 31) {
if (!(ok & BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+ prim, 0);
if (!(ok & BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+ prim, 0);
}
if (!(ok & LOCAL_OK)) {
unsigned int addr_type;
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
/* Check, that this local address finally disappeared. */
addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11bc8838..f8eb78d042a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- if (frh->tos & ~IPTOS_TOS_MASK)
+ if (frh->tos & ~IPTOS_TOS_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
+ }
/* split local/main if they are not already split */
err = fib_unmerge(net);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c27122f01b87..f3c89ccf14c5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -717,6 +717,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else {
+ if (nla_len(nla) != sizeof(u32))
+ return false;
val = nla_get_u32(nla);
}
@@ -1019,47 +1021,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
- bool ecn_ca = false;
- struct nlattr *nla;
- int remaining;
-
- if (!cfg->fc_mx)
- return 0;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
- u32 val;
-
- if (!type)
- continue;
- if (type > RTAX_MAX)
- return -EINVAL;
-
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
- return -EINVAL;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_ADVMSS && val > 65535 - 40)
- val = 65535 - 40;
- if (type == RTAX_MTU && val > 65535 - 15)
- val = 65535 - 15;
- if (type == RTAX_HOPLIMIT && val > 255)
- val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
- return -EINVAL;
- fi->fib_metrics->metrics[type - 1] = val;
- }
-
- if (ecn_ca)
- fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
-
- return 0;
+ return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
+ fi->fib_metrics->metrics);
}
struct fib_info *fib_create_info(struct fib_config *cfg,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 99c23a0cb8ca..5bc0c89e81e4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
unsigned long index;
t_key cindex;
- trace_fib_table_lookup(tb->tb_id, flp);
-
pn = t->kv;
cindex = 0;
n = get_child_rcu(pn, cindex);
- if (!n)
+ if (!n) {
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ backtrace:
* nothing for us to do as we do not have any
* further nodes to parse.
*/
- if (IS_TRIE(pn))
+ if (IS_TRIE(pn)) {
+ trace_fib_table_lookup(tb->tb_id, flp,
+ NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
@@ -1459,6 +1462,7 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
return err;
}
if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ found:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
- trace_fib_table_lookup_nh(nh);
+ trace_fib_table_lookup(tb->tb_id, flp, nh, err);
return err;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 881ac6d046f2..33a88e045efd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,11 +27,6 @@
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
-#endif
-
#if IS_ENABLED(CONFIG_IPV6)
/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
* only, and any IPv4 addresses if not IPv6 only
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f200b304f76c..2d8efeecf619 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -578,6 +578,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
int tunnel_hlen;
int version;
__be16 df;
+ int nhoff;
+ int thoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -605,6 +607,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+ truncate = true;
+
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
ntohl(md->u.index), truncate, true);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d54abc097800..af5a830ff6ad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ bool paged;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
+
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
@@ -906,8 +909,8 @@ static int __ip_append_data(struct sock *sk,
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
- !(flags & MSG_MORE) &&
- !exthdrlen)
+ (!(flags & MSG_MORE) || cork->gso_size) &&
+ (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
cork->length += length;
@@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -953,8 +957,12 @@ alloc_new_skb:
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
+ else if (!paged)
alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += exthdrlen;
@@ -998,7 +1006,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen + exthdrlen);
+ data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
@@ -1014,7 +1022,7 @@ alloc_new_skb:
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
@@ -1022,7 +1030,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
@@ -1136,6 +1144,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
*rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+ cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
@@ -1215,7 +1225,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = cork->fragsize;
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -1471,9 +1481,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
- unsigned int flags)
+ struct inet_cork *cork, unsigned int flags)
{
- struct inet_cork cork;
struct sk_buff_head queue;
int err;
@@ -1482,22 +1491,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.flags = 0;
- cork.addr = 0;
- cork.opt = NULL;
- err = ip_setup_cork(sk, &cork, ipc, rtp);
+ cork->flags = 0;
+ cork->addr = 0;
+ cork->opt = NULL;
+ err = ip_setup_cork(sk, cork, ipc, rtp);
if (err)
return ERR_PTR(err);
- err = __ip_append_data(sk, fl4, &queue, &cork,
+ err = __ip_append_data(sk, fl4, &queue, cork,
&current->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
- __ip_flush_pending_frames(sk, &queue, &cork);
+ __ip_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}
- return __ip_make_skb(sk, fl4, &queue, &cork);
+ return __ip_make_skb(sk, fl4, &queue, cork);
}
/*
@@ -1553,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
oif = skb->skb_iif;
flowi4_init_output(&fl4, oif,
- IP4_REPLY_MARK(net, skb->mark),
+ IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 57bbb060faaf..fc32fdbeefa6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@
#include <linux/errqueue.h>
#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+
/*
* SOL_IP control messages.
*/
@@ -1242,6 +1244,11 @@ int ip_setsockopt(struct sock *sk, int level,
return -ENOPROTOOPT;
err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+ optname < BPFILTER_IPT_SET_MAX)
+ err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1550,6 +1557,11 @@ int ip_getsockopt(struct sock *sk, int level,
int err;
err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+ optname < BPFILTER_IPT_GET_MAX)
+ err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1582,6 +1594,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
err = do_ip_getsockopt(sk, level, optname, optval, optlen,
MSG_CMSG_COMPAT);
+#ifdef CONFIG_BPFILTER
+ if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+ optname < BPFILTER_IPT_GET_MAX)
+ err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..dde671e97829 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -423,17 +423,17 @@ void __init ip_tunnel_core_init(void)
lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}
-struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
void ip_tunnel_need_metadata(void)
{
- static_key_slow_inc(&ip_tunnel_metadata_cnt);
+ static_branch_inc(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
void ip_tunnel_unneed_metadata(void)
{
- static_key_slow_dec(&ip_tunnel_metadata_cnt);
+ static_branch_dec(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bbcbcc113d19..88212615bf4c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
*
* Multiple Nameservers in /proc/net/pnp
* -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ * NTP servers in /proc/net/ipconfig/ntp_servers
+ * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018
*/
#include <linux/types.h>
@@ -93,6 +96,7 @@
#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
- '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */
#define NONE cpu_to_be32(INADDR_NONE)
#define ANY cpu_to_be32(INADDR_ANY)
@@ -152,6 +156,7 @@ static int ic_proto_used; /* Protocol used, if any */
#define ic_proto_used 0
#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
/*
@@ -576,6 +581,15 @@ static inline void __init ic_nameservers_predef(void)
ic_nameservers[i] = NONE;
}
+/* Predefine NTP servers: reset every slot of ic_ntp_servers[] to NONE
+ * (INADDR_NONE in network byte order), the value the rest of this file
+ * treats as "slot unused".
+ */
+static inline void __init ic_ntp_servers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+ ic_ntp_servers[i] = NONE;
+}
+
/*
* DHCP/BOOTP support.
*/
@@ -671,6 +685,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
17, /* Boot path */
26, /* MTU */
40, /* NIS domain name */
+ 42, /* NTP servers */
};
*e++ = 55; /* Parameter request list */
@@ -721,9 +736,11 @@ static void __init ic_bootp_init_ext(u8 *e)
*e++ = 3; /* Default gateway request */
*e++ = 4;
e += 4;
- *e++ = 5; /* Name server request */
- *e++ = 8;
- e += 8;
+#if CONF_NAMESERVERS_MAX > 0
+ *e++ = 6; /* (DNS) name server request */
+ *e++ = 4 * CONF_NAMESERVERS_MAX;
+ e += 4 * CONF_NAMESERVERS_MAX;
+#endif
*e++ = 12; /* Host name request */
*e++ = 32;
e += 32;
@@ -748,7 +765,13 @@ static void __init ic_bootp_init_ext(u8 *e)
*/
static inline void __init ic_bootp_init(void)
{
+ /* Re-initialise all name servers and NTP servers to NONE, in case any
+ * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+ * any IP addresses specified there will already have been decoded but
+ * are no longer needed
+ */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
dev_add_pack(&bootp_packet_type);
}
@@ -912,6 +935,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
ic_bootp_string(utsname()->domainname, ext+1, *ext,
__NEW_UTS_LEN);
break;
+ case 42: /* NTP servers */
+ servers = *ext / 4;
+ if (servers > CONF_NTP_SERVERS_MAX)
+ servers = CONF_NTP_SERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_ntp_servers[i] == NONE)
+ memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+ }
+ break;
}
}
@@ -1257,7 +1289,10 @@ static int __init ic_dynamic(void)
#endif /* IPCONFIG_DYNAMIC */
#ifdef CONFIG_PROC_FS
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+/* Name servers: */
static int pnp_seq_show(struct seq_file *seq, void *v)
{
int i;
@@ -1282,6 +1317,62 @@ static int pnp_seq_show(struct seq_file *seq, void *v)
&ic_servaddr);
return 0;
}
+
+/* Create the /proc/net/ipconfig directory and remember its entry in
+ * ipconfig_dir; returns 0 on success or -ENOMEM if the directory could not
+ * be created
+ */
+static int __init ipconfig_proc_net_init(void)
+{
+	ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+
+	return ipconfig_dir ? 0 : -ENOMEM;
+}
+
+/* Create a new read-only (0444) file under /proc/net/ipconfig.
+ *
+ * @name: name of the file, relative to the ipconfig directory
+ * @fops: file operations backing the new entry
+ *
+ * Returns 0 on success, or -ENOMEM if the ipconfig directory was never
+ * created or the entry could not be registered.
+ */
+static int ipconfig_proc_net_create(const char *name,
+				    const struct file_operations *fops)
+{
+	if (!ipconfig_dir)
+		return -ENOMEM;
+
+	/* Register directly beneath the directory entry instead of building
+	 * an "ipconfig/<name>" path relative to /proc/net: the resulting
+	 * entry is the same, but there is no temporary string allocation
+	 * (and therefore no extra failure path)
+	 */
+	if (!proc_create(name, 0444, ipconfig_dir, fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Emit the NTP server addresses for /proc/net/ipconfig/ntp_servers, one
+ * per line; slots still set to NONE are skipped
+ */
+static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+{
+	int idx;
+
+	for (idx = 0; idx < CONF_NTP_SERVERS_MAX; idx++) {
+		if (ic_ntp_servers[idx] == NONE)
+			continue;
+		seq_printf(seq, "%pI4\n", &ic_ntp_servers[idx]);
+	}
+
+	return 0;
+}
+
+/* open() handler: hand ntp_servers_seq_show to single_open() with no
+ * private data and return its result
+ */
+static int ntp_servers_seq_open(struct inode *inode, struct file *file)
+{
+	int ret = single_open(file, ntp_servers_seq_show, NULL);
+
+	return ret;
+}
+
+/* File operations used for the ntp_servers proc entry */
+static const struct file_operations ntp_servers_seq_fops = {
+	.open		= ntp_servers_seq_open,
+	.release	= single_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
#endif /* CONFIG_PROC_FS */
/*
@@ -1356,8 +1447,20 @@ static int __init ip_auto_config(void)
int err;
unsigned int i;
+ /* Initialise all name servers and NTP servers to NONE (but only if the
+ * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+ * otherwise we'll overwrite the IP addresses specified there)
+ */
+ if (ic_set_manually == 0) {
+ ic_nameservers_predef();
+ ic_ntp_servers_predef();
+ }
+
#ifdef CONFIG_PROC_FS
proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
+
+ if (ipconfig_proc_net_init() == 0)
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1469,16 +1572,32 @@ static int __init ip_auto_config(void)
&ic_servaddr, &root_server_addr, root_server_path);
if (ic_dev_mtu)
pr_cont(", mtu=%d", ic_dev_mtu);
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ /* Name servers (if any): */
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE) {
- pr_cont(" nameserver%u=%pI4",
- i, &ic_nameservers[i]);
- break;
+ if (i == 0)
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ else
+ pr_cont(", nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
}
- for (i++; i < CONF_NAMESERVERS_MAX; i++)
- if (ic_nameservers[i] != NONE)
- pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
- pr_cont("\n");
+ if (i + 1 == CONF_NAMESERVERS_MAX)
+ pr_cont("\n");
+ }
+ /* NTP servers (if any): */
+ for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ if (ic_ntp_servers[i] != NONE) {
+ if (i == 0)
+ pr_info(" ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ else
+ pr_cont(", ntpserver%u=%pI4",
+ i, &ic_ntp_servers[i]);
+ }
+ if (i + 1 == CONF_NTP_SERVERS_MAX)
+ pr_cont("\n");
+ }
#endif /* !SILENT */
/*
@@ -1576,7 +1695,9 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+ /* Initialise all name servers and NTP servers to NONE */
ic_nameservers_predef();
+ ic_ntp_servers_predef();
/* Parse string for static IP assignment. */
ip = addrs;
@@ -1635,6 +1756,13 @@ static int __init ip_auto_config_setup(char *addrs)
ic_nameservers[1] = NONE;
}
break;
+ case 9:
+ if (CONF_NTP_SERVERS_MAX >= 1) {
+ ic_ntp_servers[0] = in_aton(ip);
+ if (ic_ntp_servers[0] == ANY)
+ ic_ntp_servers[0] = NONE;
+ }
+ break;
}
}
ip = cp;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 37c4f885ff7b..9f79b9803a16 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
};
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..cafb0506c8c9 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id,
struct net *net))
{
struct mr_table *mrt;
+ int err;
mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
if (!mrt)
- return NULL;
+ return ERR_PTR(-ENOMEM);
mrt->id = id;
write_pnet(&mrt->net, net);
mrt->ops = *ops;
- if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) {
+ err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+ if (err) {
kfree(mrt);
- return NULL;
+ return ERR_PTR(err);
}
INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 000000000000..04311f7067e2
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,55 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/tcp.h>
+
+/* Convert a run of RTAX_* netlink attributes into a metrics array.
+ *
+ * @net:       namespace handed to the congestion-control name lookup
+ * @fc_mx:     first attribute to walk; NULL means there is nothing to do
+ * @fc_mx_len: length passed to nla_for_each_attr() for the walk
+ * @metrics:   output array; the value for attribute type T lands at T - 1
+ *
+ * Returns 0 on success. Returns -EINVAL when an attribute type exceeds
+ * RTAX_MAX, a congestion-control name is not recognised, a numeric attribute
+ * is not exactly sizeof(u32) long, or RTAX_FEATURES carries bits outside
+ * RTAX_FEATURE_MASK. Entries written before the failing attribute are left
+ * in @metrics.
+ */
+int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
+		       u32 *metrics)
+{
+	struct nlattr *attr;
+	bool ecn_ca = false;
+	int rem;
+
+	if (!fc_mx)
+		return 0;
+
+	nla_for_each_attr(attr, fc_mx, fc_mx_len, rem) {
+		int type = nla_type(attr);
+		u32 val;
+
+		/* Type 0 has no slot in the metrics array: ignore it */
+		if (!type)
+			continue;
+		if (type > RTAX_MAX)
+			return -EINVAL;
+
+		if (type != RTAX_CC_ALGO) {
+			if (nla_len(attr) != sizeof(u32))
+				return -EINVAL;
+			val = nla_get_u32(attr);
+		} else {
+			/* The algorithm arrives by name; store its key */
+			char name[TCP_CA_NAME_MAX];
+
+			nla_strlcpy(name, attr, sizeof(name));
+			val = tcp_ca_get_key_by_name(net, name, &ecn_ca);
+			if (val == TCP_CA_UNSPEC)
+				return -EINVAL;
+		}
+
+		/* Clamp the bounded metrics and reject unknown feature bits */
+		switch (type) {
+		case RTAX_ADVMSS:
+			if (val > 65535 - 40)
+				val = 65535 - 40;
+			break;
+		case RTAX_MTU:
+			if (val > 65535 - 15)
+				val = 65535 - 15;
+			break;
+		case RTAX_HOPLIMIT:
+			if (val > 255)
+				val = 255;
+			break;
+		case RTAX_FEATURES:
+			if (val & ~RTAX_FEATURE_MASK)
+				return -EINVAL;
+			break;
+		default:
+			break;
+		}
+
+		metrics[type - 1] = val;
+	}
+
+	/* Set by tcp_ca_get_key_by_name() through its last argument */
+	if (ecn_ca)
+		metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_metrics_convert);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 280048e1e395..bbfc356cb1b5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
This option enables the IPv4 socket lookup infrastructure. This is
+ required by the {ip,nf}tables socket match.
+ is required by the {ip,nf}tables socket match.
+
+config NF_TPROXY_IPV4
+ tristate "IPv4 tproxy support"
if NF_TABLES
@@ -129,10 +132,7 @@ config NFT_CHAIN_NAT_IPV4
source and destination ports.
config NF_NAT_MASQUERADE_IPV4
- tristate "IPv4 masquerade support"
- help
- This is the kernel functionality to provide NAT in the masquerade
- flavour (automatic source address selection).
+ bool
config NFT_MASQ_IPV4
tristate "IPv4 masquerading support for nf_tables"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0e5edd0c7926..8394c17c269f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,12 +10,14 @@ nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
# logging
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
@@ -32,9 +34,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-
-
# NAT protocols (nf_nat)
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e85f35b89c49..38ab97b0a2ec 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,7 +301,7 @@ ipt_do_table(struct sk_buff *skb,
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
- t = ipt_get_target(e);
+ t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
@@ -1783,6 +1783,8 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
/* set res now, will see skbs right after nf_register_net_hooks */
WRITE_ONCE(*res, new_table);
+ if (!ops)
+ return 0;
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
@@ -1800,7 +1802,8 @@ out_free:
void ipt_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ops)
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
__ipt_unregister_table(net, table);
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index a03e4e7ef5f9..ce1512b02cb2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
mr = par->targinfo;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0f7255cc65ee..a317445448bf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -33,75 +33,63 @@ static const struct xt_table nf_nat_ipv4_table = {
static unsigned int iptable_nat_do_chain(void *priv,
struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
-{
- return ipt_do_table(skb, state, state->net->ipv4.nat_table);
-}
-
-static unsigned int iptable_nat_ipv4_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_out(void *priv,
- struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_local_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
+ return ipt_do_table(skb, state, state->net->ipv4.nat_table);
}
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
- /* Before packet filtering, change destination */
{
- .hook = iptable_nat_ipv4_in,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = iptable_nat_ipv4_out,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
- /* Before packet filtering, change destination */
{
- .hook = iptable_nat_ipv4_local_fn,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = iptable_nat_ipv4_fn,
+ .hook = iptable_nat_do_chain,
.pf = NFPROTO_IPV4,
- .nat_hook = true,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
+/* Register every nf_nat_ipv4_ops entry through
+ * nf_nat_l3proto_ipv4_register_fn(). If one registration fails, the entries
+ * registered so far are unregistered in reverse order and the error is
+ * returned; otherwise returns 0.
+ */
+static int ipt_nat_register_lookups(struct net *net)
+{
+	int n, err;
+
+	for (n = 0; n < ARRAY_SIZE(nf_nat_ipv4_ops); n++) {
+		err = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[n]);
+		if (err)
+			goto unwind;
+	}
+
+	return 0;
+
+unwind:
+	/* n indexes the entry that failed; undo entries n-1 down to 0 */
+	while (n > 0) {
+		n--;
+		nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[n]);
+	}
+
+	return err;
+}
+
+/* Unregister every nf_nat_ipv4_ops entry, walking the array front to back */
+static void ipt_nat_unregister_lookups(struct net *net)
+{
+	const struct nf_hook_ops *ops = nf_nat_ipv4_ops;
+	const struct nf_hook_ops *end = ops + ARRAY_SIZE(nf_nat_ipv4_ops);
+
+	for (; ops < end; ops++)
+		nf_nat_l3proto_ipv4_unregister_fn(net, ops);
+}
+
static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
@@ -114,7 +102,18 @@ static int __net_init iptable_nat_table_init(struct net *net)
if (repl == NULL)
return -ENOMEM;
ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
- nf_nat_ipv4_ops, &net->ipv4.nat_table);
+ NULL, &net->ipv4.nat_table);
+ if (ret < 0) {
+ kfree(repl);
+ return ret;
+ }
+
+ ret = ipt_nat_register_lookups(net);
+ if (ret < 0) {
+ ipt_unregister_table(net, net->ipv4.nat_table, NULL);
+ net->ipv4.nat_table = NULL;
+ }
+
kfree(repl);
return ret;
}
@@ -123,7 +122,8 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
{
if (!net->ipv4.nat_table)
return;
- ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+ ipt_nat_unregister_lookups(net);
+ ipt_unregister_table(net, net->ipv4.nat_table, NULL);
net->ipv4.nat_table = NULL;
}
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 0cd46bffa469..e1e56d7123d2 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,265 +2,12 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
-}
-
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace4(&udph->check, skb, addr,
- new_addr, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
-{
- switch (iph->protocol) {
- case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- default:
- return -1;
- }
- csum_replace4(&iph->check, addr, new_addr);
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- default:
- return -1;
- }
- csum_replace4(&iph->check, addr, new_addr);
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
-{
- struct iphdr *iph = ip_hdr(skb);
- unsigned int thoff = iph->ihl * 4;
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
-}
-
-static bool ip_has_options(unsigned int thoff)
-{
- return thoff != sizeof(struct iphdr);
-}
-
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
-{
- struct flow_ports *ports;
- unsigned int thoff;
- struct iphdr *iph;
-
- if (!pskb_may_pull(skb, sizeof(*iph)))
- return -1;
-
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
-
- if (ip_is_fragment(iph) ||
- unlikely(ip_has_options(thoff)))
- return -1;
-
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- return -1;
-
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
- return -1;
-
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
- tuple->src_v4.s_addr = iph->saddr;
- tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
- tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
- tuple->iifidx = dev->ifindex;
-
- return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
- if (skb->len <= mtu)
- return false;
-
- if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
- return false;
-
- if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
- return false;
-
- return true;
-}
-
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
- u32 mtu;
-
- mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (__nf_flow_exceeds_mtu(skb, mtu))
- return true;
-
- return false;
-}
-
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- const struct rtable *rt;
- struct iphdr *iph;
- __be32 nexthop;
-
- if (skb->protocol != htons(ETH_P_IP))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
-
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
- rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
-
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ip(flow, skb, dir) < 0)
- return NF_DROP;
-
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- iph = ip_hdr(skb);
- ip_decrease_ttl(iph);
-
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-
- return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ac8342dcb55e..4e6b53ab6c33 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
static void ip_nat_q931_expect(struct nf_conn *new,
struct nf_conntrack_expect *this)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
nf_nat_follow_master(new, this);
@@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
static void ip_nat_callforwarding_expect(struct nf_conn *new,
struct nf_conntrack_expect *this)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(new->status & IPS_NAT_DONE_MASK);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f7ff6a364d7b..6115bf1ff6f0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
#endif /* CONFIG_XFRM */
static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
@@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
if (tb[CTA_NAT_V4_MINIP]) {
range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
@@ -241,34 +241,18 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-unsigned int
+static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- /* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
ct = nf_ct_get(skb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- * have dropped it. Hence it's the user's responsibilty to
- * packet filter it out, or implement conntrack/NAT for that
- * protocol. 8) --RR
- */
if (!ct)
return NF_ACCEPT;
- nat = nfct_nat(ct);
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
+ if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
state->hook))
@@ -276,78 +260,30 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
else
return NF_ACCEPT;
}
- /* Only ICMPs can be IP_CT_IS_REPLY: */
- /* fall through */
- case IP_CT_NEW:
- /* Seen it before? This can happen for loopback, retrans,
- * or local packets.
- */
- if (!nf_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- ret = do_chain(priv, skb, state, ct);
- if (ret != NF_ACCEPT)
- return ret;
-
- if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
- break;
-
- ret = nf_nat_alloc_null_binding(ct, state->hook);
- if (ret != NF_ACCEPT)
- return ret;
- } else {
- pr_debug("Already setup manip %s for ct %p\n",
- maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat,
- state->out))
- goto oif_changed;
- }
- break;
-
- default:
- /* ESTABLISHED */
- WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
- goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, state->hook, skb);
-
-oif_changed:
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_DROP;
+ return nf_nat_inet_fn(priv, skb, state);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
-unsigned int
+static unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
daddr != ip_hdr(skb)->daddr)
skb_dst_drop(skb);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
-unsigned int
+static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
@@ -356,7 +292,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
#endif
unsigned int ret;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -376,22 +312,17 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
#endif
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
-unsigned int
+static unsigned int
nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
int err;
- ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv4_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -415,7 +346,49 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+ /* PRE_ROUTING, NAT_DST priority: before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* POST_ROUTING, NAT_SRC priority: after packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_out,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* LOCAL_OUT, NAT_DST priority: before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_local_fn,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* LOCAL_IN, NAT_SRC priority: after packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_fn,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+/* Register @ops in @net by forwarding to nf_nat_register_fn() together with
+ * the nf_nat_ipv4_ops table and its element count; returns that call's result
+ */
+int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	int ret;
+
+	ret = nf_nat_register_fn(net, ops, nf_nat_ipv4_ops,
+				 ARRAY_SIZE(nf_nat_ipv4_ops));
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
+
+/* Undo nf_nat_l3proto_ipv4_register_fn() for @ops in @net by forwarding to
+ * nf_nat_unregister_fn() with the element count of nf_nat_ipv4_ops
+ */
+void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	nf_nat_unregister_fn(net, ops,
+			     ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
static int __init nf_nat_l3proto_ipv4_init(void)
{
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index 0c366aad89cb..ad3aeff152ed 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -7,7 +7,6 @@
*/
#include <linux/types.h>
-#include <linux/module.h>
#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <linux/ip.h>
@@ -24,13 +23,13 @@
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
const struct net_device *out)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
const struct rtable *rt;
__be32 newsrc, nh;
@@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void)
unregister_inetaddr_notifier(&masq_inet_notifier);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..5d259a12e25f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
struct nf_conn_nat *nat;
nat = nf_ct_nat_ext_add(ct);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index edf05002d674..00fda6331ce5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
static void
gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 7b98baa13ede..6d7cf1d79baf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
static void
icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 000000000000..805e83ec3ad9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/inetdevice.h>
+
+/*
+ * Decide what to do with a packet that matched the TIME_WAIT socket @sk.
+ *
+ * If the TCP header cannot be read, @sk is released with inet_twsk_put()
+ * and NULL is returned.  If the packet is a plain SYN (no RST, ACK or FIN)
+ * and a listener is found for it, the TIME_WAIT socket is handed to
+ * inet_twsk_deschedule_put() and the listener is returned instead.
+ * In every other case @sk is returned unchanged.
+ *
+ * @laddr / @lport override the packet's destination address / port for the
+ * listener lookup when they are non-zero.
+ */
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+			 __be32 laddr, __be16 lport, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr tcp_buf, *th;
+	struct sock *listener;
+
+	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(tcp_buf), &tcp_buf);
+	if (!th) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	/* Anything other than a bare SYN stays with the TIME_WAIT socket. */
+	if (!th->syn || th->rst || th->ack || th->fin)
+		return sk;
+
+	/* SYN to a TIME_WAIT socket, we'd rather redirect it
+	 * to a listener socket if there's one */
+	listener = nf_tproxy_get_sock_v4(net, skb, th, iph->protocol,
+					 iph->saddr,
+					 laddr ? laddr : iph->daddr,
+					 th->source,
+					 lport ? lport : th->dest,
+					 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+	if (!listener)
+		return sk;
+
+	inet_twsk_deschedule_put(inet_twsk(sk));
+	return listener;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
+
+/*
+ * Choose the local IPv4 address a packet should be redirected to.
+ *
+ * Order of preference:
+ *   1. @user_laddr, when non-zero;
+ *   2. the first primary address configured on skb->dev;
+ *   3. @daddr, when the device has no IPv4 configuration or no primary
+ *      address.
+ *
+ * All addresses are in network byte order.
+ */
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr = 0;
+
+	if (user_laddr)
+		return user_laddr;
+
+	indev = __in_dev_get_rcu(skb->dev);
+	/* No in_device on this interface: there is no address list to walk,
+	 * so fall back to the packet's own destination address.
+	 */
+	if (!indev)
+		return daddr;
+
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+
+	return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
+
+/*
+ * Find the socket matching the given IPv4 4-tuple for transparent proxying.
+ *
+ * @hp is only used for the TCP listener lookup, where it is read as the
+ * packet's TCP header to compute the data offset passed to
+ * inet_lookup_listener().
+ * @in supplies the interface index used by every lookup.
+ * @lookup_type selects between listener and established sockets.
+ *
+ * Returns the socket found, or NULL when nothing matches, when the match
+ * is of the wrong kind for @lookup_type (UDP), or when @protocol is
+ * neither TCP nor UDP.  An unknown @lookup_type for TCP hits BUG().
+ */
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+ const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+ struct tcphdr *tcph;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NF_TPROXY_LOOKUP_LISTENER:
+ tcph = hp;
+ sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+ ip_hdrlen(skb) +
+ __tcp_hdrlen(tcph),
+ saddr, sport,
+ daddr, dport,
+ in->ifindex, 0);
+
+ /* The listener is only usable if a reference can still be taken. */
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NF_TPROXY_LOOKUP_ESTABLISHED:
+ sk = inet_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ /* Reject the match when it is the wrong kind for the
+ * requested lookup: an "established" lookup needs a
+ * connected, non-wildcard socket; a "listener" lookup
+ * needs an unconnected one.
+ */
+ if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
+ (!connected || wildcard)) ||
+ (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ /* Only TCP and UDP are handled here. */
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+ protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5464a3f253b..a3c4ea303e3e 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -27,9 +27,8 @@
#include <net/ip.h>
static unsigned int nft_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
@@ -39,42 +38,14 @@ static unsigned int nft_nat_do_chain(void *priv,
return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv4_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_out(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_local_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static int nft_nat_ipv4_init(struct nft_ctx *ctx)
+static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
{
- return nf_ct_netns_get(ctx->net, ctx->family);
+ return nf_nat_l3proto_ipv4_register_fn(net, ops);
}
-static void nft_nat_ipv4_free(struct nft_ctx *ctx)
+static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
{
- nf_ct_netns_put(ctx->net, ctx->family);
+ nf_nat_l3proto_ipv4_unregister_fn(net, ops);
}
static const struct nft_chain_type nft_chain_nat_ipv4 = {
@@ -87,13 +58,13 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
- [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
- [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
- [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
},
- .init = nft_nat_ipv4_init,
- .free = nft_nat_ipv4_free,
+ .ops_register = nft_nat_ipv4_reg,
+ .ops_unregister = nft_nat_ipv4_unreg,
};
static int __init nft_chain_nat_init(void)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f18677277119..f1193e1e928a 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..f86bb4f06609
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+/*
+ * Read the u8 protocol number carried by @attr into *@ip_proto and check
+ * that it is one of TCP, UDP or ICMP.
+ *
+ * Returns 0 for a supported protocol.  For anything else an extack message
+ * is set and -EOPNOTSUPP is returned; *@ip_proto still holds the value that
+ * was read.
+ */
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack)
+{
+	const u8 proto = nla_get_u8(attr);
+
+	*ip_proto = proto;
+
+	if (proto == IPPROTO_TCP ||
+	    proto == IPPROTO_UDP ||
+	    proto == IPPROTO_ICMP)
+		return 0;
+
+	NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 573e43c8ed87..77350c1256ce 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -284,6 +284,9 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+ SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+ SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 75fb8864be67..bf4e4adc2d00 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1341,6 +1341,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
return NULL;
}
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+/*
+ * Compute the MTU for destination @daddr from FIB lookup result @res.
+ *
+ * The first non-zero candidate wins: the route's own MTU (only consulted
+ * when ip_fwd_use_pmtu is set or the MTU metric is locked), then the PMTU
+ * of an unexpired nexthop exception, then the egress device MTU capped at
+ * IP_MAX_MTU.  The lwtunnel headroom of the nexthop is subtracted from the
+ * chosen value before it is returned.
+ */
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+	struct fib_info *fi = res->fi;
+	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
+	struct net_device *dev = nh->nh_dev;
+	struct fib_nh_exception *exc;
+	u32 mtu = 0;
+
+	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+	    (fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)))
+		mtu = fi->fib_mtu;
+
+	if (unlikely(mtu))
+		goto subtract_headroom;
+
+	/* Ignore exceptions whose lifetime has already run out. */
+	exc = find_exception(nh, daddr);
+	if (exc && !time_after_eq(jiffies, exc->fnhe_expires))
+		mtu = exc->fnhe_pmtu;
+
+	if (likely(!mtu))
+		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+subtract_headroom:
+	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+}
+
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr, const bool do_cache)
{
@@ -2563,11 +2594,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
- struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+ struct sk_buff *skb, u32 portid, u32 seq)
{
- struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
struct nlmsghdr *nlh;
unsigned long expires = 0;
@@ -2663,7 +2693,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
}
} else
#endif
- if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
goto nla_put_failure;
}
@@ -2678,43 +2708,93 @@ nla_put_failure:
return -EMSGSIZE;
}
+/*
+ * Build the dummy packet that inet_rtm_getroute() feeds to the route lookup.
+ *
+ * The skb holds a minimal IPv4 header (version, ihl, frag_off, protocol,
+ * source and destination address) followed by a bare transport header
+ * chosen by @ip_proto:
+ *   IPPROTO_UDP  - ports, length and a zero checksum
+ *   IPPROTO_TCP  - ports, data offset, RST flag and checksum
+ *   IPPROTO_ICMP - an echo request
+ * Any other @ip_proto gets the IPv4 header only.
+ *
+ * @src, @dst, @sport and @dport are in network byte order.
+ *
+ * Returns the new skb, or NULL when allocation fails.
+ */
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Reserve room for dummy headers, this skb can pass
+	 * through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		/* The UDP length field is big-endian on the wire. */
+		udph->len = htons(sizeof(struct udphdr));
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source = sport;
+		tcph->dest = dport;
+		tcph->doff = sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+		break;
+	}
+	}
+
+	return skb;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
- struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
+ u32 table_id = RT_TABLE_MAIN;
+ __be16 sport = 0, dport = 0;
struct fib_result res = {};
+ u8 ip_proto = IPPROTO_UDP;
struct rtable *rt = NULL;
+ struct sk_buff *skb;
+ struct rtmsg *rtm;
struct flowi4 fl4;
__be32 dst = 0;
__be32 src = 0;
+ kuid_t uid;
u32 iif;
int err;
int mark;
- struct sk_buff *skb;
- u32 table_id = RT_TABLE_MAIN;
- kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
extack);
if (err < 0)
- goto errout;
+ return err;
rtm = nlmsg_data(nlh);
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb) {
- err = -ENOBUFS;
- goto errout;
- }
-
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb_reset_mac_header(skb);
- skb_reset_network_header(skb);
-
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2724,14 +2804,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
else
uid = (iif ? INVALID_UID : current_uid());
- /* Bugfix: need to give ip_route_input enough of an IP header to
- * not gag.
- */
- ip_hdr(skb)->protocol = IPPROTO_UDP;
- ip_hdr(skb)->saddr = src;
- ip_hdr(skb)->daddr = dst;
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &ip_proto, extack);
+ if (err)
+ return err;
+ }
- skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+ if (tb[RTA_SPORT])
+ sport = nla_get_be16(tb[RTA_SPORT]);
+
+ if (tb[RTA_DPORT])
+ dport = nla_get_be16(tb[RTA_DPORT]);
+
+ skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+ if (!skb)
+ return -ENOBUFS;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2740,6 +2828,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
+ if (sport)
+ fl4.fl4_sport = sport;
+ if (dport)
+ fl4.fl4_dport = dport;
+ fl4.flowi4_proto = ip_proto;
rcu_read_lock();
@@ -2749,10 +2842,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
err = -ENODEV;
- goto errout_free;
+ goto errout_rcu;
}
- skb->protocol = htons(ETH_P_IP);
+ fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2772,7 +2865,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
if (err)
- goto errout_free;
+ goto errout_rcu;
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
@@ -2780,34 +2873,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = res.table ? res.table->tb_id : 0;
+ /* reset skb for netlink reply msg */
+ skb_trim(skb, 0);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_header(skb);
+
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
- goto errout_free;
+ goto errout_rcu;
}
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);
} else {
- err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+ err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
}
if (err < 0)
- goto errout_free;
+ goto errout_rcu;
rcu_read_unlock();
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
- return err;
errout_free:
+ return err;
+errout_rcu:
rcu_read_unlock();
kfree_skb(skb);
- goto errout;
+ goto errout_free;
}
void ip_rt_multicast_event(struct in_device *in_dev)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b195bac8ac0..d06247ba08b2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -30,6 +30,7 @@
static int zero;
static int one = 1;
+static int two = 2;
static int four = 4;
static int thousand = 1000;
static int gso_max_segs = GSO_MAX_SEGS;
@@ -46,6 +47,7 @@ static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int comp_sack_nr_max = 255;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -844,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = {
.data = &init_net.ipv4.sysctl_tcp_tw_reuse,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "tcp_max_tw_buckets",
@@ -1152,6 +1156,22 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &one,
},
{
+ .procname = "tcp_comp_sack_delay_ns",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_comp_sack_nr",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &comp_sack_nr_max,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dec47e6789e7..2741953adaba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1691,6 +1691,139 @@ int tcp_peek_len(struct socket *sock)
}
EXPORT_SYMBOL(tcp_peek_len);
+/*
+ * Apply an SO_RCVLOWAT value to a TCP socket and grow sk_rcvbuf so the
+ * low-water mark can actually be reached.
+ *
+ * A @val of 0 is stored as 1.  The receive buffer is left alone when the
+ * user has locked it (SOCK_RCVBUF_LOCK); otherwise it is raised to twice
+ * @val, capped at tcp_rmem[2], if that is larger than the current size,
+ * and window_clamp is recomputed from the new size.
+ *
+ * Always returns 0.
+ */
+int tcp_set_rcvlowat(struct sock *sk, int val)
+{
+	int space;
+
+	sk->sk_rcvlowat = val ? : 1;
+
+	/* The new mark may already be satisfied: signal EPOLLIN if so. */
+	tcp_data_ready(sk);
+
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+		return 0;
+
+	/* User supplied value: doubling it may go negative, clamp then. */
+	space = val << 1;
+	if (space < 0)
+		space = INT_MAX;
+
+	space = min(space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+	if (space <= sk->sk_rcvbuf)
+		return 0;
+
+	sk->sk_rcvbuf = space;
+	tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, space);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_set_rcvlowat);
+
+#ifdef CONFIG_MMU
+/* Empty ops table: its address is what marks a VMA as set up by tcp_mmap(). */
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
+/*
+ * mmap() handler for TCP sockets.
+ *
+ * Writable or executable mappings are refused with -EPERM.  Otherwise the
+ * VMA loses VM_MAYWRITE and VM_MAYEXEC, gains VM_MIXEDMAP, and has its
+ * vm_ops pointed at tcp_vm_ops.  Returns 0 on success.
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{
+	unsigned long flags = vma->vm_flags;
+
+	if (flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+
+	flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	flags |= VM_MIXEDMAP;
+	vma->vm_flags = flags;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+/*
+ * Map whole pages of received data into the caller's address space.
+ *
+ * On entry zc->address / zc->length describe the target range; it must be
+ * page aligned and lie inside a VMA whose vm_ops is tcp_vm_ops.  On return
+ * zc->length is the number of bytes actually mapped and zc->recv_skip_hint
+ * is what remains of the "bytes left in the current skb" counter kept by
+ * the loop (forced to 0 when the whole request was mapped).
+ *
+ * Returns 0 when at least one page was mapped; otherwise -EINVAL,
+ * -ENOTCONN, -EIO, the vm_insert_page() error, or 0.
+ */
+static int tcp_zerocopy_receive(struct sock *sk,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long address = (unsigned long)zc->address;
+ const skb_frag_t *frags = NULL;
+ u32 length = 0, seq, offset;
+ struct vm_area_struct *vma;
+ struct sk_buff *skb = NULL;
+ struct tcp_sock *tp;
+ int ret;
+
+ /* Must be page aligned and survive the cast to unsigned long. */
+ if (address & (PAGE_SIZE - 1) || address != zc->address)
+ return -EINVAL;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ sock_rps_record_flow(sk);
+
+ down_read(&current->mm->mmap_sem);
+
+ ret = -EINVAL;
+ vma = find_vma(current->mm, address);
+ /* tp is not set yet on this exit; that is fine because length is
+ * still 0 and the code after "out:" only touches tp when length != 0.
+ */
+ if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
+ goto out;
+ /* Clamp the request to the VMA, to the queued data, and to whole pages. */
+ zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
+ tp = tcp_sk(sk);
+ seq = tp->copied_seq;
+ zc->length = min_t(u32, zc->length, tcp_inq(sk));
+ zc->length &= ~(PAGE_SIZE - 1);
+
+ zap_page_range(vma, address, zc->length);
+
+ zc->recv_skip_hint = 0;
+ ret = 0;
+ while (length + PAGE_SIZE <= zc->length) {
+ /* Less than a page left in the current skb: move to the next one. */
+ if (zc->recv_skip_hint < PAGE_SIZE) {
+ /* NOTE(review): skb->next and the tcp_recv_skb() result are
+ * dereferenced without a NULL check - presumably the tcp_inq()
+ * clamp above guarantees an skb is present; confirm.
+ */
+ if (skb) {
+ skb = skb->next;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, seq, &offset);
+ }
+
+ zc->recv_skip_hint = skb->len - offset;
+ /* Make offset relative to the paged part of the skb. */
+ offset -= skb_headlen(skb);
+ if ((int)offset < 0 || skb_has_frag_list(skb))
+ break;
+ frags = skb_shinfo(skb)->frags;
+ /* Skip the fragments that lie entirely before offset; stop if
+ * offset falls inside a fragment.
+ */
+ while (offset) {
+ if (frags->size > offset)
+ goto out;
+ offset -= frags->size;
+ frags++;
+ }
+ }
+ /* Only a full page starting at offset 0 is inserted. */
+ if (frags->size != PAGE_SIZE || frags->page_offset)
+ break;
+ ret = vm_insert_page(vma, address + length,
+ skb_frag_page(frags));
+ if (ret)
+ break;
+ length += PAGE_SIZE;
+ seq += PAGE_SIZE;
+ zc->recv_skip_hint -= PAGE_SIZE;
+ frags++;
+ }
+out:
+ up_read(&current->mm->mmap_sem);
+ if (length) {
+ /* Some pages were mapped: advance the read position and report
+ * success even if a later page failed.
+ */
+ tp->copied_seq = seq;
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, length);
+ ret = 0;
+ if (length == zc->length)
+ zc->recv_skip_hint = 0;
+ } else {
+ if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+ ret = -EIO;
+ }
+ zc->length = length;
+ return ret;
+}
+#endif
+
static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
{
@@ -1746,6 +1879,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
}
}
+/*
+ * Return rcv_nxt - copied_seq for @sk.
+ *
+ * A first attempt is made without the socket lock.  If that gives a
+ * negative result, or copied_seq changed while the two fields were being
+ * read, the difference is computed again under lock_sock().
+ */
+static int tcp_inq_hint(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied_seq = READ_ONCE(tp->copied_seq);
+	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+	int inq = rcv_nxt - copied_seq;
+
+	/* Fast path: the two reads were consistent with each other. */
+	if (likely(inq >= 0 && copied_seq == READ_ONCE(tp->copied_seq)))
+		return inq;
+
+	lock_sock(sk);
+	inq = tp->rcv_nxt - tp->copied_seq;
+	release_sock(sk);
+
+	return inq;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -1762,13 +1911,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
u32 peek_seq;
u32 *seq;
unsigned long used;
- int err;
+ int err, inq;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
struct scm_timestamping tss;
bool has_tss = false;
+ bool has_cmsg;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -1783,6 +1933,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN)
goto out;
+ has_cmsg = tp->recvmsg_inq;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -1969,6 +2120,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
has_tss = true;
+ has_cmsg = true;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
@@ -1988,13 +2140,20 @@ skip_copy:
* on connected socket. I was just happy when found this 8) --ANK
*/
- if (has_tss)
- tcp_recv_timestamp(msg, sk, &tss);
-
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
+
+ if (has_cmsg) {
+ if (has_tss)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (tp->recvmsg_inq) {
+ inq = tcp_inq_hint(sk);
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
+ }
+
return copied;
out:
@@ -2411,6 +2570,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
+ tp->delivered_ce = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -2424,6 +2584,7 @@ int tcp_disconnect(struct sock *sk, int flags)
dst_release(sk->sk_rx_dst);
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
+ tp->compressed_ack = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2862,6 +3023,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
+ case TCP_INQ:
+ if (val > 1 || val < 0)
+ err = -EINVAL;
+ else
+ tp->recvmsg_inq = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3020,6 +3187,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
rate64 = tcp_compute_delivery_rate(tp);
if (rate64)
info->tcpi_delivery_rate = rate64;
+ info->tcpi_delivered = tp->delivered;
+ info->tcpi_delivered_ce = tp->delivered_ce;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3033,7 +3202,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
u32 rate;
stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 5 * nla_total_size(sizeof(u32)) +
+ 7 * nla_total_size(sizeof(u32)) +
3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3064,9 +3233,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+ nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+ nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
return stats;
}
@@ -3282,6 +3454,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
+ case TCP_INQ:
+ val = tp->recvmsg_inq;
+ break;
case TCP_SAVE_SYN:
val = tp->save_syn;
break;
@@ -3318,6 +3493,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
}
return 0;
}
+#ifdef CONFIG_MMU
+ case TCP_ZEROCOPY_RECEIVE: {
+ struct tcp_zerocopy_receive zc;
+ int err;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len != sizeof(zc))
+ return -EINVAL;
+ if (copy_from_user(&zc, optval, len))
+ return -EFAULT;
+ lock_sock(sk);
+ err = tcp_zerocopy_receive(sk, &zc);
+ release_sock(sk);
+ if (!err && copy_to_user(optval, &zc, len))
+ err = -EFAULT;
+ return err;
+ }
+#endif
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e51c644484dc..355d3dffd021 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+
+/*
+ * Install @cad as the icsk_clean_acked callback of @icsk and take one
+ * count on the clean_acked_data_enabled static key.
+ * The callback is stored before the key is incremented.
+ */
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+ void (*cad)(struct sock *sk, u32 ack_seq))
+{
+ icsk->icsk_clean_acked = cad;
+ static_branch_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+/*
+ * Drop one count on the clean_acked_data_enabled static key and clear the
+ * icsk_clean_acked callback of @icsk.
+ * The key is decremented before the callback pointer is cleared.
+ */
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+ static_branch_dec(&clean_acked_data_enabled);
+ icsk->icsk_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -184,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
}
}
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
+ quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick)
- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_incr_quickack(sk);
+
+ tcp_incr_quickack(sk, max_quickacks);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
@@ -233,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -242,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 1);
break;
case INET_ECN_CE:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 1);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
- if (tp->ecn_flags & TCP_ECN_OK)
- __tcp_ecn_check_ce(tp, skb);
+ if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(sk, skb);
}
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -582,6 +605,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
u32 copied;
int time;
+ trace_tcp_rcv_space_adjust(sk);
+
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
@@ -665,7 +690,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
@@ -681,13 +706,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -1896,19 +1921,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
tp->undo_retrans = tp->retrans_out ? : -1;
}
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+/* True when the tcp_recovery sysctl of @sk's netns has TCP_RACK_LOSS_DETECTION set. */
+static bool tcp_is_rack(const struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+
+	return net->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
+/*
+ * Walk the retransmit queue of @sk from its head and mark packets lost.
+ *
+ * Reneging is detected by the head of the queue carrying
+ * TCPCB_SACKED_ACKED.  In that case the SACKED_ACKED bit is cleared on
+ * every skb and every skb is marked lost.  Otherwise, when RACK is enabled,
+ * skbs other than the head for which tcp_rack_skb_timeout() is still
+ * positive are skipped.
+ */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb, *head;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ head = tcp_rtx_queue_head(sk);
+ is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ /* Mark SACK reneging until we recover from this loss event. */
+ tp->is_sack_reneg = 1;
+ } else if (tcp_is_reno(tp)) {
+ tcp_reset_reno_sack(tp);
+ }
+
+ skb = head;
+ skb_rbtree_walk_from(skb) {
+ /* On reneging, drop the SACKED_ACKED bit and mark the skb lost. */
+ if (is_reneg)
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ else if (tcp_is_rack(sk) && skb != head &&
+ tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ continue; /* Don't mark recently sent ones lost yet */
+ tcp_mark_skb_lost(sk, skb);
+ }
+ tcp_verify_left_out(tp);
+ tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
- bool is_reneg; /* is receiver reneging on SACKs? */
- bool mark_lost;
+
+ tcp_timeout_mark_lost(sk);
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1920,40 +1980,10 @@ void tcp_enter_loss(struct sock *sk)
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
- tp->snd_cwnd = 1;
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
- tp->retrans_out = 0;
- tp->lost_out = 0;
-
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- skb = tcp_rtx_queue_head(sk);
- is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
- if (is_reneg) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
- tp->sacked_out = 0;
- /* Mark SACK reneging until we recover from this loss event. */
- tp->is_sack_reneg = 1;
- }
- tcp_clear_all_retrans_hints(tp);
-
- skb_rbtree_walk_from(skb) {
- mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
- is_reneg);
- if (mark_lost)
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (mark_lost) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- }
- tcp_verify_left_out(tp);
-
/* Timeout in disordered state after receiving substantial DUPACKs
* suggests that the degree of reordering is over-estimated.
*/
@@ -2120,7 +2150,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
return true;
/* Not-A-Trick#2 : Classic rule... */
- if (tcp_dupack_heuristics(tp) > tp->reordering)
+ if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
return true;
return false;
@@ -2197,9 +2227,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1, 1);
- } else {
+ if (tcp_is_sack(tp)) {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2697,12 +2725,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
return false;
}
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- /* Use RACK to detect loss */
- if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ if (tcp_rtx_queue_empty(sk))
+ return;
+
+ if (unlikely(tcp_is_reno(tp))) {
+ tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+ } else if (tcp_is_rack(sk)) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk);
@@ -2798,11 +2830,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_try_keep_open(sk);
return;
}
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
@@ -2819,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -2841,7 +2873,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
fast_rexmit = 1;
}
- if (do_lost)
+ if (!tcp_is_rack(sk) && do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
@@ -3496,6 +3528,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
tcp_xmit_retransmit_queue(sk);
}
+/* Compute how many packets the current ACK newly acked or sacked, i.e. the
+ * growth of tp->delivered since @prior_delivered, and account for it:
+ * the count is added to the TCPDELIVERED MIB counter and, when @flag carries
+ * FLAG_ECE, also to tp->delivered_ce and the TCPDELIVEREDCE MIB counter.
+ *
+ * Returns the number of newly delivered packets.
+ */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct net *net = sock_net(sk);
+	const u32 newly = tp->delivered - prior_delivered;
+
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, newly);
+	if (!(flag & FLAG_ECE))
+		return newly;
+
+	/* ECE was echoed on this ACK: charge the same amount as CE-marked. */
+	tp->delivered_ce += newly;
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, newly);
+	return newly;
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3542,6 +3590,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ if (static_branch_unlikely(&clean_acked_data_enabled))
+ if (icsk->icsk_clean_acked)
+ icsk->icsk_clean_acked(sk, ack);
+#endif
}
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
@@ -3619,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3683,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
- if (flag & FLAG_DSACKING_ACK)
+ if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
+ }
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3655,6 +3711,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
&rexmit);
+ tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4126,7 +4183,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4196,6 +4253,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
+ if (tp->compressed_ack)
+ tcp_send_ack(sk);
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
@@ -4377,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
u32 seq, end_seq;
bool fragstolen;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4573,6 +4632,17 @@ err:
}
+/* Invoke sk->sk_data_ready() only when the reader has something to act on:
+ * either the unread byte count (rcv_nxt - copied_seq) has reached
+ * sk->sk_rcvlowat, or the socket is flagged SOCK_DONE.
+ */
+void tcp_data_ready(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int avail = tp->rcv_nxt - tp->copied_seq;
+
+	if (avail >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE))
+		sk->sk_data_ready(sk);
+}
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4630,7 +4700,7 @@ queue_and_out:
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ tcp_data_ready(sk);
return;
}
@@ -4640,7 +4710,7 @@ queue_and_out:
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
@@ -4651,8 +4721,6 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(sk);
-
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -5019,23 +5087,48 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long rtt, delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
- * (tcp_recvmsg() will send ACK otherwise). Or...
+ * (tcp_recvmsg() will send ACK otherwise).
+ * If the application uses SO_RCVLOWAT, we want to send an ACK now if
+ * we have not received enough bytes to satisfy the condition.
*/
- __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(sk) ||
- /* We have out of order data. */
- (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
- /* Then ack it now */
+ tcp_in_quickack_mode(sk)) {
+send_now:
tcp_send_ack(sk);
- } else {
- /* Else, send delayed ack. */
+ return;
+ }
+
+ if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
+ return;
}
+
+ if (!tcp_is_sack(tp) ||
+ tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
+ goto send_now;
+ tp->compressed_ack++;
+
+ if (hrtimer_is_queued(&tp->compressed_ack_timer))
+ return;
+
+ /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+
+ rtt = tp->rcv_rtt_est.rtt_us;
+ if (tp->srtt_us && tp->srtt_us < rtt)
+ rtt = tp->srtt_us;
+
+ delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
+ rtt * (NSEC_PER_USEC >> 3)/20);
+ sock_hold(sk);
+ hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5299,11 +5392,11 @@ discard:
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
-void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
- unsigned int len = skb->len;
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int len = skb->len;
/* TCP congestion window tracking */
trace_tcp_probe(sk, skb);
@@ -5428,7 +5521,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk);
+ tcp_data_ready(sk);
return;
}
}
@@ -5550,9 +5643,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return true;
}
tp->syn_data_acked = tp->syn_data;
- if (tp->syn_data_acked)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVE);
+ if (tp->syn_data_acked) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ /* SYN-data is counted as two separate packets in tcp_ack() */
+ if (tp->delivered > 1)
+ --tp->delivered;
+ }
tcp_fastopen_add_skb(sk, synack);
@@ -5698,7 +5794,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5884,6 +5980,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
+ tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c970626b398..fed3f1c66167 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
+ const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
+ int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
+
+ if (reuse == 2) {
+ /* Still does not detect *everything* that goes through
+ * lo, since we require a loopback src or dst address
+ * or direct binding to 'lo' interface.
+ */
+ bool loopback = false;
+ if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+ loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+ (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
+ (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+ (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
+ (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ loopback = true;
+ } else
+#endif
+ {
+ if (ipv4_is_loopback(tw->tw_daddr) ||
+ ipv4_is_loopback(tw->tw_rcv_saddr))
+ loopback = true;
+ }
+ if (!loopback)
+ reuse = 0;
+ }
/* With PAWS, it is safe from the viewpoint
of data integrity. Even without PAWS it is safe provided sequence
@@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
- get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (tp->write_seq == 0)
tp->write_seq = 1;
@@ -621,6 +650,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
struct net *net;
+ struct sock *ctl_sk;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -723,11 +753,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+ if (sk)
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
+ ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -759,6 +794,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
+ struct sock *ctl_sk;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -809,11 +845,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+ if (sk)
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
+ ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
@@ -1474,7 +1515,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_rx_dst = NULL;
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb));
+ tcp_rcv_established(sk, skb);
return 0;
}
@@ -2481,7 +2522,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_orphan_retries = 0;
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
- net->ipv4.sysctl_tcp_tw_reuse = 0;
+ net->ipv4.sysctl_tcp_tw_reuse = 2;
cnt = tcp_hashinfo.ehash_mask + 1;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
@@ -2524,6 +2565,8 @@ static int __net_init tcp_sk_init(struct net *net)
init_net.ipv4.sysctl_tcp_wmem,
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
+ net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+ net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..1dda1341a223 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_sock *inet = inet_sk(sk);
tw->tw_transparent = inet->transparent;
+ tw->tw_mark = sk->sk_mark;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -306,7 +307,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (timeo < rto)
timeo = rto;
- tw->tw_timeout = TCP_TIMEWAIT_LEN;
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07e34f8e309..8e08b409c71e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unlikely(tp->compressed_ack)) {
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack);
+ tp->compressed_ack = 0;
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
+ }
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -229,11 +238,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
}
}
- if (mss > (1 << *rcv_wscale)) {
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- }
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -585,14 +592,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
/* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +728,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->options = 0;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (unlikely(*md5)) {
- opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
if (likely(tp->rx_opt.tstamp_ok)) {
@@ -772,7 +781,7 @@ struct tsq_tasklet {
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
-static void tcp_tsq_handler(struct sock *sk)
+static void tcp_tsq_write(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +798,16 @@ static void tcp_tsq_handler(struct sock *sk)
0, GFP_ATOMIC);
}
}
+
+/* Try to transmit for @sk under the BH socket lock.
+ *
+ * If user context owns the socket, the write is postponed instead: the
+ * TCP_TSQ_DEFERRED bit is set and, when this call is the one that set it,
+ * a socket reference is taken; tcp_release_cb() performs the write and
+ * drops that reference. Otherwise tcp_tsq_write() runs right away.
+ */
+static void tcp_tsq_handler(struct sock *sk)
+{
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	} else {
+		tcp_tsq_write(sk);
+	}
+	bh_unlock_sock(sk);
+}
/*
* One tasklet per cpu tries to send more skbs.
* We run in tasklet context but need to disable irqs when
@@ -816,16 +835,7 @@ static void tcp_tasklet_func(unsigned long data)
smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
- if (!sk->sk_lock.owned &&
- test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk)) {
- clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
- tcp_tsq_handler(sk);
- }
- bh_unlock_sock(sk);
- }
-
+ tcp_tsq_handler(sk);
sk_free(sk);
}
}
@@ -853,9 +863,10 @@ void tcp_release_cb(struct sock *sk)
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & TCPF_TSQ_DEFERRED)
- tcp_tsq_handler(sk);
-
+ if (flags & TCPF_TSQ_DEFERRED) {
+ tcp_tsq_write(sk);
+ __sock_put(sk);
+ }
/* Here begins the tricky part :
* We are called from release_sock() with :
* 1) BH disabled
@@ -929,7 +940,7 @@ void tcp_wfree(struct sk_buff *skb)
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
@@ -948,37 +959,17 @@ out:
sk_free(sk);
}
-/* Note: Called under hard irq.
- * We can not call TCP stack right away.
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
*/
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
struct sock *sk = (struct sock *)tp;
- unsigned long nval, oval;
- for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
- struct tsq_tasklet *tsq;
- bool empty;
-
- if (oval & TSQF_QUEUED)
- break;
-
- nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
- nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
- if (nval != oval)
- continue;
+ tcp_tsq_handler(sk);
+ sock_put(sk);
- if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
- break;
- /* queue this socket to tasklet queue */
- tsq = this_cpu_ptr(&tsq_tasklet);
- empty = list_empty(&tsq->head);
- list_add(&tp->tsq_node, &tsq->head);
- if (empty)
- tasklet_schedule(&tsq->tasklet);
- break;
- }
return HRTIMER_NORESTART;
}
@@ -1011,7 +1002,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
do_div(len_ns, rate);
hrtimer_start(&tcp_sk(sk)->pacing_timer,
ktime_add_ns(ktime_get(), len_ns),
- HRTIMER_MODE_ABS_PINNED);
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
}
static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1078,7 +1070,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
- * which holds one reference to sk_wmem_alloc.
+ * which holds one reference to sk.
*
* TODO: Ideally, in-flight pure ACK packets should not matter here.
* One way to get this would be to set skb->truesize = 2 on them.
@@ -2185,7 +2177,7 @@ static int tcp_mtu_probe(struct sock *sk)
static bool tcp_pacing_check(const struct sock *sk)
{
return tcp_needs_internal_pacing(sk) &&
- hrtimer_active(&tcp_sk(sk)->pacing_timer);
+ hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
}
/* TCP Small Queues :
@@ -2365,8 +2357,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
skb, limit, mss_now, gfp)))
break;
- if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
- clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3a81720ac0c4..71593e4400ab 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,7 +2,7 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
+/* Compute the RACK reordering window for @sk.
+ *
+ * While no reordering has been observed (tp->rack.reord is clear) the window
+ * is zero, i.e. loss detection is aggressive, in two cases:
+ *   - the connection is already in recovery (ca_state >= TCP_CA_Recovery);
+ *   - tp->sacked_out has reached tp->reordering and the
+ *     TCP_RACK_NO_DUPTHRESH bit of the tcp_recovery sysctl is not set.
+ *
+ * Otherwise, to be more reordering resilient, allow a min_rtt/4 settling
+ * delay. min_rtt is used instead of the smoothed RTT because reordering is
+ * often a path property and less related to queuing or delayed ACKs. The
+ * delay is scaled by tp->rack.reo_wnd_steps (raised upon DSACKs) and capped
+ * at the smoothed RTT.
+ */
+static u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 settle_us, srtt_cap_us;
+
+	if (!tp->rack.reord &&
+	    (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery ||
+	     (tp->sacked_out >= tp->reordering &&
+	      !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))))
+		return 0;
+
+	settle_us = (tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps;
+	srtt_cap_us = tp->srtt_us >> 3;
+
+	return min(settle_us, srtt_cap_us);
+}
+
+/* Time left before @skb is deemed lost by RACK: the RACK RTT plus the
+ * reordering window @reo_wnd, minus the time elapsed between the skb's send
+ * timestamp and tp->tcp_mstamp. A result <= 0 means the deadline has passed.
+ */
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	const u32 elapsed_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+						  skb->skb_mstamp);
+	const u32 allowance_us = tp->rack.rtt_us + reo_wnd;
+
+	return allowance_us - elapsed_us;
+}
+
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 min_rtt = tcp_min_rtt(tp);
struct sk_buff *skb, *n;
u32 reo_wnd;
*reo_timeout = 0;
- /* To be more reordering resilient, allow min_rtt/4 settling delay
- * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
- * RTT because reordering is often a path property and less related
- * to queuing or delayed ACKs.
- */
- reo_wnd = 1000;
- if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
- min_rtt != ~0U) {
- reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
- reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
- }
-
+ reo_wnd = tcp_rack_reo_wnd(sk);
list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
/* A packet is lost if it has not been s/acked beyond
* the recent RTT plus the reordering window.
*/
- remaining = tp->rack.rtt_us + reo_wnd -
- tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
if (remaining <= 0) {
- tcp_rack_mark_skb_lost(sk, skb);
+ tcp_mark_skb_lost(sk, skb);
list_del_init(&skb->tcp_tsorted_anchor);
} else {
/* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
tp->rack.reo_wnd_steps = 1;
}
}
+
+/* RFC6582 NewReno loss marking for non-SACK connections.
+ *
+ * The head of the retransmit queue is marked lost (so that it gets
+ * retransmitted) in either of two situations:
+ *   a) fast recovery has not started yet and tp->sacked_out has reached
+ *      tp->reordering (enough DUPACKs to start fast recovery);
+ *   b) fast recovery is in progress and the ACK acknowledged new data
+ *      (@snd_una_advanced).
+ * Nothing is done if the head is already tagged TCPCB_LOST. A head spanning
+ * more than one packet and longer than one MSS is first handed to
+ * tcp_fragment() with a length of one MSS.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool start_recovery, partial_ack;
+	struct sk_buff *skb;
+	u32 mss;
+
+	start_recovery = state < TCP_CA_Recovery &&
+			 tp->sacked_out >= tp->reordering;
+	partial_ack = state == TCP_CA_Recovery && snd_una_advanced;
+	if (!start_recovery && !partial_ack)
+		return;
+
+	skb = tcp_rtx_queue_head(sk);
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+		return;
+
+	mss = tcp_skb_mss(skb);
+	if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+		tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+			     mss, mss, GFP_ATOMIC);
+
+	tcp_skb_mark_lost_uncond_verify(tp, skb);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f7d944855f8e..3b3611729928 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -708,11 +708,36 @@ out:
sock_put(sk);
}
+/* hrtimer callback of tp->compressed_ack_timer.
+ *
+ * Under the BH socket lock: if user context owns the socket, the work is
+ * deferred by setting TCP_DELACK_TIMER_DEFERRED (taking a socket reference
+ * when this call is the one that set the bit); otherwise an ACK is sent if
+ * any compressed ACKs are pending. The socket reference is then dropped
+ * with sock_put() and the timer is not re-armed.
+ */
+static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
+	struct sock *sk = (struct sock *)tp;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+				      &sk->sk_tsq_flags))
+			sock_hold(sk);
+	} else if (tp->compressed_ack) {
+		tcp_send_ack(sk);
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+	return HRTIMER_NORESTART;
+}
+
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED);
+ HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
+
+ hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED_SOFT);
+ tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 675433eb53a8..3365362cac88 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
-#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
- IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
@@ -757,7 +755,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
}
EXPORT_SYMBOL(udp_set_csum);
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +776,27 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
uh->len = htons(len);
uh->check = 0;
+	/* UDP GSO: validate the request, then tag the skb so it is segmented
+	 * into gso_size-byte datagrams further down the stack.
+	 */
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		/* Headers plus one segment of payload must fit the cork's
+		 * fragment size.
+		 */
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+			return -EINVAL;
+		if (sk->sk_no_check_tx)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		/* uh is a pointer to the UDP header: subtract the size of the
+		 * header itself, sizeof(*uh), not the size of the pointer,
+		 * which is a different value on 32-bit builds.
+		 */
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh),
+							 cork->gso_size);
+		goto csum_partial;
+	}
+
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
@@ -786,6 +806,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
@@ -828,7 +849,7 @@ int udp_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &inet->cork.base);
out:
up->len = 0;
@@ -837,10 +858,48 @@ out:
}
EXPORT_SYMBOL(udp_push_pending_frames);
+/* Parse one SOL_UDP control message.
+ *
+ * Only UDP_SEGMENT is understood: its payload must be exactly one __u16,
+ * which is stored in *@gso_size. Returns 0 on success and -EINVAL for an
+ * unknown type or a wrong length.
+ */
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+	if (cmsg->cmsg_type != UDP_SEGMENT)
+		return -EINVAL;
+
+	if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+		return -EINVAL;
+
+	*gso_size = *(__u16 *)CMSG_DATA(cmsg);
+	return 0;
+}
+
+/* Process the SOL_UDP control messages attached to @msg.
+ *
+ * Every SOL_UDP cmsg is handed to __udp_cmsg_send(), which may update
+ * *@gso_size; messages of any other level are left untouched.
+ *
+ * Returns a negative errno if a cmsg is malformed or rejected, 1 if at
+ * least one cmsg of another level was seen (the caller then still has to
+ * run the IP-level cmsg parsing), and 0 otherwise.
+ */
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+	struct cmsghdr *cmsg;
+	bool need_ip = false;
+	int err;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level == SOL_UDP) {
+			err = __udp_cmsg_send(cmsg, gso_size);
+			if (err)
+				return err;
+		} else {
+			need_ip = true;
+		}
+	}
+
+	return need_ip;
+}
+EXPORT_SYMBOL_GPL(udp_cmsg_send);
+
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
struct flowi4 fl4_stack;
struct flowi4 *fl4;
int ulen = len;
@@ -895,8 +954,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/*
* Get and verify the address.
*/
- if (msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (usin) {
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -922,10 +980,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
+ ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
- if (unlikely(err)) {
+ err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+ if (err > 0)
+ err = ip_cmsg_send(sk, msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
@@ -946,6 +1008,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
+ if (cgroup_bpf_enabled && !connected) {
+ err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+ (struct sockaddr *)usin, &ipc.addr);
+ if (err)
+ goto out_free;
+ if (usin) {
+ if (usin->sin_port == 0) {
+ /* BPF program set invalid port. Reject it. */
+ err = -EINVAL;
+ goto out_free;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ }
+ }
+
saddr = ipc.addr;
ipc.addr = faddr = daddr;
@@ -1032,12 +1110,14 @@ back_from_confirm:
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
+ struct inet_cork cork;
+
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
- msg->msg_flags);
+ &cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_send_skb(skb, fl4);
+ err = udp_send_skb(skb, fl4, &cork);
goto out;
}
@@ -1813,10 +1893,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static struct static_key udp_encap_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
- static_key_enable(&udp_encap_needed);
+ static_branch_enable(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_enable);
@@ -1840,7 +1920,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
nf_reset(skb);
- if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -2303,7 +2383,7 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
void (*encap_destroy)(struct sock *sk);
encap_destroy = READ_ONCE(up->encap_destroy);
if (encap_destroy)
@@ -2368,6 +2448,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->no_check6_rx = valbool;
break;
+ case UDP_SEGMENT:
+ if (val < 0 || val > USHRT_MAX)
+ return -EINVAL;
+ up->gso_size = val;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -2458,6 +2544,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->no_check6_rx;
break;
+ case UDP_SEGMENT:
+ val = up->gso_size;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..92dc9e5a7ff3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,102 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+ netdev_features_t features)
+{
+ struct sock *sk = gso_skb->sk;
+ unsigned int sum_truesize = 0;
+ struct sk_buff *segs, *seg;
+ struct udphdr *uh;
+ unsigned int mss;
+ bool copy_dtor;
+ __sum16 check;
+ __be16 newlen;
+
+ mss = skb_shinfo(gso_skb)->gso_size;
+ if (gso_skb->len <= sizeof(*uh) + mss)
+ return ERR_PTR(-EINVAL);
+
+ skb_pull(gso_skb, sizeof(*uh));
+
+ /* clear destructor to avoid skb_segment assigning it to tail */
+ copy_dtor = gso_skb->destructor == sock_wfree;
+ if (copy_dtor)
+ gso_skb->destructor = NULL;
+
+ segs = skb_segment(gso_skb, features);
+ if (unlikely(IS_ERR_OR_NULL(segs))) {
+ if (copy_dtor)
+ gso_skb->destructor = sock_wfree;
+ return segs;
+ }
+
+ /* GSO partial and frag_list segmentation only requires splitting
+ * the frame into an MSS multiple and possibly a remainder, both
+ * cases return a GSO skb. So update the mss now.
+ */
+ if (skb_is_gso(segs))
+ mss *= skb_shinfo(segs)->gso_segs;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+
+ /* compute checksum adjustment based on old length versus new */
+ newlen = htons(sizeof(*uh) + mss);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ for (;;) {
+ if (copy_dtor) {
+ seg->destructor = sock_wfree;
+ seg->sk = sk;
+ sum_truesize += seg->truesize;
+ }
+
+ if (!seg->next)
+ break;
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? :
+ CSUM_MANGLED_0;
+
+ seg = seg->next;
+ uh = udp_hdr(seg);
+ }
+
+ /* last packet can be partial gso_size, account for that in checksum */
+ newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+ seg->data_len);
+ check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+ uh->len = newlen;
+ uh->check = check;
+
+ if (seg->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(seg, ~check);
+ else
+ uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
+ /* update refcount for the packet */
+ if (copy_dtor) {
+ int delta = sum_truesize - gso_skb->truesize;
+
+ /* In some pathological cases, delta can be negative.
+ * We need to either use refcount_add() or refcount_sub_and_test()
+ */
+ if (likely(delta >= 0))
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ else
+ WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+ }
+ return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -203,12 +299,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
}
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp_gso_segment(skb, features);
+
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e4e80cf7e9..0eff75525da1 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC
If unsure, say N.
+config IPV6_SEG6_BPF
+ def_bool y
+ depends on IPV6_SEG6_LWTUNNEL
+ depends on IPV6 = y
+
endif # IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b5ea3379d9b..89019bf59f46 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev,
unsigned long event);
static int addrconf_ifdown(struct net_device *dev, int how);
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
u32 flags, u32 noflags);
@@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
pr_warn("Freeing alive inet6 address %p\n", ifp);
return;
}
- ip6_rt_put(ifp->rt);
kfree_rcu(ifp, rcu);
}
@@ -987,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
/* On success it returns ifp with increased reference count */
static struct inet6_ifaddr *
-ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
- const struct in6_addr *peer_addr, int pfxlen,
- int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
+ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
bool can_block, struct netlink_ext_ack *extack)
{
gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
+ int addr_type = ipv6_addr_type(cfg->pfx);
struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
- struct rt6_info *rt = NULL;
+ struct fib6_info *f6i = NULL;
int err = 0;
- int addr_type = ipv6_addr_type(addr);
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
@@ -1020,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
*/
if (can_block) {
struct in6_validator_info i6vi = {
- .i6vi_addr = *addr,
+ .i6vi_addr = *cfg->pfx,
.i6vi_dev = idev,
.extack = extack,
};
@@ -1037,38 +1034,39 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
goto out;
}
- rt = addrconf_dst_alloc(idev, addr, false);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
+ f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
+ f6i = NULL;
goto out;
}
if (net->ipv6.devconf_all->disable_policy ||
idev->cnf.disable_policy)
- rt->dst.flags |= DST_NOPOLICY;
+ f6i->dst_nopolicy = true;
neigh_parms_data_state_setall(idev->nd_parms);
- ifa->addr = *addr;
- if (peer_addr)
- ifa->peer_addr = *peer_addr;
+ ifa->addr = *cfg->pfx;
+ if (cfg->peer_pfx)
+ ifa->peer_addr = *cfg->peer_pfx;
spin_lock_init(&ifa->lock);
INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
INIT_HLIST_NODE(&ifa->addr_lst);
- ifa->scope = scope;
- ifa->prefix_len = pfxlen;
- ifa->flags = flags;
+ ifa->scope = cfg->scope;
+ ifa->prefix_len = cfg->plen;
+ ifa->rt_priority = cfg->rt_priority;
+ ifa->flags = cfg->ifa_flags;
/* No need to add the TENTATIVE flag for addresses with NODAD */
- if (!(flags & IFA_F_NODAD))
+ if (!(cfg->ifa_flags & IFA_F_NODAD))
ifa->flags |= IFA_F_TENTATIVE;
- ifa->valid_lft = valid_lft;
- ifa->prefered_lft = prefered_lft;
+ ifa->valid_lft = cfg->valid_lft;
+ ifa->prefered_lft = cfg->preferred_lft;
ifa->cstamp = ifa->tstamp = jiffies;
ifa->tokenized = false;
- ifa->rt = rt;
+ ifa->rt = f6i;
ifa->idev = idev;
in6_dev_hold(idev);
@@ -1102,8 +1100,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
if (unlikely(err < 0)) {
- if (rt)
- ip6_rt_put(rt);
+ fib6_info_release(f6i);
+
if (ifa) {
if (ifa->idev)
in6_dev_put(ifa->idev);
@@ -1179,19 +1177,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
static void
cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
{
- struct rt6_info *rt;
+ struct fib6_info *f6i;
- rt = addrconf_get_prefix_route(&ifp->addr,
+ f6i = addrconf_get_prefix_route(&ifp->addr,
ifp->prefix_len,
ifp->idev->dev,
0, RTF_GATEWAY | RTF_DEFAULT);
- if (rt) {
+ if (f6i) {
if (del_rt)
- ip6_del_rt(rt);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
else {
- if (!(rt->rt6i_flags & RTF_EXPIRES))
- rt6_set_expires(rt, expires);
- ip6_rt_put(rt);
+ if (!(f6i->fib6_flags & RTF_EXPIRES))
+ fib6_set_expires(f6i, expires);
+ fib6_info_release(f6i);
}
}
}
@@ -1261,11 +1259,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
{
struct inet6_dev *idev = ifp->idev;
struct in6_addr addr, *tmpaddr;
- unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+ unsigned long tmp_tstamp, age;
unsigned long regen_advance;
- int tmp_plen;
+ struct ifa6_config cfg;
int ret = 0;
- u32 addr_flags;
unsigned long now = jiffies;
long max_desync_factor;
s32 cnf_temp_preferred_lft;
@@ -1327,13 +1324,12 @@ retry:
}
}
- tmp_valid_lft = min_t(__u32,
- ifp->valid_lft,
+ cfg.valid_lft = min_t(__u32, ifp->valid_lft,
idev->cnf.temp_valid_lft + age);
- tmp_prefered_lft = cnf_temp_preferred_lft + age -
- idev->desync_factor;
- tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
- tmp_plen = ifp->prefix_len;
+ cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
+ cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
+
+ cfg.plen = ifp->prefix_len;
tmp_tstamp = ifp->tstamp;
spin_unlock_bh(&ifp->lock);
@@ -1347,21 +1343,23 @@ retry:
* temporary addresses being generated.
*/
age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
- if (tmp_prefered_lft <= regen_advance + age) {
+ if (cfg.preferred_lft <= regen_advance + age) {
in6_ifa_put(ifp);
in6_dev_put(idev);
ret = -1;
goto out;
}
- addr_flags = IFA_F_TEMPORARY;
+ cfg.ifa_flags = IFA_F_TEMPORARY;
/* set in addrconf_prefix_rcv() */
if (ifp->flags & IFA_F_OPTIMISTIC)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
- ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
- ipv6_addr_scope(&addr), addr_flags,
- tmp_valid_lft, tmp_prefered_lft, block, NULL);
+ cfg.pfx = &addr;
+ cfg.scope = ipv6_addr_scope(cfg.pfx);
+ cfg.rt_priority = 0;
+
+ ift = ipv6_add_addr(idev, &cfg, block, NULL);
if (IS_ERR(ift)) {
in6_ifa_put(ifp);
in6_dev_put(idev);
@@ -2032,13 +2030,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
spin_lock_bh(&ifp->lock);
if (ifp->flags & IFA_F_STABLE_PRIVACY) {
- int scope = ifp->scope;
- u32 flags = ifp->flags;
struct in6_addr new_addr;
struct inet6_ifaddr *ifp2;
- u32 valid_lft, preferred_lft;
- int pfxlen = ifp->prefix_len;
int retries = ifp->stable_privacy_retry + 1;
+ struct ifa6_config cfg = {
+ .pfx = &new_addr,
+ .plen = ifp->prefix_len,
+ .ifa_flags = ifp->flags,
+ .valid_lft = ifp->valid_lft,
+ .preferred_lft = ifp->prefered_lft,
+ .scope = ifp->scope,
+ };
if (retries > net->ipv6.sysctl.idgen_retries) {
net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
@@ -2051,9 +2053,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
idev))
goto errdad;
- valid_lft = ifp->valid_lft;
- preferred_lft = ifp->prefered_lft;
-
spin_unlock_bh(&ifp->lock);
if (idev->cnf.max_addresses &&
@@ -2064,9 +2063,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
ifp->idev->dev->name);
- ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
- scope, flags, valid_lft,
- preferred_lft, false, NULL);
+ ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
if (IS_ERR(ifp2))
goto lock_errdad;
@@ -2254,6 +2251,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
return addrconf_ifid_ieee1394(eui, dev);
case ARPHRD_TUNNEL6:
case ARPHRD_IP6GRE:
+ case ARPHRD_RAWIP:
return addrconf_ifid_ip6tnl(eui, dev);
}
return -1;
@@ -2319,18 +2317,20 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
*/
static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
- unsigned long expires, u32 flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+ struct net_device *dev, unsigned long expires,
+ u32 flags, gfp_t gfp_flags)
{
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
- .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
.fc_dst_len = plen,
.fc_flags = RTF_UP | flags,
.fc_nlinfo.nl_net = dev_net(dev),
.fc_protocol = RTPROT_KERNEL,
+ .fc_type = RTN_UNICAST,
};
cfg.fc_dst = *pfx;
@@ -2344,17 +2344,17 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
cfg.fc_flags |= RTF_NONEXTHOP;
#endif
- ip6_route_add(&cfg, NULL);
+ ip6_route_add(&cfg, gfp_flags, NULL);
}
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
u32 flags, u32 noflags)
{
struct fib6_node *fn;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
struct fib6_table *table;
u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
@@ -2368,14 +2368,13 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->dst.dev->ifindex != dev->ifindex)
+ if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
continue;
- if ((rt->rt6i_flags & flags) != flags)
+ if ((rt->fib6_flags & flags) != flags)
continue;
- if ((rt->rt6i_flags & noflags) != 0)
+ if ((rt->fib6_flags & noflags) != 0)
continue;
- if (!dst_hold_safe(&rt->dst))
- rt = NULL;
+ fib6_info_hold(rt);
break;
}
out:
@@ -2394,12 +2393,13 @@ static void addrconf_add_mroute(struct net_device *dev)
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
+ .fc_type = RTN_UNICAST,
.fc_nlinfo.nl_net = dev_net(dev),
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
- ip6_route_add(&cfg, NULL);
+ ip6_route_add(&cfg, GFP_ATOMIC, NULL);
}
static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -2507,12 +2507,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
if (!ifp && valid_lft) {
int max_addresses = in6_dev->cnf.max_addresses;
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = pinfo->prefix_len,
+ .ifa_flags = addr_flags,
+ .valid_lft = valid_lft,
+ .preferred_lft = prefered_lft,
+ .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+ };
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if ((net->ipv6.devconf_all->optimistic_dad ||
in6_dev->cnf.optimistic_dad) &&
!net->ipv6.devconf_all->forwarding && sllao)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
/* Do not allow to create too much of autoconfigured
@@ -2520,16 +2528,11 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
*/
if (!max_addresses ||
ipv6_count_addresses(in6_dev) < max_addresses)
- ifp = ipv6_add_addr(in6_dev, addr, NULL,
- pinfo->prefix_len,
- addr_type&IPV6_ADDR_SCOPE_MASK,
- addr_flags, valid_lft,
- prefered_lft, false, NULL);
+ ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
if (IS_ERR_OR_NULL(ifp))
return -1;
- update_lft = 0;
create = 1;
spin_lock_bh(&ifp->lock);
ifp->flags |= IFA_F_MANAGETEMPADDR;
@@ -2551,7 +2554,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
else
stored_lft = 0;
- if (!update_lft && !create && stored_lft) {
+ if (!create && stored_lft) {
const u32 minimum_lft = min_t(u32,
stored_lft, MIN_VALID_LIFETIME);
valid_lft = max(valid_lft, minimum_lft);
@@ -2642,7 +2645,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
*/
if (pinfo->onlink) {
- struct rt6_info *rt;
+ struct fib6_info *rt;
unsigned long rt_expires;
/* Avoid arithmetic overflow. Really, we could
@@ -2667,13 +2670,13 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
if (rt) {
/* Autoconf prefix route */
if (valid_lft == 0) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
rt = NULL;
} else if (addrconf_finite_timeout(rt_expires)) {
/* not infinity */
- rt6_set_expires(rt, jiffies + rt_expires);
+ fib6_set_expires(rt, jiffies + rt_expires);
} else {
- rt6_clean_expires(rt);
+ fib6_clean_expires(rt);
}
} else if (valid_lft) {
clock_t expires = 0;
@@ -2684,9 +2687,10 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
expires = jiffies_to_clock_t(rt_expires);
}
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, expires, flags);
+ 0, dev, expires, flags,
+ GFP_ATOMIC);
}
- ip6_rt_put(rt);
+ fib6_info_release(rt);
}
/* Try to figure out our local address for this prefix */
@@ -2831,10 +2835,7 @@ static int ipv6_mc_config(struct sock *sk, bool join,
* Manual configuration of address on an interface
*/
static int inet6_addr_add(struct net *net, int ifindex,
- const struct in6_addr *pfx,
- const struct in6_addr *peer_pfx,
- unsigned int plen, __u32 ifa_flags,
- __u32 prefered_lft, __u32 valid_lft,
+ struct ifa6_config *cfg,
struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
@@ -2842,19 +2843,18 @@ static int inet6_addr_add(struct net *net, int ifindex,
struct net_device *dev;
unsigned long timeout;
clock_t expires;
- int scope;
u32 flags;
ASSERT_RTNL();
- if (plen > 128)
+ if (cfg->plen > 128)
return -EINVAL;
/* check the lifetime */
- if (!valid_lft || prefered_lft > valid_lft)
+ if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
return -EINVAL;
- if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64)
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
return -EINVAL;
dev = __dev_get_by_index(net, ifindex);
@@ -2865,58 +2865,62 @@ static int inet6_addr_add(struct net *net, int ifindex,
if (IS_ERR(idev))
return PTR_ERR(idev);
- if (ifa_flags & IFA_F_MCAUTOJOIN) {
+ if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- true, pfx, ifindex);
+ true, cfg->pfx, ifindex);
if (ret < 0)
return ret;
}
- scope = ipv6_addr_scope(pfx);
+ cfg->scope = ipv6_addr_scope(cfg->pfx);
- timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
expires = jiffies_to_clock_t(timeout * HZ);
- valid_lft = timeout;
+ cfg->valid_lft = timeout;
flags = RTF_EXPIRES;
} else {
expires = 0;
flags = 0;
- ifa_flags |= IFA_F_PERMANENT;
+ cfg->ifa_flags |= IFA_F_PERMANENT;
}
- timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- prefered_lft = timeout;
+ cfg->ifa_flags |= IFA_F_DEPRECATED;
+ cfg->preferred_lft = timeout;
}
- ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
- valid_lft, prefered_lft, true, extack);
-
+ ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
- if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
- expires, flags);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, dev, expires,
+ flags, GFP_KERNEL);
}
+ /* Send a netlink notification if DAD is enabled and
+ * optimistic flag is not set
+ */
+ if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
+ ipv6_ifa_notify(0, ifp);
/*
* Note that section 3.1 of RFC 4429 indicates
* that the Optimistic flag should not be set for
* manually configured addresses
*/
addrconf_dad_start(ifp);
- if (ifa_flags & IFA_F_MANAGETEMPADDR)
- manage_tempaddrs(idev, ifp, valid_lft, prefered_lft,
- true, jiffies);
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
+ manage_tempaddrs(idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
addrconf_verify_rtnl();
return 0;
- } else if (ifa_flags & IFA_F_MCAUTOJOIN) {
- ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- false, pfx, ifindex);
+ } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
+ cfg->pfx, ifindex);
}
return PTR_ERR(ifp);
@@ -2967,6 +2971,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
int addrconf_add_ifaddr(struct net *net, void __user *arg)
{
+ struct ifa6_config cfg = {
+ .ifa_flags = IFA_F_PERMANENT,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .valid_lft = INFINITY_LIFE_TIME,
+ };
struct in6_ifreq ireq;
int err;
@@ -2976,10 +2985,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
+ cfg.pfx = &ireq.ifr6_addr;
+ cfg.plen = ireq.ifr6_prefixlen;
+
rtnl_lock();
- err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
- ireq.ifr6_prefixlen, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
+ err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
rtnl_unlock();
return err;
}
@@ -3006,11 +3016,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
int plen, int scope)
{
struct inet6_ifaddr *ifp;
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = plen,
+ .ifa_flags = IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = scope
+ };
- ifp = ipv6_add_addr(idev, addr, NULL, plen,
- scope, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME,
- true, NULL);
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
spin_lock_bh(&ifp->lock);
ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3047,7 +3062,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
if (addr.s6_addr32[3]) {
add_addr(idev, &addr, plen, scope);
- addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
+ GFP_ATOMIC);
return;
}
@@ -3071,8 +3087,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
}
add_addr(idev, &addr, plen, flag);
- addrconf_prefix_route(&addr, plen, idev->dev, 0,
- pflags);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev,
+ 0, pflags, GFP_ATOMIC);
}
}
}
@@ -3099,20 +3115,27 @@ static void init_loopback(struct net_device *dev)
void addrconf_add_linklocal(struct inet6_dev *idev,
const struct in6_addr *addr, u32 flags)
{
+ struct ifa6_config cfg = {
+ .pfx = addr,
+ .plen = 64,
+ .ifa_flags = flags | IFA_F_PERMANENT,
+ .valid_lft = INFINITY_LIFE_TIME,
+ .preferred_lft = INFINITY_LIFE_TIME,
+ .scope = IFA_LINK
+ };
struct inet6_ifaddr *ifp;
- u32 addr_flags = flags | IFA_F_PERMANENT;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
idev->cnf.optimistic_dad) &&
!dev_net(idev->dev)->ipv6.devconf_all->forwarding)
- addr_flags |= IFA_F_OPTIMISTIC;
+ cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
- ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
+ ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
+ 0, 0, GFP_ATOMIC);
addrconf_dad_start(ifp);
in6_ifa_put(ifp);
}
@@ -3227,7 +3250,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
addrconf_add_linklocal(idev, &addr,
IFA_F_STABLE_PRIVACY);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_EUI64:
/* addrconf_add_linklocal also adds a prefix_route and we
@@ -3237,7 +3261,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
addrconf_add_linklocal(idev, &addr, 0);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
+ 0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_NONE:
default:
@@ -3262,7 +3287,8 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_IP6GRE) &&
(dev->type != ARPHRD_IPGRE) &&
(dev->type != ARPHRD_TUNNEL) &&
- (dev->type != ARPHRD_NONE)) {
+ (dev->type != ARPHRD_NONE) &&
+ (dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
}
@@ -3329,32 +3355,35 @@ static void addrconf_gre_config(struct net_device *dev)
}
#endif
-static int fixup_permanent_addr(struct inet6_dev *idev,
+static int fixup_permanent_addr(struct net *net,
+ struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
{
- /* !rt6i_node means the host route was removed from the
+ /* !fib6_node means the host route was removed from the
* FIB, for example, if 'lo' device is taken down. In that
* case regenerate the host route.
*/
- if (!ifp->rt || !ifp->rt->rt6i_node) {
- struct rt6_info *rt, *prev;
+ if (!ifp->rt || !ifp->rt->fib6_node) {
+ struct fib6_info *f6i, *prev;
- rt = addrconf_dst_alloc(idev, &ifp->addr, false);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
+ f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+ GFP_ATOMIC);
+ if (IS_ERR(f6i))
+ return PTR_ERR(f6i);
/* ifp->rt can be accessed outside of rtnl */
spin_lock(&ifp->lock);
prev = ifp->rt;
- ifp->rt = rt;
+ ifp->rt = f6i;
spin_unlock(&ifp->lock);
- ip6_rt_put(prev);
+ fib6_info_release(prev);
}
if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
- idev->dev, 0, 0);
+ ifp->rt_priority, idev->dev, 0, 0,
+ GFP_ATOMIC);
}
if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -3363,7 +3392,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
return 0;
}
-static void addrconf_permanent_addr(struct net_device *dev)
+static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
{
struct inet6_ifaddr *ifp, *tmp;
struct inet6_dev *idev;
@@ -3376,7 +3405,7 @@ static void addrconf_permanent_addr(struct net_device *dev)
list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
if ((ifp->flags & IFA_F_PERMANENT) &&
- fixup_permanent_addr(idev, ifp) < 0) {
+ fixup_permanent_addr(net, idev, ifp) < 0) {
write_unlock_bh(&idev->lock);
in6_ifa_hold(ifp);
ipv6_del_addr(ifp);
@@ -3445,7 +3474,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (event == NETDEV_UP) {
/* restore routes for permanent addresses */
- addrconf_permanent_addr(dev);
+ addrconf_permanent_addr(net, dev);
if (!addrconf_link_ready(dev)) {
/* device is not ready yet. */
@@ -3612,8 +3641,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa, *tmp;
- int _keep_addr;
- bool keep_addr;
+ bool keep_addr = false;
int state, i;
ASSERT_RTNL();
@@ -3639,15 +3667,18 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
- /* aggregate the system setting and interface setting */
- _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
- if (!_keep_addr)
- _keep_addr = idev->cnf.keep_addr_on_down;
-
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
+ if (!how && !idev->cnf.disable_ipv6) {
+ /* aggregate the system setting and interface setting */
+ int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+
+ if (!_keep_addr)
+ _keep_addr = idev->cnf.keep_addr_on_down;
+
+ keep_addr = (_keep_addr > 0);
+ }
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3697,13 +3728,8 @@ restart:
write_lock_bh(&idev->lock);
}
- /* re-combine the user config with event to determine if permanent
- * addresses are to be removed from the interface list
- */
- keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
-
list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
bool keep;
addrconf_del_dad_work(ifa);
@@ -3731,7 +3757,7 @@ restart:
spin_unlock_bh(&ifa->lock);
if (rt)
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3849,6 +3875,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
bool bump_id, notify = false;
+ struct net *net;
addrconf_join_solict(dev, &ifp->addr);
@@ -3859,8 +3886,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
if (ifp->state == INET6_IFADDR_STATE_DEAD)
goto out;
+ net = dev_net(dev);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
- (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 &&
+ (net->ipv6.devconf_all->accept_dad < 1 &&
idev->cnf.accept_dad < 1) ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
@@ -3896,8 +3924,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
* Frames right away
*/
if (ifp->flags & IFA_F_OPTIMISTIC) {
- ip6_ins_rt(ifp->rt);
- if (ipv6_use_optimistic_addr(dev_net(dev), idev)) {
+ ip6_ins_rt(net, ifp->rt);
+ if (ipv6_use_optimistic_addr(net, idev)) {
/* Because optimistic nodes can use this address,
* notify listeners. If DAD fails, RTM_DELADDR is sent.
*/
@@ -4463,6 +4491,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .len = sizeof(u32) },
+ [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
};
static int
@@ -4495,8 +4524,38 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm->ifa_prefixlen);
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
- u32 prefered_lft, u32 valid_lft)
+static int modify_prefix_route(struct inet6_ifaddr *ifp,
+ unsigned long expires, u32 flags)
+{
+ struct fib6_info *f6i;
+
+ f6i = addrconf_get_prefix_route(&ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev,
+ 0, RTF_GATEWAY | RTF_DEFAULT);
+ if (!f6i)
+ return -ENOENT;
+
+ if (f6i->fib6_metric != ifp->rt_priority) {
+ /* add new one */
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ /* delete old one */
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ } else {
+ if (!expires)
+ fib6_clean_expires(f6i);
+ else
+ fib6_set_expires(f6i, expires);
+
+ fib6_info_release(f6i);
+ }
+
+ return 0;
+}
+
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
{
u32 flags;
clock_t expires;
@@ -4506,32 +4565,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ASSERT_RTNL();
- if (!valid_lft || (prefered_lft > valid_lft))
+ if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
return -EINVAL;
- if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
(ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
return -EINVAL;
if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
- ifa_flags &= ~IFA_F_OPTIMISTIC;
+ cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
expires = jiffies_to_clock_t(timeout * HZ);
- valid_lft = timeout;
+ cfg->valid_lft = timeout;
flags = RTF_EXPIRES;
} else {
expires = 0;
flags = 0;
- ifa_flags |= IFA_F_PERMANENT;
+ cfg->ifa_flags |= IFA_F_PERMANENT;
}
- timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- prefered_lft = timeout;
+ cfg->ifa_flags |= IFA_F_DEPRECATED;
+ cfg->preferred_lft = timeout;
}
spin_lock_bh(&ifp->lock);
@@ -4541,18 +4600,30 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
IFA_F_NOPREFIXROUTE);
- ifp->flags |= ifa_flags;
+ ifp->flags |= cfg->ifa_flags;
ifp->tstamp = jiffies;
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
+ ifp->valid_lft = cfg->valid_lft;
+ ifp->prefered_lft = cfg->preferred_lft;
+
+ if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+ ifp->rt_priority = cfg->rt_priority;
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
- if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
- expires, flags);
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ int rc = -ENOENT;
+
+ if (had_prefixroute)
+ rc = modify_prefix_route(ifp, expires, flags);
+
+ /* prefix route could have been deleted; if so restore it */
+ if (rc == -ENOENT) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
} else if (had_prefixroute) {
enum cleanup_prefix_rt_t action;
unsigned long rt_expires;
@@ -4568,10 +4639,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
}
if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
- if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
- valid_lft = prefered_lft = 0;
- manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
- !was_managetempaddr, jiffies);
+ if (was_managetempaddr &&
+ !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
+ cfg->valid_lft = 0;
+ cfg->preferred_lft = 0;
+ }
+ manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, !was_managetempaddr,
+ jiffies);
}
addrconf_verify_rtnl();
@@ -4586,12 +4661,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
- struct in6_addr *pfx, *peer_pfx;
+ struct in6_addr *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
struct inet6_dev *idev;
- u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
- u32 ifa_flags;
+ struct ifa6_config cfg;
int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -4599,60 +4673,70 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ memset(&cfg, 0, sizeof(cfg));
+
ifm = nlmsg_data(nlh);
- pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
- if (!pfx)
+ cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!cfg.pfx)
return -EINVAL;
+ cfg.peer_pfx = peer_pfx;
+ cfg.plen = ifm->ifa_prefixlen;
+ if (tb[IFA_RT_PRIORITY])
+ cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
+ cfg.valid_lft = INFINITY_LIFE_TIME;
+ cfg.preferred_lft = INFINITY_LIFE_TIME;
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
ci = nla_data(tb[IFA_CACHEINFO]);
- valid_lft = ci->ifa_valid;
- preferred_lft = ci->ifa_prefered;
- } else {
- preferred_lft = INFINITY_LIFE_TIME;
- valid_lft = INFINITY_LIFE_TIME;
+ cfg.valid_lft = ci->ifa_valid;
+ cfg.preferred_lft = ci->ifa_prefered;
}
dev = __dev_get_by_index(net, ifm->ifa_index);
if (!dev)
return -ENODEV;
- ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+ if (tb[IFA_FLAGS])
+ cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
+ else
+ cfg.ifa_flags = ifm->ifa_flags;
/* We ignore other flags so far. */
- ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
- IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+ IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+ IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
idev = ipv6_find_idev(dev);
if (IS_ERR(idev))
return PTR_ERR(idev);
if (!ipv6_allow_optimistic_dad(net, idev))
- ifa_flags &= ~IFA_F_OPTIMISTIC;
+ cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
- if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) {
+ if (cfg.ifa_flags & IFA_F_NODAD &&
+ cfg.ifa_flags & IFA_F_OPTIMISTIC) {
NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
return -EINVAL;
}
- ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+ ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
if (!ifa) {
/*
* It would be best to check for !NLM_F_CREATE here but
* userspace already relies on not having to provide this.
*/
- return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
- ifm->ifa_prefixlen, ifa_flags,
- preferred_lft, valid_lft, extack);
+ return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
err = -EEXIST;
else
- err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+ err = inet6_addr_modify(ifa, &cfg);
in6_ifa_put(ifa);
@@ -4703,7 +4787,8 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(16) /* IFA_LOCAL */
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo))
- + nla_total_size(4) /* IFA_FLAGS */;
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */;
}
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4749,6 +4834,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
goto error;
+ if (ifa->rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+ goto error;
+
if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
goto error;
@@ -4792,9 +4881,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
u32 portid, u32 seq, int event, unsigned int flags)
{
+ struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+ int ifindex = dev ? dev->ifindex : 1;
struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
- int ifindex = ifaca->aca_idev->dev->ifindex;
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
@@ -5017,14 +5107,6 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
struct net *net = dev_net(ifa->idev->dev);
int err = -ENOBUFS;
- /* Don't send DELADDR notification for TENTATIVE address,
- * since NEWADDR notification is sent only after removing
- * TENTATIVE flag, if DAD has not failed.
- */
- if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) &&
- event == RTM_DELADDR)
- return;
-
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -5595,29 +5677,30 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
* our DAD process, so we don't need
* to do it again
*/
- if (!rcu_access_pointer(ifp->rt->rt6i_node))
- ip6_ins_rt(ifp->rt);
+ if (!rcu_access_pointer(ifp->rt->fib6_node))
+ ip6_ins_rt(net, ifp->rt);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
- addrconf_prefix_route(&ifp->peer_addr, 128,
- ifp->idev->dev, 0, 0);
+ addrconf_prefix_route(&ifp->peer_addr, 128, 0,
+ ifp->idev->dev, 0, 0,
+ GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
addrconf_leave_anycast(ifp);
addrconf_leave_solict(ifp->idev, &ifp->addr);
if (!ipv6_addr_any(&ifp->peer_addr)) {
- struct rt6_info *rt;
+ struct fib6_info *rt;
rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
ifp->idev->dev, 0, 0);
if (rt)
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
}
if (ifp->rt) {
- if (dst_hold_safe(&ifp->rt->dst))
- ip6_del_rt(ifp->rt);
+ ip6_del_rt(net, ifp->rt);
+ ifp->rt = NULL;
}
rt_genid_bump_ipv6(net);
break;
@@ -5964,11 +6047,11 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct rt6_info *rt = ifa->rt;
+ struct fib6_info *rt = ifa->rt;
int cpu;
rcu_read_lock();
- addrconf_set_nopolicy(ifa->rt, val);
+ ifa->rt->dst_nopolicy = val ? true : false;
if (rt->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..5cd0029d930e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,47 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb, int strict)
+{
+ return f6i;
+}
+
+static u32
+eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+ struct in6_addr *saddr)
+{
+ return 0;
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .fib6_get_table = eafnosupport_fib6_get_table,
+ .fib6_table_lookup = eafnosupport_fib6_table_lookup,
+ .fib6_lookup = eafnosupport_fib6_lookup,
+ .fib6_multipath_select = eafnosupport_fib6_multipath_select,
+ .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d443c18b45fe..74f2a261e8df 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -273,33 +273,8 @@ out_rcu_unlock:
goto out;
}
-
-/* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- /* If the socket has its own bind function then use it. */
- if (sk->sk_prot->bind)
- return sk->sk_prot->bind(sk, uaddr, addr_len);
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- /* BPF prog is run before any checks are done so that if the prog
- * changes context in a wrong way it will be caught.
- */
- err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
- if (err)
- return err;
-
- return __inet6_bind(sk, uaddr, addr_len, false, true);
-}
-EXPORT_SYMBOL(inet6_bind);
-
-int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- bool force_bind_address_no_port, bool with_lock)
+static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+ bool force_bind_address_no_port, bool with_lock)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -444,6 +419,30 @@ out_unlock:
goto out;
}
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* If the socket has its own bind function then use it. */
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ /* BPF prog is run before any checks are done so that if the prog
+ * changes context in a wrong way it will be caught.
+ */
+ err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+ if (err)
+ return err;
+
+ return __inet6_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet6_bind);
+
int inet6_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -579,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
- .mmap = sock_no_mmap,
+#ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+#endif
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
@@ -590,6 +591,7 @@ const struct proto_ops inet6_stream_ops = {
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
+ .set_rcvlowat = tcp_set_rcvlowat,
};
const struct proto_ops inet6_dgram_ops = {
@@ -887,7 +889,12 @@ static struct pernet_operations inet6_net_ops = {
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
- .ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_dst_lookup = ip6_dst_lookup,
+ .fib6_get_table = fib6_get_table,
+ .fib6_table_lookup = fib6_table_lookup,
+ .fib6_lookup = fib6_lookup,
+ .fib6_multipath_select = fib6_multipath_select,
+ .ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
.nd_tbl = &nd_tbl,
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index ebeaf47d5c8d..4e0ff7031edd 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -212,16 +212,14 @@ static void aca_get(struct ifacaddr6 *aca)
static void aca_put(struct ifacaddr6 *ac)
{
if (refcount_dec_and_test(&ac->aca_refcnt)) {
- in6_dev_put(ac->aca_idev);
- dst_release(&ac->aca_rt->dst);
+ fib6_info_release(ac->aca_rt);
kfree(ac);
}
}
-static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
+static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
const struct in6_addr *addr)
{
- struct inet6_dev *idev = rt->rt6i_idev;
struct ifacaddr6 *aca;
aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -229,9 +227,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
return NULL;
aca->aca_addr = *addr;
- in6_dev_hold(idev);
- aca->aca_idev = idev;
- aca->aca_rt = rt;
+ fib6_info_hold(f6i);
+ aca->aca_rt = f6i;
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -246,7 +243,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
- struct rt6_info *rt;
+ struct fib6_info *f6i;
+ struct net *net;
int err;
ASSERT_RTNL();
@@ -265,14 +263,15 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
}
}
- rt = addrconf_dst_alloc(idev, addr, true);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
+ net = dev_net(idev->dev);
+ f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+ if (IS_ERR(f6i)) {
+ err = PTR_ERR(f6i);
goto out;
}
- aca = aca_alloc(rt, addr);
+ aca = aca_alloc(f6i, addr);
if (!aca) {
- ip6_rt_put(rt);
+ fib6_info_release(f6i);
err = -ENOMEM;
goto out;
}
@@ -286,7 +285,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
aca_get(aca);
write_unlock_bh(&idev->lock);
- ip6_ins_rt(rt);
+ ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
@@ -328,8 +327,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
write_unlock_bh(&idev->lock);
addrconf_leave_solict(idev, &aca->aca_addr);
- dst_hold(&aca->aca_rt->dst);
- ip6_del_rt(aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
return 0;
@@ -356,8 +354,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
addrconf_leave_solict(idev, &aca->aca_addr);
- dst_hold(&aca->aca_rt->dst);
- ip6_del_rt(aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index bc68eb661970..5bc2bf3733ab 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = {
static int ipv6_destopt_rcv(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
__u16 dstbuf;
@@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ __IP6_INC_STATS(dev_net(dst->dev), idev,
IPSTATS_MIB_INHDRERRORS);
fail_and_free:
kfree_skb(skb);
@@ -319,8 +320,7 @@ fail_and_free:
return 1;
}
- __IP6_INC_STATS(dev_net(dst->dev),
- ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return -1;
}
@@ -416,8 +416,7 @@ looped_back:
}
if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((&hdr->segments_left) -
skb_network_header(skb)));
@@ -456,8 +455,7 @@ looped_back:
if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
kfree_skb(skb);
@@ -481,10 +479,10 @@ looped_back:
/* called with rcu_read_lock() */
static int ipv6_rthdr_rcv(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
struct in6_addr *addr = NULL;
struct in6_addr daddr;
- struct inet6_dev *idev;
int n, i;
struct ipv6_rt_hdr *hdr;
struct rt0_hdr *rthdr;
@@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
skb->pkt_type != PACKET_HOST) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -527,7 +523,7 @@ looped_back:
* processed by own
*/
if (!addr) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ __IP6_INC_STATS(net, idev,
IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
@@ -553,8 +549,7 @@ looped_back:
goto unknown_rh;
/* Silently discard invalid RTH type 2 */
if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -572,8 +567,7 @@ looped_back:
n = hdr->hdrlen >> 1;
if (hdr->segments_left > n) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
((&hdr->segments_left) -
skb_network_header(skb)));
@@ -609,14 +603,12 @@ looped_back:
if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
(xfrm_address_t *)&ipv6_hdr(skb)->saddr,
IPPROTO_ROUTING) < 0) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -627,8 +619,7 @@ looped_back:
}
if (ipv6_addr_is_multicast(addr)) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
}
@@ -647,8 +638,7 @@ looped_back:
if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
0);
kfree_skb(skb);
@@ -663,7 +653,7 @@ looped_back:
return -1;
unknown_rh:
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
(&hdr->type) - skb_network_header(skb));
return -1;
@@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
const unsigned char *nh = skb_network_header(skb);
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct net *net = ipv6_skb_net(skb);
u32 pkt_len;
if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
nh[optoff+1]);
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
goto drop;
}
pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
return false;
}
if (ipv6_hdr(skb)->payload_len) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- __IP6_INC_STATS(net, ipv6_skb_idev(skb),
- IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index b643f5ce6c80..ae365df8abf7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
* if target < 0. "last header" is transport protocol header, ESP, or
* "No next header".
*
- * Note that *offset is used as input/output parameter. an if it is not zero,
+ * Note that *offset is used as input/output parameter, and if it is not zero,
* then it must be a valid offset to an inner IPv6 header. This can be used
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
return fib_rules_seq_read(net, AF_INET6);
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ struct fib6_info *f6i;
+ int err;
+
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = fib6_table_lookup,
+ .lookup_data = &oif,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i = arg.result ? : net->ipv6.fib6_null_entry;
+ } else {
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+ oif, fl6, flags);
+ if (!f6i || f6i == net->ipv6.fib6_null_entry)
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, flags);
+ }
+
+ return f6i;
+}
+
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup)
@@ -96,8 +129,73 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &net->ipv6.ip6_null_entry->dst;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+ struct flowi6 *flp6, const struct net_device *dev)
+{
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /* If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+ rt6_flags2srcprefs(flags), &saddr))
+ return -EAGAIN;
+
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+ return -EAGAIN;
+
+ flp6->saddr = saddr;
+ }
+
+ return 0;
+}
+
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct net *net = rule->fr_net;
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+ int err = -EAGAIN, *oif;
+ u32 tb_id;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table)
+ return -EAGAIN;
+
+ oif = (int *)arg->lookup_data;
+ f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+ if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ fib6_info_nh_dev(f6i));
+
+ if (likely(!err))
+ arg->result = f6i;
+ }
+
+ return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
{
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
@@ -134,27 +232,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
rt = lookup(net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
- struct fib6_rule *r = (struct fib6_rule *)rule;
-
- /*
- * If we need to find a source address for this traffic,
- * we check the result if it meets requirement of the rule.
- */
- if ((rule->flags & FIB_RULE_FIND_SADDR) &&
- r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
- struct in6_addr saddr;
-
- if (ipv6_dev_get_saddr(net,
- ip6_dst_idev(&rt->dst)->dev,
- &flp6->daddr,
- rt6_flags2srcprefs(flags),
- &saddr))
- goto again;
- if (!ipv6_prefix_equal(&saddr, &r->src.addr,
- r->src.plen))
- goto again;
- flp6->saddr = saddr;
- }
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ ip6_dst_idev(&rt->dst)->dev);
+
+ if (err == -EAGAIN)
+ goto again;
+
err = rt->dst.error;
if (err != -EAGAIN)
goto out;
@@ -172,6 +255,15 @@ out:
return err;
}
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ if (arg->lookup_ptr == fib6_table_lookup)
+ return fib6_rule_action_alt(rule, flp, flags, arg);
+
+ return __fib6_rule_action(rule, flp, flags, arg);
+}
+
static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
struct rt6_info *rt = (struct rt6_info *) arg->result;
@@ -245,15 +337,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
- struct nlattr **tb)
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
- if (rule->table == RT6_TABLE_UNSPEC)
+ if (rule->table == RT6_TABLE_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Invalid table");
goto errout;
+ }
if (fib6_new_table(net, rule->table) == NULL) {
err = -ENOBUFS;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 01372dd74e38..7aa4c41a3bd9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly;
struct fib6_cleaner {
struct fib6_walker w;
struct net *net;
- int (*func)(struct rt6_info *, void *arg);
+ int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
};
@@ -54,7 +54,7 @@ struct fib6_cleaner {
#define FWS_INIT FWS_L
#endif
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
struct fib6_table *table,
struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
@@ -105,13 +105,12 @@ enum {
FIB6_NO_SERNUM_CHANGE = 0,
};
-void fib6_update_sernum(struct rt6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
- struct net *net = dev_net(rt->dst.dev);
struct fib6_node *fn;
- fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ fn = rcu_dereference_protected(f6i->fib6_node,
+ lockdep_is_held(&f6i->fib6_table->tb6_lock));
if (fn)
fn->fn_sernum = fib6_new_sernum(net);
}
@@ -146,6 +145,69 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+{
+ struct fib6_info *f6i;
+
+ f6i = kzalloc(sizeof(*f6i), gfp_flags);
+ if (!f6i)
+ return NULL;
+
+ f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!f6i->rt6i_pcpu) {
+ kfree(f6i);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&f6i->fib6_siblings);
+ f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+
+ atomic_inc(&f6i->fib6_ref);
+
+ return f6i;
+}
+
+void fib6_info_destroy(struct fib6_info *f6i)
+{
+ struct rt6_exception_bucket *bucket;
+ struct dst_metrics *m;
+
+ WARN_ON(f6i->fib6_node);
+
+ bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
+ if (bucket) {
+ f6i->rt6i_exception_bucket = NULL;
+ kfree(bucket);
+ }
+
+ if (f6i->rt6i_pcpu) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ *ppcpu_rt = NULL;
+ }
+ }
+ }
+
+ if (f6i->fib6_nh.nh_dev)
+ dev_put(f6i->fib6_nh.nh_dev);
+
+ m = f6i->fib6_metrics;
+ if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+ kfree(m);
+
+ kfree(f6i);
+}
+EXPORT_SYMBOL_GPL(fib6_info_destroy);
+
static struct fib6_node *node_alloc(struct net *net)
{
struct fib6_node *fn;
@@ -176,28 +238,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
net->ipv6.rt6_stats->fib_nodes--;
}
-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
-{
- int cpu;
-
- if (!non_pcpu_rt->rt6i_pcpu)
- return;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
-}
-EXPORT_SYMBOL_GPL(rt6_free_pcpu);
-
static void fib6_free_table(struct fib6_table *table)
{
inetpeer_invalidate_tree(&table->tb6_peers);
@@ -232,7 +272,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
if (table) {
table->tb6_id = id;
rcu_assign_pointer(table->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&table->tb6_peers);
}
@@ -314,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &rt->dst;
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
static void __net_init fib6_tables_init(struct net *net)
{
fib6_link_table(net, net->ipv6.fib6_main_tbl);
@@ -340,7 +387,7 @@ unsigned int fib6_tables_seq_read(struct net *net)
static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type,
- struct rt6_info *rt)
+ struct fib6_info *rt)
{
struct fib6_entry_notifier_info info = {
.rt = rt,
@@ -351,7 +398,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
static int call_fib6_entry_notifiers(struct net *net,
enum fib_event_type event_type,
- struct rt6_info *rt,
+ struct fib6_info *rt,
struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
@@ -359,7 +406,7 @@ static int call_fib6_entry_notifiers(struct net *net,
.rt = rt,
};
- rt->rt6i_table->fib_seq++;
+ rt->fib6_table->fib_seq++;
return call_fib6_notifiers(net, event_type, &info.info);
}
@@ -368,16 +415,16 @@ struct fib6_dump_arg {
struct notifier_block *nb;
};
-static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
- if (rt == arg->net->ipv6.ip6_null_entry)
+ if (rt == arg->net->ipv6.fib6_null_entry)
return;
call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
}
static int fib6_node_dump(struct fib6_walker *w)
{
- struct rt6_info *rt;
+ struct fib6_info *rt;
for_each_fib6_walker_rt(w)
fib6_rt_dump(rt, w->args);
@@ -426,7 +473,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
static int fib6_dump_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
+ struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
res = rt6_dump_route(rt, w->args);
@@ -441,10 +488,10 @@ static int fib6_dump_node(struct fib6_walker *w)
* last sibling of this route (no need to dump the
* sibling routes again)
*/
- if (rt->rt6i_nsiblings)
- rt = list_last_entry(&rt->rt6i_siblings,
- struct rt6_info,
- rt6i_siblings);
+ if (rt->fib6_nsiblings)
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
w->leaf = NULL;
return 0;
@@ -579,6 +626,24 @@ out:
return res;
}
+void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
+{
+ if (!f6i)
+ return;
+
+ if (f6i->fib6_metrics == &dst_default_metrics) {
+ struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
+
+ if (!p)
+ return;
+
+ refcount_set(&p->refcnt, 1);
+ f6i->fib6_metrics = p;
+ }
+
+ f6i->fib6_metrics->metrics[metric - 1] = val;
+}
+
/*
* Routing Table
*
@@ -608,7 +673,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
fn = root;
do {
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
key = (struct rt6key *)((u8 *)leaf + offset);
@@ -637,11 +702,11 @@ static struct fib6_node *fib6_add_1(struct net *net,
/* clean up an intermediate node */
if (!(fn->fn_flags & RTN_RTINFO)) {
RCU_INIT_POINTER(fn->leaf, NULL);
- rt6_release(leaf);
+ fib6_info_release(leaf);
/* remove null_entry in the root node */
} else if (fn->fn_flags & RTN_TL_ROOT &&
rcu_access_pointer(fn->leaf) ==
- net->ipv6.ip6_null_entry) {
+ net->ipv6.fib6_null_entry) {
RCU_INIT_POINTER(fn->leaf, NULL);
}
@@ -750,7 +815,7 @@ insert_above:
RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
atomic_inc(&rcu_dereference_protected(in->leaf,
- lockdep_is_held(&table->tb6_lock))->rt6i_ref);
+ lockdep_is_held(&table->tb6_lock))->fib6_ref);
/* update parent pointer */
if (dir)
@@ -802,44 +867,37 @@ insert_above:
return ln;
}
-static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
-{
- int i;
-
- for (i = 0; i < RTAX_MAX; i++) {
- if (test_bit(i, mxc->mx_valid))
- mp[i] = mxc->mx[i];
- }
-}
-
-static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+ const struct fib6_table *table)
{
- if (!mxc->mx)
- return 0;
-
- if (dst->flags & DST_HOST) {
- u32 *mp = dst_metrics_write_ptr(dst);
+ int cpu;
- if (unlikely(!mp))
- return -ENOMEM;
+ /* release the reference to this fib entry from
+ * all of its cached pcpu routes
+ */
+ for_each_possible_cpu(cpu) {
+ struct rt6_info **ppcpu_rt;
+ struct rt6_info *pcpu_rt;
- fib6_copy_metrics(mp, mxc);
- } else {
- dst_init_metrics(dst, mxc->mx, false);
+ ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+ pcpu_rt = *ppcpu_rt;
+ if (pcpu_rt) {
+ struct fib6_info *from;
- /* We've stolen mx now. */
- mxc->mx = NULL;
+ from = rcu_dereference_protected(pcpu_rt->from,
+ lockdep_is_held(&table->tb6_lock));
+ rcu_assign_pointer(pcpu_rt->from, NULL);
+ fib6_info_release(from);
+ }
}
-
- return 0;
}
-static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
+static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct net *net)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
- if (atomic_read(&rt->rt6i_ref) != 1) {
+ if (atomic_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
@@ -847,18 +905,22 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
* to still alive ones.
*/
while (fn) {
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *new_leaf;
+ struct fib6_info *new_leaf;
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
- atomic_inc(&new_leaf->rt6i_ref);
+ atomic_inc(&new_leaf->fib6_ref);
+
rcu_assign_pointer(fn->leaf, new_leaf);
- rt6_release(rt);
+ fib6_info_release(rt);
}
fn = rcu_dereference_protected(fn->parent,
lockdep_is_held(&table->tb6_lock));
}
+
+ if (rt->rt6i_pcpu)
+ fib6_drop_pcpu_from(rt, table);
}
}
@@ -866,37 +928,37 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
* Insert routing information in a node.
*/
-static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
- struct nl_info *info, struct mx6_config *mxc,
+static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info,
struct netlink_ext_ack *extack)
{
- struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- struct rt6_info *iter = NULL;
- struct rt6_info __rcu **ins;
- struct rt6_info __rcu **fallback_ins = NULL;
+ struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_info *iter = NULL, *match = NULL;
+ struct fib6_info __rcu **ins;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
+ int append = (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
- bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL;
int err;
- if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+ if (append)
nlflags |= NLM_F_APPEND;
ins = &fn->leaf;
for (iter = leaf; iter;
- iter = rcu_dereference_protected(iter->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock))) {
/*
* Search for duplicates
*/
- if (iter->rt6i_metric == rt->rt6i_metric) {
+ if (iter->fib6_metric == rt->fib6_metric) {
/*
* Same priority level
*/
@@ -906,56 +968,32 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
nlflags &= ~NLM_F_EXCL;
if (replace) {
- if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
- found++;
- break;
- }
- if (rt_can_ecmp)
- fallback_ins = fallback_ins ?: ins;
- goto next_iter;
+ found++;
+ break;
}
if (rt6_duplicate_nexthop(iter, rt)) {
- if (rt->rt6i_nsiblings)
- rt->rt6i_nsiblings = 0;
- if (!(iter->rt6i_flags & RTF_EXPIRES))
+ if (rt->fib6_nsiblings)
+ rt->fib6_nsiblings = 0;
+ if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
- if (!(rt->rt6i_flags & RTF_EXPIRES))
- rt6_clean_expires(iter);
+ if (!(rt->fib6_flags & RTF_EXPIRES))
+ fib6_clean_expires(iter);
else
- rt6_set_expires(iter, rt->dst.expires);
- iter->rt6i_pmtu = rt->rt6i_pmtu;
+ fib6_set_expires(iter, rt->expires);
+ fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST;
}
- /* If we have the same destination and the same metric,
- * but not the same gateway, then the route we try to
- * add is sibling to this route, increment our counter
- * of siblings, and later we will add our route to the
- * list.
- * Only static routes (which don't have flag
- * RTF_EXPIRES) are used for ECMPv6.
- *
- * To avoid long list, we only had siblings if the
- * route have a gateway.
- */
- if (rt_can_ecmp &&
- rt6_qualify_for_ecmp(iter))
- rt->rt6i_nsiblings++;
+
+ /* first route that matches */
+ if (!match)
+ match = iter;
}
- if (iter->rt6i_metric > rt->rt6i_metric)
+ if (iter->fib6_metric > rt->fib6_metric)
break;
-next_iter:
- ins = &iter->rt6_next;
- }
-
- if (fallback_ins && !found) {
- /* No ECMP-able route found, replace first non-ECMP one */
- ins = fallback_ins;
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- found++;
+ ins = &iter->fib6_next;
}
/* Reset round-robin state, if necessary */
@@ -963,59 +1001,56 @@ next_iter:
fn->rr_ptr = NULL;
/* Link this route to others same route. */
- if (rt->rt6i_nsiblings) {
- unsigned int rt6i_nsiblings;
- struct rt6_info *sibling, *temp_sibling;
-
- /* Find the first route that have the same metric */
- sibling = leaf;
- while (sibling) {
- if (sibling->rt6i_metric == rt->rt6i_metric &&
- rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->rt6i_siblings,
- &sibling->rt6i_siblings);
- break;
- }
- sibling = rcu_dereference_protected(sibling->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ if (append && match) {
+ struct fib6_info *sibling, *temp_sibling;
+
+ if (rt->fib6_flags & RTF_REJECT) {
+ NL_SET_ERR_MSG(extack,
+ "Can not append a REJECT route");
+ return -EINVAL;
+ } else if (match->fib6_flags & RTF_REJECT) {
+ NL_SET_ERR_MSG(extack,
+ "Can not append to a REJECT route");
+ return -EINVAL;
}
+ rt->fib6_nsiblings = match->fib6_nsiblings;
+ list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
+ match->fib6_nsiblings++;
+
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
- rt6i_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
- &rt->rt6i_siblings, rt6i_siblings) {
- sibling->rt6i_nsiblings++;
- BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
- rt6i_nsiblings++;
+ &match->fib6_siblings, fib6_siblings) {
+ sibling->fib6_nsiblings++;
+ BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
}
- BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
- rt6_multipath_rebalance(temp_sibling);
+
+ rt6_multipath_rebalance(match);
}
/*
* insert node
*/
if (!replace) {
+ enum fib_event_type event;
+
if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
nlflags |= NLM_F_CREATE;
- err = fib6_commit_metrics(&rt->dst, mxc);
- if (err)
- return err;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
+ event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
+ err = call_fib6_entry_notifiers(info->nl_net, event, rt,
+ extack);
if (err)
return err;
- rcu_assign_pointer(rt->rt6_next, iter);
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
+ rcu_assign_pointer(rt->fib6_next, iter);
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1027,7 +1062,7 @@ add:
}
} else {
- int nsiblings;
+ struct fib6_info *tmp;
if (!found) {
if (add)
@@ -1036,67 +1071,72 @@ add:
return -ENOENT;
}
- err = fib6_commit_metrics(&rt->dst, mxc);
- if (err)
- return err;
-
err = call_fib6_entry_notifiers(info->nl_net,
FIB_EVENT_ENTRY_REPLACE,
rt, extack);
if (err)
return err;
- atomic_inc(&rt->rt6i_ref);
- rcu_assign_pointer(rt->rt6i_node, fn);
- rt->rt6_next = iter->rt6_next;
+ /* if route being replaced has siblings, set tmp to
+ * last one, otherwise tmp is current route. this is
+ * used to set fib6_next for new route
+ */
+ if (iter->fib6_nsiblings)
+ tmp = list_last_entry(&iter->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ else
+ tmp = iter;
+
+ /* insert new route */
+ atomic_inc(&rt->fib6_ref);
+ rcu_assign_pointer(rt->fib6_node, fn);
+ rt->fib6_next = tmp->fib6_next;
rcu_assign_pointer(*ins, rt);
+
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
- nsiblings = iter->rt6i_nsiblings;
- iter->rt6i_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == iter)
- fn->rr_ptr = NULL;
- rt6_release(iter);
- if (nsiblings) {
+ /* delete old route */
+ rt = iter;
+
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *tmp;
+
/* Replacing an ECMP route, remove all siblings */
- ins = &rt->rt6_next;
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- while (iter) {
- if (iter->rt6i_metric > rt->rt6i_metric)
- break;
- if (rt6_qualify_for_ecmp(iter)) {
- *ins = iter->rt6_next;
- iter->rt6i_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == iter)
- fn->rr_ptr = NULL;
- rt6_release(iter);
- nsiblings--;
- info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
- } else {
- ins = &iter->rt6_next;
- }
- iter = rcu_dereference_protected(*ins,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
+ fib6_siblings) {
+ iter->fib6_node = NULL;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+ fib6_info_release(iter);
+
+ rt->fib6_nsiblings--;
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
}
- WARN_ON(nsiblings != 0);
}
+
+ WARN_ON(rt->fib6_nsiblings != 0);
+
+ rt->fib6_node = NULL;
+ fib6_purge_rt(rt, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == rt)
+ fn->rr_ptr = NULL;
+ fib6_info_release(rt);
}
return 0;
}
-static void fib6_start_gc(struct net *net, struct rt6_info *rt)
+static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
- (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+ (rt->fib6_flags & RTF_EXPIRES))
mod_timer(&net->ipv6.ip6_fib_timer,
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
@@ -1108,22 +1148,22 @@ void fib6_force_start_gc(struct net *net)
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}
-static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
+static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
int sernum)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
/* paired with smp_rmb() in rt6_get_cookie_safe() */
smp_wmb();
while (fn) {
fn->fn_sernum = sernum;
fn = rcu_dereference_protected(fn->parent,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
}
-void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
+void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
@@ -1135,22 +1175,16 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
* Need to own table->tb6_lock
*/
-int fib6_add(struct fib6_node *root, struct rt6_info *rt,
- struct nl_info *info, struct mx6_config *mxc,
- struct netlink_ext_ack *extack)
+int fib6_add(struct fib6_node *root, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack)
{
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_table *table = rt->fib6_table;
struct fib6_node *fn, *pn = NULL;
int err = -ENOMEM;
int allow_create = 1;
int replace_required = 0;
int sernum = fib6_new_sernum(info->nl_net);
- if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
- return -EINVAL;
- if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0;
@@ -1161,8 +1195,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
fn = fib6_add_1(info->nl_net, table, root,
- &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
- offsetof(struct rt6_info, rt6i_dst), allow_create,
+ &rt->fib6_dst.addr, rt->fib6_dst.plen,
+ offsetof(struct fib6_info, fib6_dst), allow_create,
replace_required, extack);
if (IS_ERR(fn)) {
err = PTR_ERR(fn);
@@ -1173,7 +1207,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
pn = fn;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt->rt6i_src.plen) {
+ if (rt->fib6_src.plen) {
struct fib6_node *sn;
if (!rcu_access_pointer(fn->subtree)) {
@@ -1194,16 +1228,16 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (!sfn)
goto failure;
- atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+ atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
rcu_assign_pointer(sfn->leaf,
- info->nl_net->ipv6.ip6_null_entry);
+ info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
/* Now add the first leaf node to new subtree */
sn = fib6_add_1(info->nl_net, table, sfn,
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1221,8 +1255,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
rcu_assign_pointer(fn->subtree, sfn);
} else {
sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
- &rt->rt6i_src.addr, rt->rt6i_src.plen,
- offsetof(struct rt6_info, rt6i_src),
+ &rt->fib6_src.addr, rt->fib6_src.plen,
+ offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
@@ -1235,9 +1269,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (fn->fn_flags & RTN_TL_ROOT) {
/* put back null_entry for root node */
rcu_assign_pointer(fn->leaf,
- info->nl_net->ipv6.ip6_null_entry);
+ info->nl_net->ipv6.fib6_null_entry);
} else {
- atomic_inc(&rt->rt6i_ref);
+ atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(fn->leaf, rt);
}
}
@@ -1245,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, mxc, extack);
+ err = fib6_add_rt2node(fn, rt, info, extack);
if (!err) {
__fib6_update_sernum_upto_root(rt, sernum);
fib6_start_gc(info->nl_net, rt);
@@ -1259,13 +1293,13 @@ out:
* super-tree leaf node we have to find a new one for it.
*/
if (pn != fn) {
- struct rt6_info *pn_leaf =
+ struct fib6_info *pn_leaf =
rcu_dereference_protected(pn->leaf,
lockdep_is_held(&table->tb6_lock));
if (pn_leaf == rt) {
pn_leaf = NULL;
RCU_INIT_POINTER(pn->leaf, NULL);
- atomic_dec(&rt->rt6i_ref);
+ fib6_info_release(rt);
}
if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
pn_leaf = fib6_find_prefix(info->nl_net, table,
@@ -1274,10 +1308,10 @@ out:
if (!pn_leaf) {
WARN_ON(!pn_leaf);
pn_leaf =
- info->nl_net->ipv6.ip6_null_entry;
+ info->nl_net->ipv6.fib6_null_entry;
}
#endif
- atomic_inc(&pn_leaf->rt6i_ref);
+ fib6_info_hold(pn_leaf);
rcu_assign_pointer(pn->leaf, pn_leaf);
}
}
@@ -1299,10 +1333,6 @@ failure:
(fn->fn_flags & RTN_TL_ROOT &&
!rcu_access_pointer(fn->leaf))))
fib6_repair_tree(info->nl_net, table, fn);
- /* Always release dst as dst->__refcnt is guaranteed
- * to be taken before entering this function
- */
- dst_release_immediate(&rt->dst);
return err;
}
@@ -1312,12 +1342,12 @@ failure:
*/
struct lookup_args {
- int offset; /* key offset on rt6_info */
+ int offset; /* key offset on fib6_info */
const struct in6_addr *addr; /* search key */
};
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
- struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
{
struct fib6_node *fn;
__be32 dir;
@@ -1350,7 +1380,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
struct fib6_node *subtree = FIB6_SUBTREE(fn);
if (subtree || fn->fn_flags & RTN_RTINFO) {
- struct rt6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
if (!leaf)
@@ -1362,7 +1392,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
#ifdef CONFIG_IPV6_SUBTREES
if (subtree) {
struct fib6_node *sfn;
- sfn = fib6_lookup_1(subtree, args + 1);
+ sfn = fib6_node_lookup_1(subtree,
+ args + 1);
if (!sfn)
goto backtrack;
fn = sfn;
@@ -1384,18 +1415,19 @@ backtrack:
/* called with rcu_read_lock() held
*/
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct fib6_node *fn;
struct lookup_args args[] = {
{
- .offset = offsetof(struct rt6_info, rt6i_dst),
+ .offset = offsetof(struct fib6_info, fib6_dst),
.addr = daddr,
},
#ifdef CONFIG_IPV6_SUBTREES
{
- .offset = offsetof(struct rt6_info, rt6i_src),
+ .offset = offsetof(struct fib6_info, fib6_src),
.addr = saddr,
},
#endif
@@ -1404,7 +1436,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
}
};
- fn = fib6_lookup_1(root, daddr ? args : args + 1);
+ fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
@@ -1431,7 +1463,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
struct fib6_node *fn, *prev = NULL;
for (fn = root; fn ; ) {
- struct rt6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct rt6key *key;
/* This node is being deleted */
@@ -1480,7 +1512,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
struct fib6_node *fn;
fn = fib6_locate_1(root, daddr, dst_len,
- offsetof(struct rt6_info, rt6i_dst),
+ offsetof(struct fib6_info, fib6_dst),
exact_match);
#ifdef CONFIG_IPV6_SUBTREES
@@ -1491,7 +1523,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
if (subtree) {
fn = fib6_locate_1(subtree, saddr, src_len,
- offsetof(struct rt6_info, rt6i_src),
+ offsetof(struct fib6_info, fib6_src),
exact_match);
}
}
@@ -1510,14 +1542,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
*
*/
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
struct fib6_table *table,
struct fib6_node *fn)
{
struct fib6_node *child_left, *child_right;
if (fn->fn_flags & RTN_ROOT)
- return net->ipv6.ip6_null_entry;
+ return net->ipv6.fib6_null_entry;
while (fn) {
child_left = rcu_dereference_protected(fn->left,
@@ -1554,7 +1586,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
/* Set fn->leaf to null_entry for root node. */
if (fn->fn_flags & RTN_TL_ROOT) {
- rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry);
+ rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
return fn;
}
@@ -1569,11 +1601,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
lockdep_is_held(&table->tb6_lock));
struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+ struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+ struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
lockdep_is_held(&table->tb6_lock));
- struct rt6_info *new_fn_leaf;
+ struct fib6_info *new_fn_leaf;
RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
iter++;
@@ -1599,10 +1631,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
#if RT6_DEBUG >= 2
if (!new_fn_leaf) {
WARN_ON(!new_fn_leaf);
- new_fn_leaf = net->ipv6.ip6_null_entry;
+ new_fn_leaf = net->ipv6.fib6_null_entry;
}
#endif
- atomic_inc(&new_fn_leaf->rt6i_ref);
+ fib6_info_hold(new_fn_leaf);
rcu_assign_pointer(fn->leaf, new_fn_leaf);
return pn;
}
@@ -1658,26 +1690,24 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
return pn;
RCU_INIT_POINTER(pn->leaf, NULL);
- rt6_release(pn_leaf);
+ fib6_info_release(pn_leaf);
fn = pn;
}
}
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
- struct rt6_info __rcu **rtp, struct nl_info *info)
+ struct fib6_info __rcu **rtp, struct nl_info *info)
{
struct fib6_walker *w;
- struct rt6_info *rt = rcu_dereference_protected(*rtp,
+ struct fib6_info *rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
RT6_TRACE("fib6_del_route\n");
- WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
-
/* Unlink it */
- *rtp = rt->rt6_next;
- rt->rt6i_node = NULL;
+ *rtp = rt->fib6_next;
+ rt->fib6_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1689,14 +1719,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fn->rr_ptr = NULL;
/* Remove this entry from other siblings */
- if (rt->rt6i_nsiblings) {
- struct rt6_info *sibling, *next_sibling;
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *sibling, *next_sibling;
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings, rt6i_siblings)
- sibling->rt6i_nsiblings--;
- rt->rt6i_nsiblings = 0;
- list_del_init(&rt->rt6i_siblings);
+ &rt->fib6_siblings, fib6_siblings)
+ sibling->fib6_nsiblings--;
+ rt->fib6_nsiblings = 0;
+ list_del_init(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
@@ -1705,7 +1735,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
- w->leaf = rcu_dereference_protected(rt->rt6_next,
+ w->leaf = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
w->state = FWS_U;
@@ -1730,46 +1760,36 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
- rt6_release(rt);
+ fib6_info_release(rt);
}
/* Need to own table->tb6_lock */
-int fib6_del(struct rt6_info *rt, struct nl_info *info)
+int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
- struct fib6_table *table = rt->rt6i_table;
+ struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ struct fib6_table *table = rt->fib6_table;
struct net *net = info->nl_net;
- struct rt6_info __rcu **rtp;
- struct rt6_info __rcu **rtp_next;
+ struct fib6_info __rcu **rtp;
+ struct fib6_info __rcu **rtp_next;
-#if RT6_DEBUG >= 2
- if (rt->dst.obsolete > 0) {
- WARN_ON(fn);
- return -ENOENT;
- }
-#endif
- if (!fn || rt == net->ipv6.ip6_null_entry)
+ if (!fn || rt == net->ipv6.fib6_null_entry)
return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
- /* remove cached dst from exception table */
- if (rt->rt6i_flags & RTF_CACHE)
- return rt6_remove_exception_rt(rt);
-
/*
* Walk the leaf entries looking for ourself
*/
for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
- struct rt6_info *cur = rcu_dereference_protected(*rtp,
+ struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
if (rt == cur) {
fib6_del_route(table, fn, rtp, info);
return 0;
}
- rtp_next = &cur->rt6_next;
+ rtp_next = &cur->fib6_next;
}
return -ENOENT;
}
@@ -1907,7 +1927,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w)
static int fib6_clean_node(struct fib6_walker *w)
{
int res;
- struct rt6_info *rt;
+ struct fib6_info *rt;
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
@@ -1932,17 +1952,17 @@ static int fib6_clean_node(struct fib6_walker *w)
#if RT6_DEBUG >= 2
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
__func__, rt,
- rcu_access_pointer(rt->rt6i_node),
+ rcu_access_pointer(rt->fib6_node),
res);
#endif
continue;
}
return 0;
} else if (res == -2) {
- if (WARN_ON(!rt->rt6i_nsiblings))
+ if (WARN_ON(!rt->fib6_nsiblings))
continue;
- rt = list_last_entry(&rt->rt6i_siblings,
- struct rt6_info, rt6i_siblings);
+ rt = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info, fib6_siblings);
continue;
}
WARN_ON(res != 0);
@@ -1961,7 +1981,7 @@ static int fib6_clean_node(struct fib6_walker *w)
*/
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
- int (*func)(struct rt6_info *, void *arg),
+ int (*func)(struct fib6_info *, void *arg),
int sernum, void *arg)
{
struct fib6_cleaner c;
@@ -1979,7 +1999,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
}
static void __fib6_clean_all(struct net *net,
- int (*func)(struct rt6_info *, void *),
+ int (*func)(struct fib6_info *, void *),
int sernum, void *arg)
{
struct fib6_table *table;
@@ -1999,7 +2019,7 @@ static void __fib6_clean_all(struct net *net,
rcu_read_unlock();
}
-void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
+void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
@@ -2016,7 +2036,7 @@ static void fib6_flush_trees(struct net *net)
* Garbage collection
*/
-static int fib6_age(struct rt6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, void *arg)
{
struct fib6_gc_args *gc_args = arg;
unsigned long now = jiffies;
@@ -2026,8 +2046,8 @@ static int fib6_age(struct rt6_info *rt, void *arg)
* Routes are expired even if they are in use.
*/
- if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
- if (time_after(now, rt->dst.expires)) {
+ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
+ if (time_after(now, rt->expires)) {
RT6_TRACE("expiring %p\n", rt);
return -1;
}
@@ -2110,7 +2130,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2122,7 +2142,7 @@ static int __net_init fib6_net_init(struct net *net)
goto out_fib6_main_tbl;
net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
- net->ipv6.ip6_null_entry);
+ net->ipv6.fib6_null_entry);
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2211,25 +2231,26 @@ void fib6_gc_cleanup(void)
#ifdef CONFIG_PROC_FS
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
- struct rt6_info *rt = v;
+ struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ const struct net_device *dev;
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
- seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+ seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->rt6i_flags & RTF_GATEWAY)
- seq_printf(seq, "%pi6", &rt->rt6i_gateway);
+ if (rt->fib6_flags & RTF_GATEWAY)
+ seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
else
seq_puts(seq, "00000000000000000000000000000000");
+ dev = rt->fib6_nh.nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
- rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
- rt->dst.__use, rt->rt6i_flags,
- rt->dst.dev ? rt->dst.dev->name : "");
+ rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+ rt->fib6_flags, dev ? dev->name : "");
iter->w.leaf = NULL;
return 0;
}
@@ -2243,7 +2264,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
do {
iter->w.leaf = rcu_dereference_protected(
- iter->w.leaf->rt6_next,
+ iter->w.leaf->fib6_next,
lockdep_is_held(&iter->tbl->tb6_lock));
iter->skip--;
if (!iter->skip && iter->w.leaf)
@@ -2302,14 +2323,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
int r;
- struct rt6_info *n;
+ struct fib6_info *n;
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
if (!v)
goto iter_table;
- n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
+ n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
if (n) {
++*pos;
return n;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 458de353f5d9..c8cf2fdbb13b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -848,7 +848,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
}
/**
- * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
* @t: the outgoing tunnel device
* @hdr: IPv6 header from the incoming packet
*
@@ -937,6 +937,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct flowi6 fl6;
int err = -EINVAL;
__u32 mtu;
+ int nhoff;
+ int thoff;
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
@@ -949,6 +951,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
truncate = true;
}
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IP) &&
+ (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+ truncate = true;
+
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+ truncate = true;
+
if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
goto tx_err;
@@ -1376,6 +1388,7 @@ static void ip6gre_dev_free(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+ gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
}
@@ -1443,11 +1456,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
return -ENOMEM;
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (ret) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
- return ret;
- }
+ if (ret)
+ goto cleanup_alloc_pcpu_stats;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
t_hlen = ip6gre_calc_hlen(tunnel);
dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1463,6 +1477,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
ip6gre_tnl_init_features(dev);
return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
}
static int ip6gre_tunnel_init(struct net_device *dev)
@@ -1822,11 +1843,12 @@ static int ip6erspan_tap_init(struct net_device *dev)
return -ENOMEM;
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (ret) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
- return ret;
- }
+ if (ret)
+ goto cleanup_alloc_pcpu_stats;
+
+ ret = gro_cells_init(&tunnel->gro_cells, dev);
+ if (ret)
+ goto cleanup_dst_cache_init;
t_hlen = ip6erspan_calc_hlen(tunnel);
dev->mtu = ETH_DATA_LEN - t_hlen;
@@ -1839,6 +1861,13 @@ static int ip6erspan_tap_init(struct net_device *dev)
ip6erspan_tnl_link_config(tunnel, 1);
return 0;
+
+cleanup_dst_cache_init:
+ dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+ free_percpu(dev->tstats);
+ dev->tstats = NULL;
+ return ret;
}
static const struct net_device_ops ip6erspan_netdev_ops = {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9ee208a348f5..f08d34491ece 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb)
bool deliver;
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
- ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
+ __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
skb->len);
hdr = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..5b3f2f89ef41 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
if (skb->encapsulation &&
skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
- udpfrag = proto == IPPROTO_UDP && encap;
+ udpfrag = proto == IPPROTO_UDP && encap &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7b6d1689087b..021e5aef6ba3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-{
- unsigned int mtu;
- struct inet6_dev *idev;
-
- if (dst_metric_locked(dst, RTAX_MTU)) {
- mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
- return mtu;
- }
-
- mtu = IPV6_MIN_MTU;
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- if (idev)
- mtu = idev->cnf.mtu6;
- rcu_read_unlock();
-
- return mtu;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
-
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
@@ -425,6 +403,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
int ip6_forward(struct sk_buff *skb)
{
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
@@ -444,8 +423,7 @@ int ip6_forward(struct sk_buff *skb)
goto drop;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -476,8 +454,7 @@ int ip6_forward(struct sk_buff *skb)
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
kfree_skb(skb);
return -ETIMEDOUT;
@@ -490,15 +467,13 @@ int ip6_forward(struct sk_buff *skb)
if (proxied > 0)
return ip6_input(skb);
else if (proxied < 0) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
if (!xfrm6_route_forward(skb)) {
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INDISCARDS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
dst = skb_dst(skb);
@@ -507,7 +482,8 @@ int ip6_forward(struct sk_buff *skb)
send redirects to source routed frames.
We don't send redirects to frames decapsulated from IPsec.
*/
- if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
+ if (IP6CB(skb)->iif == dst->dev->ifindex &&
+ opt->srcrt == 0 && !skb_sec_path(skb)) {
struct in6_addr *target = NULL;
struct inet_peer *peer;
struct rt6_info *rt;
@@ -554,8 +530,7 @@ int ip6_forward(struct sk_buff *skb)
/* Again, force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INTOOBIGERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
@@ -579,7 +554,7 @@ int ip6_forward(struct sk_buff *skb)
ip6_forward_finish);
error:
- __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
kfree_skb(skb);
return -EINVAL;
@@ -966,15 +941,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* that's why we try it again later.
*/
if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ struct fib6_info *from;
struct rt6_info *rt;
bool had_dst = *dst != NULL;
if (!had_dst)
*dst = ip6_route_output(net, sk, fl6);
rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
- err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+
+ rcu_read_lock();
+ from = rt ? rcu_dereference(rt->from) : NULL;
+ err = ip6_route_get_saddr(net, from, &fl6->daddr,
sk ? inet6_sk(sk)->srcprefs : 0,
&fl6->saddr);
+ rcu_read_unlock();
+
if (err)
goto out_err_release;
@@ -1238,6 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
+ cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
@@ -1272,6 +1255,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1279,7 +1263,8 @@ static int __ip6_append_data(struct sock *sk,
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
- mtu = cork->fragsize;
+ paged = !!cork->gso_size;
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1327,7 +1312,7 @@ emsgsize:
if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
headersize == sizeof(struct ipv6hdr) &&
length <= mtu - headersize &&
- !(flags & MSG_MORE) &&
+ (!(flags & MSG_MORE) || cork->gso_size) &&
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
@@ -1370,6 +1355,7 @@ emsgsize:
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1392,11 +1378,17 @@ alloc_new_skb:
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ else if (!paged)
+ alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += dst_exthdrlen;
@@ -1418,7 +1410,7 @@ alloc_new_skb:
*/
alloclen += sizeof(struct frag_hdr);
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
@@ -1457,7 +1449,7 @@ alloc_new_skb:
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
@@ -1480,7 +1472,7 @@ alloc_new_skb:
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
@@ -1754,9 +1746,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
+ struct inet_cork_full *cork,
const struct sockcm_cookie *sockc)
{
- struct inet_cork_full cork;
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1767,27 +1759,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
__skb_queue_head_init(&queue);
- cork.base.flags = 0;
- cork.base.addr = 0;
- cork.base.opt = NULL;
- cork.base.dst = NULL;
+ cork->base.flags = 0;
+ cork->base.addr = 0;
+ cork->base.opt = NULL;
+ cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
if (err) {
- ip6_cork_release(&cork, &v6_cork);
+ ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
}
if (ipc6->dontfrag < 0)
ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+ err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags, ipc6, sockc);
if (err) {
- __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+ __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
}
- return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+ return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ca957dd93a29..b7f28deddaea 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
}
/**
- * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * vti6_ioctl - configure vti6 tunnels from userspace
* @dev: virtual device associated with tunnel
* @ifr: parameters passed from userspace
* @cmd: command to be performed
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 4a15529d33eb..0d0f0053bb11 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
};
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh, struct nlattr **tb)
+ struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -227,8 +228,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
INIT_LIST_HEAD(&net->ipv6.mr6_tables);
mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
- if (!mrt) {
- err = -ENOMEM;
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
goto err1;
}
@@ -301,8 +302,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
static int __net_init ip6mr_rules_init(struct net *net)
{
- net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT);
- return net->ipv6.mrt6 ? 0 : -ENOMEM;
+ struct mr_table *mrt;
+
+ mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv6.mrt6 = mrt;
+ return 0;
}
static void __net_exit ip6mr_rules_exit(struct net *net)
@@ -1733,9 +1739,11 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
rtnl_lock();
ret = 0;
- if (!ip6mr_new_table(net, v))
- ret = -ENOMEM;
- raw6_sk(sk)->ip6mr_table = v;
+ mrt = ip6mr_new_table(net, v);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
+ else
+ raw6_sk(sk)->ip6mr_table = v;
rtnl_unlock();
return ret;
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9de4dfb126ba..e640d2f3c55c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1155,7 +1155,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
struct neighbour *neigh = NULL;
struct inet6_dev *in6_dev;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
+ struct net *net;
int lifetime;
struct ndisc_options ndopts;
int optlen;
@@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
/* Do not accept RA with source-addr found on local machine unless
* accept_ra_from_local is set to true.
*/
+ net = dev_net(in6_dev->dev);
if (!in6_dev->cnf.accept_ra_from_local &&
- ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- in6_dev->dev, 0)) {
+ ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
@@ -1272,20 +1273,22 @@ static void ndisc_router_discovery(struct sk_buff *skb)
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
- rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
+ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
if (rt) {
- neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+ rt->fib6_nh.nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
- ip6_rt_put(rt);
+ fib6_info_release(rt);
return;
}
}
if (rt && lifetime == 0) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
rt = NULL;
}
@@ -1294,7 +1297,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (!rt && lifetime) {
ND_PRINTK(3, info, "RA: adding default router\n");
- rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref);
+ rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
+ skb->dev, pref);
if (!rt) {
ND_PRINTK(0, err,
"RA: %s failed to add default route\n",
@@ -1302,28 +1306,29 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+ rt->fib6_nh.nh_dev, NULL,
+ &ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
- ip6_rt_put(rt);
+ fib6_info_release(rt);
return;
}
neigh->flags |= NTF_ROUTER;
} else if (rt) {
- rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
}
if (rt)
- rt6_set_expires(rt, jiffies + (HZ * lifetime));
+ fib6_set_expires(rt, jiffies + (HZ * lifetime));
if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
ra_msg->icmph.icmp6_hop_limit) {
if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
- if (rt)
- dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
- ra_msg->icmph.icmp6_hop_limit);
+ fib6_metric_set(rt, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
} else {
ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
}
@@ -1475,10 +1480,7 @@ skip_routeinfo:
ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
} else if (in6_dev->cnf.mtu6 != mtu) {
in6_dev->cnf.mtu6 = mtu;
-
- if (rt)
- dst_metric_set(&rt->dst, RTAX_MTU, mtu);
-
+ fib6_metric_set(rt, RTAX_MTU, mtu);
rt6_mtu_change(skb->dev, mtu);
}
}
@@ -1497,7 +1499,7 @@ skip_routeinfo:
ND_PRINTK(2, warn, "RA: invalid RA options\n");
}
out:
- ip6_rt_put(rt);
+ fib6_info_release(rt);
if (neigh)
neigh_release(neigh);
}
@@ -1576,6 +1578,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
bool ret;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+ if (!dev)
+ return;
+ }
+
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
dev->name);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ce77bcc2490c..37b14dc9d863 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV6
tristate "IPv6 socket lookup support"
help
This option enables the IPv6 socket lookup infrastructure. This
- is used by the ip6tables socket match.
+ is used by the {ip6,nf}tables socket match.
+
+config NF_TPROXY_IPV6
+ tristate "IPv6 tproxy support"
if NF_TABLES
@@ -136,10 +139,7 @@ config NF_NAT_IPV6
if NF_NAT_IPV6
config NF_NAT_MASQUERADE_IPV6
- tristate "IPv6 masquerade support"
- help
- This is the kernel functionality to provide NAT in the masquerade
- flavour (automatic source address selection) for IPv6.
+ bool
endif # NF_NAT_IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 44273d6f03a5..10a5a1c87320 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -18,14 +18,15 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
# defrag
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
# logging
obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 97f79dc943d7..0758b5bcfb29 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -529,7 +529,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
.family = NFPROTO_IPV6,
};
- t = ip6t_get_target(e);
return xt_check_target(&par, t->u.target_size - sizeof(*t),
e->ipv6.proto,
e->ipv6.invflags & IP6T_INV_PROTO);
@@ -1794,6 +1793,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
/* set res now, will see skbs right after nf_register_net_hooks */
WRITE_ONCE(*res, new_table);
+ if (!ops)
+ return 0;
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
@@ -1811,7 +1812,8 @@ out_free:
void ip6t_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ops)
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
__ip6t_unregister_table(net, table);
}
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 92c0047e7e33..491f808e356a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d12f511929f5..0fe61ede77c6 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
}
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ if ((flags & XT_RPFILTER_LOOSE) == 0)
+ fl6.flowi6_oif = dev->ifindex;
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
if (rt->dst.error)
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 33719d5560c8..1059894a6f4c 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
+/*
+ * srh1_mt6 - revision 1 match on the IPv6 Segment Routing Header (SRH).
+ *
+ * Finds the routing header in @skb and rejects the packet unless it is a
+ * type 4 header that fits in the packet and whose segments_left does not
+ * exceed first_segment.  Every test enabled in the rule's mt_flags is then
+ * applied (each one optionally inverted through NF_SRH_INVF): next header,
+ * header length, segments left, last entry, tag, and the previous, next
+ * and last SID compared under the masks supplied by the rule.
+ *
+ * Returns true only when all enabled tests pass.  A packet without a
+ * usable SRH, or whose SID list cannot be read, never matches.
+ */
+static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
+	const struct ip6t_srh1 *srhinfo = par->matchinfo;
+	struct in6_addr *psid, *nsid, *lsid;
+	struct in6_addr _psid, _nsid, _lsid;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6_sr_hdr _srh;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return false;
+	srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+	if (!srh)
+		return false;
+
+	hdrlen = ipv6_optlen(srh);
+	if (skb->len - srhoff < hdrlen)
+		return false;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4)
+		return false;
+
+	if (srh->segments_left > srh->first_segment)
+		return false;
+
+	/* Next Header matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+				!(srh->nexthdr == srhinfo->next_hdr)))
+			return false;
+
+	/* Header Extension Length matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+				!(srh->hdrlen == srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+				!(srh->hdrlen > srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+				!(srh->hdrlen < srhinfo->hdr_len)))
+			return false;
+
+	/* Segments Left matching */
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+				!(srh->segments_left == srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+				!(srh->segments_left > srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+				!(srh->segments_left < srhinfo->segs_left)))
+			return false;
+
+	/*
+	 * Last Entry matching
+	 * Last_Entry field was introduced in revision 6 of the SRH draft.
+	 * It was called First_Segment in the previous revision
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+				!(srh->first_segment == srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+				!(srh->first_segment > srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+				!(srh->first_segment < srhinfo->last_entry)))
+			return false;
+
+	/*
+	 * Tag matching
+	 * Tag field was introduced in revision 6 of the SRH draft
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_TAG)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+				!(srh->tag == srhinfo->tag)))
+			return false;
+
+	/* Previous SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_PSID) {
+		if (srh->segments_left == srh->first_segment)
+			return false;
+		psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left + 1) * sizeof(struct in6_addr));
+		psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+		/* The SID may lie beyond the data actually present in the skb. */
+		if (!psid)
+			return false;
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
+				ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
+						     &srhinfo->psid_addr)))
+			return false;
+	}
+
+	/* Next SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NSID) {
+		if (srh->segments_left == 0)
+			return false;
+		nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left - 1) * sizeof(struct in6_addr));
+		nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+		/* Same guard as above: never hand a NULL SID to the compare. */
+		if (!nsid)
+			return false;
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
+				ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
+						     &srhinfo->nsid_addr)))
+			return false;
+	}
+
+	/* Last SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LSID) {
+		lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
+		lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+		/* Same guard as above: never hand a NULL SID to the compare. */
+		if (!lsid)
+			return false;
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
+				ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
+						     &srhinfo->lsid_addr)))
+			return false;
+	}
+	return true;
+}
+
static int srh_mt6_check(const struct xt_mtchk_param *par)
{
const struct ip6t_srh *srhinfo = par->matchinfo;
@@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par)
return 0;
}
-static struct xt_match srh_mt6_reg __read_mostly = {
- .name = "srh",
- .family = NFPROTO_IPV6,
- .match = srh_mt6,
- .matchsize = sizeof(struct ip6t_srh),
- .checkentry = srh_mt6_check,
- .me = THIS_MODULE,
+/*
+ * Rule-insertion check for revision 1 of the "srh" match: refuse any rule
+ * that sets match flags or inversion flags outside the supported masks.
+ * Returns 0 when the rule is acceptable, -EINVAL otherwise.
+ */
+static int srh1_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_srh1 *info = par->matchinfo;
+	int err = 0;
+
+	if ((info->mt_flags & ~IP6T_SRH_MASK) != 0) {
+		pr_info_ratelimited("unknown srh match flags %X\n",
+				    info->mt_flags);
+		err = -EINVAL;
+	} else if ((info->mt_invflags & ~IP6T_SRH_INV_MASK) != 0) {
+		pr_info_ratelimited("unknown srh invflags %X\n",
+				    info->mt_invflags);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static struct xt_match srh_mt6_reg[] __read_mostly = { /* one entry per revision of the "srh" match */
+ { /* revision 0: rule data is struct ip6t_srh, handled by srh_mt6 */
+ .name = "srh",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = srh_mt6,
+ .matchsize = sizeof(struct ip6t_srh),
+ .checkentry = srh_mt6_check,
+ .me = THIS_MODULE,
+ },
+ { /* revision 1: rule data is struct ip6t_srh1, handled by srh1_mt6 */
+ .name = "srh",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = srh1_mt6,
+ .matchsize = sizeof(struct ip6t_srh1),
+ .checkentry = srh1_mt6_check,
+ .me = THIS_MODULE,
+ }
};
static int __init srh_mt6_init(void)
{
- return xt_register_match(&srh_mt6_reg);
+ return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
}
static void __exit srh_mt6_exit(void)
{
- xt_unregister_match(&srh_mt6_reg);
+ xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
}
module_init(srh_mt6_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 47306e45a80a..67ba70ab9f5c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -35,75 +35,63 @@ static const struct xt_table nf_nat_ipv6_table = {
static unsigned int ip6table_nat_do_chain(void *priv,
struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
-}
-
-static unsigned int ip6table_nat_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_out(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_local_fn(void *priv,
- struct sk_buff *skb,
const struct nf_hook_state *state)
{
- return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain);
+ return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
}
static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
- /* Before packet filtering, change destination */
{
- .hook = ip6table_nat_in,
+ .hook = ip6table_nat_do_chain,
.pf = NFPROTO_IPV6,
- .nat_hook = true,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = ip6table_nat_out,
+ .hook = ip6table_nat_do_chain,
.pf = NFPROTO_IPV6,
- .nat_hook = true,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_NAT_SRC,
},
- /* Before packet filtering, change destination */
{
- .hook = ip6table_nat_local_fn,
+ .hook = ip6table_nat_do_chain,
.pf = NFPROTO_IPV6,
- .nat_hook = true,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST,
},
- /* After packet filtering, change source */
{
- .hook = ip6table_nat_fn,
- .nat_hook = true,
+ .hook = ip6table_nat_do_chain,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
},
};
+/*
+ * Register each entry of nf_nat_ipv6_ops for @net through
+ * nf_nat_l3proto_ipv6_register_fn().
+ *
+ * Returns 0 when every entry was registered. On the first failure the
+ * entries registered so far are unregistered again, newest first, and the
+ * error from the failing registration is returned.
+ */
+static int ip6t_nat_register_lookups(struct net *net)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
+ ret = nf_nat_l3proto_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]);
+ if (ret) {
+ /* Entry i failed; roll back entries i-1 .. 0. */
+ while (i)
+ nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]);
+
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* Unregister every entry of nf_nat_ipv6_ops for @net, in array order. */
+static void ip6t_nat_unregister_lookups(struct net *net)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
+ nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]);
+}
+
static int __net_init ip6table_nat_table_init(struct net *net)
{
struct ip6t_replace *repl;
@@ -116,7 +104,17 @@ static int __net_init ip6table_nat_table_init(struct net *net)
if (repl == NULL)
return -ENOMEM;
ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
- nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
+ NULL, &net->ipv6.ip6table_nat);
+ if (ret < 0) {
+ kfree(repl);
+ return ret;
+ }
+
+ ret = ip6t_nat_register_lookups(net);
+ if (ret < 0) {
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
+ net->ipv6.ip6table_nat = NULL;
+ }
kfree(repl);
return ret;
}
@@ -125,7 +123,8 @@ static void __net_exit ip6table_nat_net_exit(struct net *net)
{
if (!net->ipv6.ip6table_nat)
return;
- ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+ ip6t_nat_unregister_lookups(net);
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
net->ipv6.ip6table_nat = NULL;
}
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 207cb35569b1..c511d206bf9b 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -3,256 +3,12 @@
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
-#include <linux/ipv6.h>
-#include <linux/netdevice.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
- new_addr->s6_addr32, true);
-
- return 0;
-}
-
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
- new_addr->s6_addr32, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- switch (ip6h->nexthdr) {
- case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- struct in6_addr addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = ip6h->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
- ip6h->saddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = ip6h->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
- ip6h->daddr = new_addr;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- struct in6_addr addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = ip6h->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
- ip6h->daddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = ip6h->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
- ip6h->saddr = new_addr;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
-{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
- unsigned int thoff = sizeof(*ip6h);
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- return 0;
-}
-
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
-{
- struct flow_ports *ports;
- struct ipv6hdr *ip6h;
- unsigned int thoff;
-
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
- return -1;
-
- ip6h = ipv6_hdr(skb);
-
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
- return -1;
-
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
- return -1;
-
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
- tuple->src_v6 = ip6h->saddr;
- tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
- tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
- tuple->iifidx = dev->ifindex;
-
- return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
- if (skb->len <= mtu)
- return false;
-
- if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
- return false;
-
- return true;
-}
-
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
-{
- u32 mtu;
-
- mtu = ip6_dst_mtu_forward(&rt->dst);
- if (__nf_flow_exceeds_mtu(skb, mtu))
- return true;
-
- return false;
-}
-
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- struct in6_addr *nexthop;
- struct ipv6hdr *ip6h;
- struct rt6_info *rt;
-
- if (skb->protocol != htons(ETH_P_IPV6))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
-
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
- return NF_DROP;
-
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ipv6(flow, skb, dir) < 0)
- return NF_DROP;
-
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- ip6h = ipv6_hdr(skb);
- ip6h->hop_limit--;
-
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-
- return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ipv6_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 6b7f075f811f..ca6d38698b1a 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
#endif
static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
@@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
if (tb[CTA_NAT_V6_MINIP]) {
nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
@@ -252,18 +252,12 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
-unsigned int
+static unsigned int
nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
__be16 frag_off;
int hdrlen;
u8 nexthdr;
@@ -277,11 +271,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
if (!ct)
return NF_ACCEPT;
- nat = nfct_nat(ct);
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
+ if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
nexthdr = ipv6_hdr(skb)->nexthdr;
hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
&nexthdr, &frag_off);
@@ -294,77 +284,29 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
else
return NF_ACCEPT;
}
- /* Only ICMPs can be IP_CT_IS_REPLY: */
- /* fall through */
- case IP_CT_NEW:
- /* Seen it before? This can happen for loopback, retrans,
- * or local packets.
- */
- if (!nf_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- ret = do_chain(priv, skb, state, ct);
- if (ret != NF_ACCEPT)
- return ret;
-
- if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
- break;
-
- ret = nf_nat_alloc_null_binding(ct, state->hook);
- if (ret != NF_ACCEPT)
- return ret;
- } else {
- pr_debug("Already setup manip %s for ct %p\n",
- maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
- goto oif_changed;
- }
- break;
-
- default:
- /* ESTABLISHED */
- WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
- ctinfo != IP_CT_ESTABLISHED_REPLY);
- if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
- goto oif_changed;
}
- return nf_nat_packet(ct, ctinfo, state->hook, skb);
-
-oif_changed:
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_DROP;
+ return nf_nat_inet_fn(priv, skb, state);
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
-unsigned int
+static unsigned int
nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
unsigned int ret;
struct in6_addr daddr = ipv6_hdr(skb)->daddr;
- ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
skb_dst_drop(skb);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_in);
-unsigned int
+static unsigned int
nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
#ifdef CONFIG_XFRM
const struct nf_conn *ct;
@@ -373,7 +315,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
#endif
unsigned int ret;
- ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state);
#ifdef CONFIG_XFRM
if (ret != NF_DROP && ret != NF_STOLEN &&
!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -393,22 +335,17 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
#endif
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_out);
-unsigned int
+static unsigned int
nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state,
- unsigned int (*do_chain)(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct))
+ const struct nf_hook_state *state)
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
int err;
- ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+ ret = nf_nat_ipv6_fn(priv, skb, state);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -432,7 +369,49 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn);
+
+/*
+ * IPv6 NAT hook table: one entry per hook point (PRE_ROUTING, POST_ROUTING,
+ * LOCAL_OUT, LOCAL_IN), each bound to one of the nf_nat_ipv6_* handlers
+ * defined above. The table and its entry count are handed to
+ * nf_nat_register_fn() by nf_nat_l3proto_ipv6_register_fn().
+ */
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_out,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_local_fn,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_fn,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+};
+
+/*
+ * Register @ops for @net by passing it to nf_nat_register_fn() together with
+ * the nf_nat_ipv6_ops table and its entry count. Returns whatever
+ * nf_nat_register_fn() returns.
+ */
+int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_register_fn);
+
+/*
+ * Unregister @ops for @net by passing it to nf_nat_unregister_fn() together
+ * with the entry count of the nf_nat_ipv6_ops table.
+ */
+void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn);
static int __init nf_nat_l3proto_ipv6_init(void)
{
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 98f61fcb9108..e6eb7cf9b54f 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -10,7 +10,6 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/atomic.h>
#include <linux/netdevice.h>
#include <linux/ipv6.h>
@@ -26,14 +25,14 @@
static atomic_t v6_worker_count;
unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
const struct net_device *out)
{
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
struct in6_addr src;
struct nf_conn *ct;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
@@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void)
unregister_netdevice_notifier(&masq_dev_notifier);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 57593b00c5b4..d9bf42ba44fa 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
static void
icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
new file mode 100644
index 000000000000..bf1d6c421e3b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -0,0 +1,146 @@
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+/*
+ * Choose the local IPv6 address for a tproxy lookup.
+ *
+ * @skb: packet; only skb->dev is consulted
+ * @user_laddr: caller-supplied address, returned as-is unless it is the
+ * unspecified (any) address
+ * @daddr: fallback address
+ *
+ * When @user_laddr is the any address, returns the first address on
+ * skb->dev's inet6 address list that is neither tentative nor deprecated.
+ * If the device has no inet6_dev or no such address, returns @daddr.
+ */
+const struct in6_addr *
+nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ indev = __in6_dev_get(skb->dev);
+ if (indev) {
+ /* Walk the device's address list under indev->lock. */
+ read_lock_bh(&indev->lock);
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ read_unlock_bh(&indev->lock);
+ }
+
+ return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr6);
+
+/*
+ * Handle a packet that matched the TIME_WAIT socket @sk.
+ *
+ * Reads the TCP header at offset @thoff. If it cannot be read, the reference
+ * on the timewait socket is dropped and NULL is returned. If the segment is
+ * a plain SYN (no RST, ACK or FIN), a listener is looked up with
+ * nf_tproxy_get_sock_v6(), using @laddr (resolved through
+ * nf_tproxy_laddr6()) and @lport, or the packet's destination port when
+ * @lport is 0. When a listener is found, the timewait socket is descheduled
+ * and released and the listener is returned. Otherwise @sk is returned
+ * unchanged.
+ */
+struct sock *
+nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ struct net *net,
+ const struct in6_addr *laddr,
+ const __be16 lport,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+ &iph->saddr,
+ nf_tproxy_laddr6(skb, laddr, &iph->daddr),
+ hp->source,
+ lport ? lport : hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
+
+/*
+ * Look up the socket for an IPv6 TCP or UDP flow.
+ *
+ * @hp: transport header; only read (as a struct tcphdr) for the TCP
+ * listener lookup
+ * @protocol: IPPROTO_TCP or IPPROTO_UDP; anything else triggers WARN_ON()
+ * and yields NULL
+ * @lookup_type: NF_TPROXY_LOOKUP_LISTENER or NF_TPROXY_LOOKUP_ESTABLISHED;
+ * any other value in the TCP case hits BUG()
+ *
+ * Returns the matching socket or NULL. A TCP listener is only returned if a
+ * reference could be taken on it. A UDP socket whose connected/wildcard
+ * state does not fit @lookup_type is released and NULL is returned instead.
+ */
+struct sock *
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+ const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+ struct tcphdr *tcph;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NF_TPROXY_LOOKUP_LISTENER:
+ tcph = hp;
+ sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+ thoff + __tcp_hdrlen(tcph),
+ saddr, sport,
+ daddr, ntohs(dport),
+ in->ifindex, 0);
+
+ /* Take a reference; give up on the socket if that fails. */
+ if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NF_TPROXY_LOOKUP_ESTABLISHED:
+ sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, ntohs(dport),
+ in->ifindex, 0);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ /* Reject sockets whose state does not match the
+ * requested lookup type and drop their reference.
+ */
+ if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+ protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
+
+/* Module metadata; this file provides the IPv6 (nf_tproxy_*6) helpers. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support");
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 3557b114446c..8a081ad7d5db 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -26,8 +26,7 @@
static unsigned int nft_nat_do_chain(void *priv,
struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct nf_conn *ct)
+ const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;
@@ -37,42 +36,14 @@ static unsigned int nft_nat_do_chain(void *priv,
return nft_do_chain(&pkt, priv);
}
-static unsigned int nft_nat_ipv6_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_out(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_local_fn(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static int nft_nat_ipv6_init(struct nft_ctx *ctx)
+static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops)
{
- return nf_ct_netns_get(ctx->net, ctx->family);
+ return nf_nat_l3proto_ipv6_register_fn(net, ops);
}
-static void nft_nat_ipv6_free(struct nft_ctx *ctx)
+static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops)
{
- nf_ct_netns_put(ctx->net, ctx->family);
+ nf_nat_l3proto_ipv6_unregister_fn(net, ops);
}
static const struct nft_chain_type nft_chain_nat_ipv6 = {
@@ -85,13 +56,13 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in,
- [NF_INET_POST_ROUTING] = nft_nat_ipv6_out,
- [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn,
- [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn,
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
},
- .init = nft_nat_ipv6_init,
- .free = nft_nat_ipv6_free,
+ .ops_register = nft_nat_ipv6_reg,
+ .ops_unregister = nft_nat_ipv6_unreg,
};
static int __init nft_chain_nat_ipv6_init(void)
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 4146536e9c15..dd0122f3cffe 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index a27e424f690d..74269865acc8 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
const struct nft_pktinfo *pkt)
{
struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
if (priv->sreg_proto_min) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4979610287e2..b939b94e7e91 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -163,7 +163,8 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
}
static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
- struct frag_hdr *fhdr, int nhoff)
+ struct frag_hdr *fhdr, int nhoff,
+ u32 *prob_offset)
{
struct sk_buff *prev, *next;
struct net_device *dev;
@@ -179,11 +180,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
if ((unsigned int)end > IPV6_MAXPLEN) {
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
- ((u8 *)&fhdr->frag_off -
- skb_network_header(skb)));
+ *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
return -1;
}
@@ -214,10 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
/* RFC2460 says always send parameter problem in
* this case. -DaveM
*/
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
- offsetof(struct ipv6hdr, payload_len));
+ *prob_offset = offsetof(struct ipv6hdr, payload_len);
return -1;
}
if (end > fq->q.len) {
@@ -519,15 +513,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
+ u32 prob_offset = 0;
int ret;
spin_lock(&fq->q.lock);
fq->iif = iif;
- ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+ ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
+ &prob_offset);
spin_unlock(&fq->q.lock);
inet_frag_put(&fq->q);
+ if (prob_offset) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
+ }
return ret;
}
@@ -536,7 +537,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return -1;
fail_hdr:
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
IPSTATS_MIB_INHDRERRORS);
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
return -1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a6598762d2c1..fb956989adaf 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,14 +63,20 @@
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
-#include <trace/events/fib6.h>
-
+#include <net/ip.h>
#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+static int ip6_rt_type_to_error(u8 fib6_type);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#undef CREATE_TRACE_POINTS
+
enum rt6_nud_state {
RT6_NUD_FAIL_HARD = -3,
RT6_NUD_FAIL_PROBE = -2,
@@ -78,7 +84,6 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
@@ -97,25 +102,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
-static void rt6_dst_from_metrics_check(struct rt6_info *rt);
-static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
-static size_t rt6_nlmsg_size(struct rt6_info *rt);
-static int rt6_fill_node(struct net *net,
- struct sk_buff *skb, struct rt6_info *rt,
- struct in6_addr *dst, struct in6_addr *src,
+static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
+static size_t rt6_nlmsg_size(struct fib6_info *rt);
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+ struct fib6_info *rt, struct dst_entry *dst,
+ struct in6_addr *dest, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
-static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
struct in6_addr *daddr,
struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct fib6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev,
unsigned int pref);
-static struct rt6_info *rt6_get_route_info(struct net *net,
+static struct fib6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev);
@@ -184,29 +188,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
}
}
-static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
-{
- return dst_metrics_write_ptr(&rt->from->dst);
-}
-
-static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
-{
- struct rt6_info *rt = (struct rt6_info *)dst;
-
- if (rt->rt6i_flags & RTF_PCPU)
- return rt6_pcpu_cow_metrics(rt);
- else if (rt->rt6i_flags & RTF_CACHE)
- return NULL;
- else
- return dst_cow_metrics_generic(dst, old);
-}
-
-static inline const void *choose_neigh_daddr(struct rt6_info *rt,
+static inline const void *choose_neigh_daddr(const struct in6_addr *p,
struct sk_buff *skb,
const void *daddr)
{
- struct in6_addr *p = &rt->rt6i_gateway;
-
if (!ipv6_addr_any(p))
return (const void *) p;
else if (skb)
@@ -214,18 +199,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt,
return daddr;
}
-static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
- struct sk_buff *skb,
- const void *daddr)
+struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
+ struct net_device *dev,
+ struct sk_buff *skb,
+ const void *daddr)
{
- struct rt6_info *rt = (struct rt6_info *) dst;
struct neighbour *n;
- daddr = choose_neigh_daddr(rt, skb, daddr);
- n = __ipv6_neigh_lookup(dst->dev, daddr);
+ daddr = choose_neigh_daddr(gw, skb, daddr);
+ n = __ipv6_neigh_lookup(dev, daddr);
if (n)
return n;
- return neigh_create(&nd_tbl, daddr, dst->dev);
+ return neigh_create(&nd_tbl, daddr, dev);
+}
+
+/*
+ * dst_ops neigh_lookup callback: recover the rt6_info that embeds @dst and
+ * delegate to ip6_neigh_lookup() with that route's gateway and dst->dev.
+ */
+static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
+
+ return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
+}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
@@ -233,7 +227,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
struct net_device *dev = dst->dev;
struct rt6_info *rt = (struct rt6_info *)dst;
- daddr = choose_neigh_daddr(rt, NULL, daddr);
+ daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
if (!daddr)
return;
if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -250,7 +244,7 @@ static struct dst_ops ip6_dst_ops_template = {
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.mtu = ip6_mtu,
- .cow_metrics = ipv6_cow_metrics,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
@@ -258,7 +252,7 @@ static struct dst_ops ip6_dst_ops_template = {
.update_pmtu = ip6_rt_update_pmtu,
.redirect = rt6_do_redirect,
.local_out = __ip6_local_out,
- .neigh_lookup = ip6_neigh_lookup,
+ .neigh_lookup = ip6_dst_neigh_lookup,
.confirm_neigh = ip6_confirm_neigh,
};
@@ -288,13 +282,22 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.redirect = ip6_rt_blackhole_redirect,
.cow_metrics = dst_cow_metrics_generic,
- .neigh_lookup = ip6_neigh_lookup,
+ .neigh_lookup = ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
[RTAX_HOPLIMIT - 1] = 0,
};
+static const struct fib6_info fib6_null_entry_template = {
+ .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .fib6_protocol = RTPROT_KERNEL,
+ .fib6_metric = ~(u32)0,
+ .fib6_ref = ATOMIC_INIT(1),
+ .fib6_type = RTN_UNREACHABLE,
+ .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
+};
+
static const struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
@@ -305,9 +308,6 @@ static const struct rt6_info ip6_null_entry_template = {
.output = ip6_pkt_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_protocol = RTPROT_KERNEL,
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -322,9 +322,6 @@ static const struct rt6_info ip6_prohibit_entry_template = {
.output = ip6_pkt_prohibit_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_protocol = RTPROT_KERNEL,
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -337,9 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
.output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
- .rt6i_protocol = RTPROT_KERNEL,
- .rt6i_metric = ~(u32) 0,
- .rt6i_ref = ATOMIC_INIT(1),
};
#endif
@@ -349,14 +343,12 @@ static void rt6_info_init(struct rt6_info *rt)
struct dst_entry *dst = &rt->dst;
memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
- INIT_LIST_HEAD(&rt->rt6i_siblings);
INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
-static struct rt6_info *__ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags)
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+ int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
1, DST_OBSOLETE_FORCE_CHK, flags);
@@ -368,34 +360,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
return rt;
}
-
-struct rt6_info *ip6_dst_alloc(struct net *net,
- struct net_device *dev,
- int flags)
-{
- struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
-
- if (rt) {
- rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
- if (!rt->rt6i_pcpu) {
- dst_release_immediate(&rt->dst);
- return NULL;
- }
- }
-
- return rt;
-}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
- struct rt6_exception_bucket *bucket;
- struct rt6_info *from = rt->from;
+ struct fib6_info *from;
struct inet6_dev *idev;
dst_destroy_metrics_generic(dst);
- free_percpu(rt->rt6i_pcpu);
rt6_uncached_list_del(rt);
idev = rt->rt6i_idev;
@@ -403,14 +376,12 @@ static void ip6_dst_destroy(struct dst_entry *dst)
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
- if (bucket) {
- rt->rt6i_exception_bucket = NULL;
- kfree(bucket);
- }
- rt->from = NULL;
- dst_release(&from->dst);
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+ rcu_assign_pointer(rt->from, NULL);
+ fib6_info_release(from);
+ rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -440,23 +411,27 @@ static bool __rt6_check_expired(const struct rt6_info *rt)
static bool rt6_check_expired(const struct rt6_info *rt)
{
+ struct fib6_info *from;
+
+ from = rcu_dereference(rt->from);
+
if (rt->rt6i_flags & RTF_EXPIRES) {
if (time_after(jiffies, rt->dst.expires))
return true;
- } else if (rt->from) {
+ } else if (from) {
return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
- rt6_check_expired(rt->from);
+ fib6_check_expired(from);
}
return false;
}
-static struct rt6_info *rt6_multipath_select(const struct net *net,
- struct rt6_info *match,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb,
- int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+ struct fib6_info *match,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb,
+ int strict)
{
- struct rt6_info *sibling, *next_sibling;
+ struct fib6_info *sibling, *next_sibling;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
@@ -464,12 +439,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
if (!fl6->mp_hash)
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
- if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
+ if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
return match;
- list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
- rt6i_siblings) {
- if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
+ list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
+ fib6_siblings) {
+ int nh_upper_bound;
+
+ nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
+ if (fl6->mp_hash > nh_upper_bound)
continue;
if (rt6_score_route(sibling, oif, strict) < 0)
break;
@@ -484,38 +462,27 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
* Route lookup. rcu_read_lock() should be held.
*/
-static inline struct rt6_info *rt6_device_match(struct net *net,
- struct rt6_info *rt,
+static inline struct fib6_info *rt6_device_match(struct net *net,
+ struct fib6_info *rt,
const struct in6_addr *saddr,
int oif,
int flags)
{
- struct rt6_info *local = NULL;
- struct rt6_info *sprt;
+ struct fib6_info *sprt;
- if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
+ if (!oif && ipv6_addr_any(saddr) &&
+ !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
return rt;
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
- struct net_device *dev = sprt->dst.dev;
+ for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
+ const struct net_device *dev = sprt->fib6_nh.nh_dev;
- if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
+ if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
continue;
if (oif) {
if (dev->ifindex == oif)
return sprt;
- if (dev->flags & IFF_LOOPBACK) {
- if (!sprt->rt6i_idev ||
- sprt->rt6i_idev->dev->ifindex != oif) {
- if (flags & RT6_LOOKUP_F_IFACE)
- continue;
- if (local &&
- local->rt6i_idev->dev->ifindex == oif)
- continue;
- }
- local = sprt;
- }
} else {
if (ipv6_chk_addr(net, saddr, dev,
flags & RT6_LOOKUP_F_IFACE))
@@ -523,15 +490,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
}
}
- if (oif) {
- if (local)
- return local;
-
- if (flags & RT6_LOOKUP_F_IFACE)
- return net->ipv6.ip6_null_entry;
- }
+ if (oif && flags & RT6_LOOKUP_F_IFACE)
+ return net->ipv6.fib6_null_entry;
- return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
+ return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -553,10 +515,13 @@ static void rt6_probe_deferred(struct work_struct *w)
kfree(work);
}
-static void rt6_probe(struct rt6_info *rt)
+static void rt6_probe(struct fib6_info *rt)
{
struct __rt6_probe_work *work;
+ const struct in6_addr *nh_gw;
struct neighbour *neigh;
+ struct net_device *dev;
+
/*
* Okay, this does not seem to be appropriate
* for now, however, we need to check if it
@@ -565,20 +530,25 @@ static void rt6_probe(struct rt6_info *rt)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+ if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
return;
+
+ nh_gw = &rt->fib6_nh.nh_gw;
+ dev = rt->fib6_nh.nh_dev;
rcu_read_lock_bh();
- neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+ neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
+ struct inet6_dev *idev;
+
if (neigh->nud_state & NUD_VALID)
goto out;
+ idev = __in6_dev_get(dev);
work = NULL;
write_lock(&neigh->lock);
if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
- neigh->updated +
- rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ neigh->updated + idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work)
__neigh_set_probe_once(neigh);
@@ -590,9 +560,9 @@ static void rt6_probe(struct rt6_info *rt)
if (work) {
INIT_WORK(&work->work, rt6_probe_deferred);
- work->target = rt->rt6i_gateway;
- dev_hold(rt->dst.dev);
- work->dev = rt->dst.dev;
+ work->target = *nh_gw;
+ dev_hold(dev);
+ work->dev = dev;
schedule_work(&work->work);
}
@@ -600,7 +570,7 @@ out:
rcu_read_unlock_bh();
}
#else
-static inline void rt6_probe(struct rt6_info *rt)
+static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif
@@ -608,28 +578,27 @@ static inline void rt6_probe(struct rt6_info *rt)
/*
* Default Router Selection (RFC 2461 6.3.6)
*/
-static inline int rt6_check_dev(struct rt6_info *rt, int oif)
+static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
- struct net_device *dev = rt->dst.dev;
+ const struct net_device *dev = rt->fib6_nh.nh_dev;
+
if (!oif || dev->ifindex == oif)
return 2;
- if ((dev->flags & IFF_LOOPBACK) &&
- rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
- return 1;
return 0;
}
-static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
+static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
- struct neighbour *neigh;
enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
+ struct neighbour *neigh;
- if (rt->rt6i_flags & RTF_NONEXTHOP ||
- !(rt->rt6i_flags & RTF_GATEWAY))
+ if (rt->fib6_flags & RTF_NONEXTHOP ||
+ !(rt->fib6_flags & RTF_GATEWAY))
return RT6_NUD_SUCCEED;
rcu_read_lock_bh();
- neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+ neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
+ &rt->fib6_nh.nh_gw);
if (neigh) {
read_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
@@ -650,8 +619,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
return ret;
}
-static int rt6_score_route(struct rt6_info *rt, int oif,
- int strict)
+static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
int m;
@@ -659,7 +627,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
if (strict & RT6_LOOKUP_F_REACHABLE) {
int n = rt6_check_neigh(rt);
@@ -669,23 +637,37 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
return m;
}
-static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
- int *mpri, struct rt6_info *match,
+/* called with rcu_read_lock held */
+static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
+{
+ const struct net_device *dev = fib6_info_nh_dev(f6i);
+ bool rc = false;
+
+ if (dev) {
+ const struct inet6_dev *idev = __in6_dev_get(dev);
+
+ rc = !!idev->cnf.ignore_routes_with_linkdown;
+ }
+
+ return rc;
+}
+
+static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
+ int *mpri, struct fib6_info *match,
bool *do_rr)
{
int m;
bool match_do_rr = false;
- struct inet6_dev *idev = rt->rt6i_idev;
- if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
goto out;
- if (idev->cnf.ignore_routes_with_linkdown &&
- rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+ if (fib6_ignore_linkdown(rt) &&
+ rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
!(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
- if (rt6_check_expired(rt))
+ if (fib6_check_expired(rt))
goto out;
m = rt6_score_route(rt, oif, strict);
@@ -709,19 +691,19 @@ out:
return match;
}
-static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
- struct rt6_info *leaf,
- struct rt6_info *rr_head,
+static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
+ struct fib6_info *leaf,
+ struct fib6_info *rr_head,
u32 metric, int oif, int strict,
bool *do_rr)
{
- struct rt6_info *rt, *match, *cont;
+ struct fib6_info *rt, *match, *cont;
int mpri = -1;
match = NULL;
cont = NULL;
- for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
- if (rt->rt6i_metric != metric) {
+ for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
+ if (rt->fib6_metric != metric) {
cont = rt;
break;
}
@@ -730,8 +712,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
}
for (rt = leaf; rt && rt != rr_head;
- rt = rcu_dereference(rt->rt6_next)) {
- if (rt->rt6i_metric != metric) {
+ rt = rcu_dereference(rt->fib6_next)) {
+ if (rt->fib6_metric != metric) {
cont = rt;
break;
}
@@ -742,22 +724,22 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
if (match || !cont)
return match;
- for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
+ for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
match = find_match(rt, oif, strict, &mpri, match, do_rr);
return match;
}
-static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
int oif, int strict)
{
- struct rt6_info *leaf = rcu_dereference(fn->leaf);
- struct rt6_info *match, *rt0;
+ struct fib6_info *leaf = rcu_dereference(fn->leaf);
+ struct fib6_info *match, *rt0;
bool do_rr = false;
int key_plen;
- if (!leaf || leaf == net->ipv6.ip6_null_entry)
- return net->ipv6.ip6_null_entry;
+ if (!leaf || leaf == net->ipv6.fib6_null_entry)
+ return net->ipv6.fib6_null_entry;
rt0 = rcu_dereference(fn->rr_ptr);
if (!rt0)
@@ -768,39 +750,39 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
* (This might happen if all routes under fn are deleted from
* the tree and fib6_repair_tree() is called on the node.)
*/
- key_plen = rt0->rt6i_dst.plen;
+ key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
- if (rt0->rt6i_src.plen)
- key_plen = rt0->rt6i_src.plen;
+ if (rt0->fib6_src.plen)
+ key_plen = rt0->fib6_src.plen;
#endif
if (fn->fn_bit != key_plen)
- return net->ipv6.ip6_null_entry;
+ return net->ipv6.fib6_null_entry;
- match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
+ match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
&do_rr);
if (do_rr) {
- struct rt6_info *next = rcu_dereference(rt0->rt6_next);
+ struct fib6_info *next = rcu_dereference(rt0->fib6_next);
/* no entries matched; do round-robin */
- if (!next || next->rt6i_metric != rt0->rt6i_metric)
+ if (!next || next->fib6_metric != rt0->fib6_metric)
next = leaf;
if (next != rt0) {
- spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+ spin_lock_bh(&leaf->fib6_table->tb6_lock);
/* make sure next is not being deleted from the tree */
- if (next->rt6i_node)
+ if (next->fib6_node)
rcu_assign_pointer(fn->rr_ptr, next);
- spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+ spin_unlock_bh(&leaf->fib6_table->tb6_lock);
}
}
- return match ? match : net->ipv6.ip6_null_entry;
+ return match ? match : net->ipv6.fib6_null_entry;
}
-static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
- return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+ return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -812,7 +794,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
struct in6_addr prefix_buf, *prefix;
unsigned int pref;
unsigned long lifetime;
- struct rt6_info *rt;
+ struct fib6_info *rt;
if (len < sizeof(struct route_info)) {
return -EINVAL;
@@ -850,13 +832,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
}
if (rinfo->prefix_len == 0)
- rt = rt6_get_dflt_router(gwaddr, dev);
+ rt = rt6_get_dflt_router(net, gwaddr, dev);
else
rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
gwaddr, dev);
if (rt && !lifetime) {
- ip6_del_rt(rt);
+ ip6_del_rt(net, rt);
rt = NULL;
}
@@ -864,21 +846,162 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
dev, pref);
else if (rt)
- rt->rt6i_flags = RTF_ROUTEINFO |
- (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ rt->fib6_flags = RTF_ROUTEINFO |
+ (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
if (!addrconf_finite_timeout(lifetime))
- rt6_clean_expires(rt);
+ fib6_clean_expires(rt);
else
- rt6_set_expires(rt, jiffies + HZ * lifetime);
+ fib6_set_expires(rt, jiffies + HZ * lifetime);
- ip6_rt_put(rt);
+ fib6_info_release(rt);
}
return 0;
}
#endif
+/*
+ * Misc support functions
+ */
+
+/* called with rcu_lock held */
+static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
+{
+ struct net_device *dev = rt->fib6_nh.nh_dev;
+
+ if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+ /* for copies of local routes, dst->dev needs to be the
+ * device if it is a master device, the master device if
+ * device is enslaved, and the loopback as the default
+ */
+ if (netif_is_l3_slave(dev) &&
+ !rt6_need_strict(&rt->fib6_dst.addr))
+ dev = l3mdev_master_dev_rcu(dev);
+ else if (!netif_is_l3_master(dev))
+ dev = dev_net(dev)->loopback_dev;
+ /* last case is netif_is_l3_master(dev) is true in which
+ * case we want dev returned to be dev
+ */
+ }
+
+ return dev;
+}
+
+static const int fib6_prop[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = 0,
+ [RTN_UNICAST] = 0,
+ [RTN_LOCAL] = 0,
+ [RTN_BROADCAST] = 0,
+ [RTN_ANYCAST] = 0,
+ [RTN_MULTICAST] = 0,
+ [RTN_BLACKHOLE] = -EINVAL,
+ [RTN_UNREACHABLE] = -EHOSTUNREACH,
+ [RTN_PROHIBIT] = -EACCES,
+ [RTN_THROW] = -EAGAIN,
+ [RTN_NAT] = -EINVAL,
+ [RTN_XRESOLVE] = -EINVAL,
+};
+
+static int ip6_rt_type_to_error(u8 fib6_type)
+{
+ return fib6_prop[fib6_type];
+}
+
+static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
+{
+ unsigned short flags = 0;
+
+ if (rt->dst_nocount)
+ flags |= DST_NOCOUNT;
+ if (rt->dst_nopolicy)
+ flags |= DST_NOPOLICY;
+ if (rt->dst_host)
+ flags |= DST_HOST;
+
+ return flags;
+}
+
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
+{
+ rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
+
+ switch (ort->fib6_type) {
+ case RTN_BLACKHOLE:
+ rt->dst.output = dst_discard_out;
+ rt->dst.input = dst_discard;
+ break;
+ case RTN_PROHIBIT:
+ rt->dst.output = ip6_pkt_prohibit_out;
+ rt->dst.input = ip6_pkt_prohibit;
+ break;
+ case RTN_THROW:
+ case RTN_UNREACHABLE:
+ default:
+ rt->dst.output = ip6_pkt_discard_out;
+ rt->dst.input = ip6_pkt_discard;
+ break;
+ }
+}
+
+static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
+{
+ rt->dst.flags |= fib6_info_dst_flags(ort);
+
+ if (ort->fib6_flags & RTF_REJECT) {
+ ip6_rt_init_dst_reject(rt, ort);
+ return;
+ }
+
+ rt->dst.error = 0;
+ rt->dst.output = ip6_output;
+
+ if (ort->fib6_type == RTN_LOCAL) {
+ rt->dst.input = ip6_input;
+ } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
+ rt->dst.input = ip6_mc_input;
+ } else {
+ rt->dst.input = ip6_forward;
+ }
+
+ if (ort->fib6_nh.nh_lwtstate) {
+ rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+ lwtunnel_set_redirect(&rt->dst);
+ }
+
+ rt->dst.lastuse = jiffies;
+}
+
+static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
+{
+ rt->rt6i_flags &= ~RTF_EXPIRES;
+ fib6_info_hold(from);
+ rcu_assign_pointer(rt->from, from);
+ dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
+ if (from->fib6_metrics != &dst_default_metrics) {
+ rt->dst._metrics |= DST_METRICS_REFCOUNTED;
+ refcount_inc(&from->fib6_metrics->refcnt);
+ }
+}
+
+static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
+{
+ struct net_device *dev = fib6_info_nh_dev(ort);
+
+ ip6_rt_init_dst(rt, ort);
+
+ rt->rt6i_dst = ort->fib6_dst;
+ rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
+ rt->rt6i_gateway = ort->fib6_nh.nh_gw;
+ rt->rt6i_flags = ort->fib6_flags;
+ rt6_set_from(rt, ort);
+#ifdef CONFIG_IPV6_SUBTREES
+ rt->rt6i_src = ort->fib6_src;
+#endif
+ rt->rt6i_prefsrc = ort->fib6_prefsrc;
+ rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+}
+
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
struct in6_addr *saddr)
{
@@ -889,7 +1012,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
pn = rcu_dereference(fn->parent);
sn = FIB6_SUBTREE(pn);
if (sn && sn != fn)
- fn = fib6_lookup(sn, NULL, saddr);
+ fn = fib6_node_lookup(sn, NULL, saddr);
else
fn = pn;
if (fn->fn_flags & RTN_RTINFO)
@@ -914,50 +1037,74 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
return false;
}
+/* called with rcu_lock held */
+static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
+{
+ unsigned short flags = fib6_info_dst_flags(rt);
+ struct net_device *dev = rt->fib6_nh.nh_dev;
+ struct rt6_info *nrt;
+
+ nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
+ if (nrt)
+ ip6_rt_copy_init(nrt, rt);
+
+ return nrt;
+}
+
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
int flags)
{
- struct rt6_info *rt, *rt_cache;
+ struct fib6_info *f6i;
struct fib6_node *fn;
+ struct rt6_info *rt;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
flags &= ~RT6_LOOKUP_F_IFACE;
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- rt = rcu_dereference(fn->leaf);
- if (!rt) {
- rt = net->ipv6.ip6_null_entry;
+ f6i = rcu_dereference(fn->leaf);
+ if (!f6i) {
+ f6i = net->ipv6.fib6_null_entry;
} else {
- rt = rt6_device_match(net, rt, &fl6->saddr,
+ f6i = rt6_device_match(net, f6i, &fl6->saddr,
fl6->flowi6_oif, flags);
- if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
- rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
- skb, flags);
+ if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
+ f6i = fib6_multipath_select(net, f6i, fl6,
+ fl6->flowi6_oif, skb,
+ flags);
}
- if (rt == net->ipv6.ip6_null_entry) {
+ if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
}
- /* Search through exception table */
- rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
- if (rt_cache)
- rt = rt_cache;
- if (ip6_hold_safe(net, &rt, true))
- dst_use_noref(&rt->dst, jiffies);
+ trace_fib6_table_lookup(net, f6i, table, fl6);
- rcu_read_unlock();
+ /* Search through exception table */
+ rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ if (rt) {
+ if (ip6_hold_safe(net, &rt, true))
+ dst_use_noref(&rt->dst, jiffies);
+ } else if (f6i == net->ipv6.fib6_null_entry) {
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ } else {
+ rt = ip6_create_rt_rcu(f6i);
+ if (!rt) {
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ }
+ }
- trace_fib6_table_lookup(net, rt, table, fl6);
+ rcu_read_unlock();
return rt;
-
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
@@ -999,55 +1146,28 @@ EXPORT_SYMBOL(rt6_lookup);
* Caller must hold dst before calling it.
*/
-static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
- struct mx6_config *mxc,
+static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
struct netlink_ext_ack *extack)
{
int err;
struct fib6_table *table;
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
- err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
+ err = fib6_add(&table->tb6_root, rt, info, extack);
spin_unlock_bh(&table->tb6_lock);
return err;
}
-int ip6_ins_rt(struct rt6_info *rt)
+int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
- struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
- struct mx6_config mxc = { .mx = NULL, };
+ struct nl_info info = { .nl_net = net, };
- /* Hold dst to account for the reference from the fib6 tree */
- dst_hold(&rt->dst);
- return __ip6_ins_rt(rt, &info, &mxc, NULL);
+ return __ip6_ins_rt(rt, &info, NULL);
}
-/* called with rcu_lock held */
-static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
-{
- struct net_device *dev = rt->dst.dev;
-
- if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
- /* for copies of local routes, dst->dev needs to be the
- * device if it is a master device, the master device if
- * device is enslaved, and the loopback as the default
- */
- if (netif_is_l3_slave(dev) &&
- !rt6_need_strict(&rt->rt6i_dst.addr))
- dev = l3mdev_master_dev_rcu(dev);
- else if (!netif_is_l3_master(dev))
- dev = dev_net(dev)->loopback_dev;
- /* last case is netif_is_l3_master(dev) is true in which
- * case we want dev returned to be dev
- */
- }
-
- return dev;
-}
-
-static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
@@ -1058,26 +1178,20 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
* Clone the route.
*/
- if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
- ort = ort->from;
-
- rcu_read_lock();
dev = ip6_rt_get_dev_rcu(ort);
- rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
- rcu_read_unlock();
+ rt = ip6_dst_alloc(dev_net(dev), dev, 0);
if (!rt)
return NULL;
ip6_rt_copy_init(rt, ort);
rt->rt6i_flags |= RTF_CACHE;
- rt->rt6i_metric = 0;
rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
if (!rt6_is_gw_or_nonexthop(ort)) {
- if (ort->rt6i_dst.plen != 128 &&
- ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+ if (ort->fib6_dst.plen != 128 &&
+ ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
@@ -1090,45 +1204,44 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
return rt;
}
-static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
+ unsigned short flags = fib6_info_dst_flags(rt);
struct net_device *dev;
struct rt6_info *pcpu_rt;
rcu_read_lock();
dev = ip6_rt_get_dev_rcu(rt);
- pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
+ pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
rcu_read_unlock();
if (!pcpu_rt)
return NULL;
ip6_rt_copy_init(pcpu_rt, rt);
- pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
pcpu_rt->rt6i_flags |= RTF_PCPU;
return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
-static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
struct rt6_info *pcpu_rt, **p;
p = this_cpu_ptr(rt->rt6i_pcpu);
pcpu_rt = *p;
- if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
- rt6_dst_from_metrics_check(pcpu_rt);
+ if (pcpu_rt)
+ ip6_hold_safe(NULL, &pcpu_rt, false);
return pcpu_rt;
}
-static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
+static struct rt6_info *rt6_make_pcpu_route(struct net *net,
+ struct fib6_info *rt)
{
struct rt6_info *pcpu_rt, *prev, **p;
pcpu_rt = ip6_rt_pcpu_alloc(rt);
if (!pcpu_rt) {
- struct net *net = dev_net(rt->dst.dev);
-
dst_hold(&net->ipv6.ip6_null_entry->dst);
return net->ipv6.ip6_null_entry;
}
@@ -1138,7 +1251,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
- rt6_dst_from_metrics_check(pcpu_rt);
return pcpu_rt;
}
@@ -1158,9 +1270,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
return;
net = dev_net(rt6_ex->rt6i->dst.dev);
- rt6_ex->rt6i->rt6i_node = NULL;
hlist_del_rcu(&rt6_ex->hlist);
- rt6_release(rt6_ex->rt6i);
+ dst_release(&rt6_ex->rt6i->dst);
kfree_rcu(rt6_ex, rcu);
WARN_ON_ONCE(!bucket->depth);
bucket->depth--;
@@ -1268,20 +1379,36 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
return NULL;
}
+static unsigned int fib6_mtu(const struct fib6_info *rt)
+{
+ unsigned int mtu;
+
+ if (rt->fib6_pmtu) {
+ mtu = rt->fib6_pmtu;
+ } else {
+ struct net_device *dev = fib6_info_nh_dev(rt);
+ struct inet6_dev *idev;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ mtu = idev->cnf.mtu6;
+ rcu_read_unlock();
+ }
+
+ mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
+}
+
static int rt6_insert_exception(struct rt6_info *nrt,
- struct rt6_info *ort)
+ struct fib6_info *ort)
{
- struct net *net = dev_net(ort->dst.dev);
+ struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
int err = 0;
- /* ort can't be a cache or pcpu route */
- if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
- ort = ort->from;
- WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
-
spin_lock_bh(&rt6_exception_lock);
if (ort->exception_bucket_flushed) {
@@ -1308,19 +1435,19 @@ static int rt6_insert_exception(struct rt6_info *nrt,
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (ort->rt6i_src.plen)
+ if (ort->fib6_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
/* Update rt6i_prefsrc as it could be changed
* in rt6_remove_prefsrc()
*/
- nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+ nrt->rt6i_prefsrc = ort->fib6_prefsrc;
/* rt6_mtu_change() might lower mtu on ort.
* Only insert this exception route if its mtu
* is less than ort's mtu value.
*/
- if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
err = -EINVAL;
goto out;
}
@@ -1337,8 +1464,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
}
rt6_ex->rt6i = nrt;
rt6_ex->stamp = jiffies;
- atomic_inc(&nrt->rt6i_ref);
- nrt->rt6i_node = ort->rt6i_node;
hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
bucket->depth++;
net->ipv6.rt6_stats->fib_rt_cache++;
@@ -1351,16 +1476,16 @@ out:
/* Update fn->fn_sernum to invalidate all cached dst */
if (!err) {
- spin_lock_bh(&ort->rt6i_table->tb6_lock);
- fib6_update_sernum(ort);
- spin_unlock_bh(&ort->rt6i_table->tb6_lock);
+ spin_lock_bh(&ort->fib6_table->tb6_lock);
+ fib6_update_sernum(net, ort);
+ spin_unlock_bh(&ort->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
return err;
}
-void rt6_flush_exceptions(struct rt6_info *rt)
+void rt6_flush_exceptions(struct fib6_info *rt)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1390,7 +1515,7 @@ out:
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
-static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
struct in6_addr *daddr,
struct in6_addr *saddr)
{
@@ -1408,7 +1533,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (rt->rt6i_src.plen)
+ if (rt->fib6_src.plen)
src_key = saddr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
@@ -1420,14 +1545,15 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
}
/* Remove the passed in cached rt from the hash table that contains it */
-int rt6_remove_exception_rt(struct rt6_info *rt)
+static int rt6_remove_exception_rt(struct rt6_info *rt)
{
struct rt6_exception_bucket *bucket;
- struct rt6_info *from = rt->from;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
+ struct fib6_info *from;
int err;
+ from = rcu_dereference(rt->from);
if (!from ||
!(rt->rt6i_flags & RTF_CACHE))
return -EINVAL;
@@ -1445,7 +1571,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->rt6i_src.plen)
+ if (from->fib6_src.plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1468,7 +1594,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
struct rt6_exception_bucket *bucket;
- struct rt6_info *from = rt->from;
+ struct fib6_info *from = rt->from;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
@@ -1486,7 +1612,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->rt6i_src.plen)
+ if (from->fib6_src.plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_rcu(&bucket,
@@ -1498,7 +1624,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
rcu_read_unlock();
}
-static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1540,7 +1666,7 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
- struct rt6_info *rt, int mtu)
+ struct fib6_info *rt, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1557,12 +1683,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
struct rt6_info *entry = rt6_ex->rt6i;
/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
- * route), the metrics of its rt->dst.from have already
+ * route), the metrics of its rt->from have already
* been updated.
*/
- if (entry->rt6i_pmtu &&
+ if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
rt6_mtu_change_route_allowed(idev, entry, mtu))
- entry->rt6i_pmtu = mtu;
+ dst_metric_set(&entry->dst, RTAX_MTU, mtu);
}
bucket++;
}
@@ -1570,7 +1696,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
-static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
@@ -1649,7 +1775,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
-void rt6_age_exceptions(struct rt6_info *rt,
+void rt6_age_exceptions(struct fib6_info *rt,
struct fib6_gc_args *gc_args,
unsigned long now)
{
@@ -1680,32 +1806,22 @@ void rt6_age_exceptions(struct rt6_info *rt,
rcu_read_unlock_bh();
}
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6,
- const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int strict)
{
struct fib6_node *fn, *saved_fn;
- struct rt6_info *rt, *rt_cache;
- int strict = 0;
+ struct fib6_info *f6i;
- strict |= flags & RT6_LOOKUP_F_IFACE;
- strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
- if (net->ipv6.devconf_all->forwarding == 0)
- strict |= RT6_LOOKUP_F_REACHABLE;
-
- rcu_read_lock();
-
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
oif = 0;
redo_rt6_select:
- rt = rt6_select(net, fn, oif, strict);
- if (rt->rt6i_nsiblings)
- rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
- if (rt == net->ipv6.ip6_null_entry) {
+ f6i = rt6_select(net, fn, oif, strict);
+ if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto redo_rt6_select;
@@ -1717,45 +1833,57 @@ redo_rt6_select:
}
}
- /*Search through exception table */
- rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
- if (rt_cache)
- rt = rt_cache;
+ trace_fib6_table_lookup(net, f6i, table, fl6);
- if (rt == net->ipv6.ip6_null_entry) {
+ return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ const struct sk_buff *skb, int flags)
+{
+ struct fib6_info *f6i;
+ struct rt6_info *rt;
+ int strict = 0;
+
+ strict |= flags & RT6_LOOKUP_F_IFACE;
+ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+ if (net->ipv6.devconf_all->forwarding == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
+
+ rcu_read_lock();
+
+ f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+ if (f6i->fib6_nsiblings)
+ f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
+ if (f6i == net->ipv6.fib6_null_entry) {
+ rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
dst_hold(&rt->dst);
- trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
- } else if (rt->rt6i_flags & RTF_CACHE) {
- if (ip6_hold_safe(net, &rt, true)) {
+ }
+
+ /*Search through exception table */
+ rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ if (rt) {
+ if (ip6_hold_safe(net, &rt, true))
dst_use_noref(&rt->dst, jiffies);
- rt6_dst_from_metrics_check(rt);
- }
+
rcu_read_unlock();
- trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
- !(rt->rt6i_flags & RTF_GATEWAY))) {
+ !(f6i->fib6_flags & RTF_GATEWAY))) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
-
struct rt6_info *uncached_rt;
- if (ip6_hold_safe(net, &rt, true)) {
- dst_use_noref(&rt->dst, jiffies);
- } else {
- rcu_read_unlock();
- uncached_rt = rt;
- goto uncached_rt_out;
- }
- rcu_read_unlock();
+ uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
- uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
- dst_release(&rt->dst);
+ rcu_read_unlock();
if (uncached_rt) {
/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
@@ -1768,36 +1896,21 @@ redo_rt6_select:
dst_hold(&uncached_rt->dst);
}
-uncached_rt_out:
- trace_fib6_table_lookup(net, uncached_rt, table, fl6);
return uncached_rt;
-
} else {
/* Get a percpu copy */
struct rt6_info *pcpu_rt;
- dst_use_noref(&rt->dst, jiffies);
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(rt);
-
- if (!pcpu_rt) {
- /* atomic_inc_not_zero() is needed when using rcu */
- if (atomic_inc_not_zero(&rt->rt6i_ref)) {
- /* No dst_hold() on rt is needed because grabbing
- * rt->rt6i_ref makes sure rt can't be released.
- */
- pcpu_rt = rt6_make_pcpu_route(rt);
- rt6_release(rt);
- } else {
- /* rt is already removed from tree */
- pcpu_rt = net->ipv6.ip6_null_entry;
- dst_hold(&pcpu_rt->dst);
- }
- }
+ pcpu_rt = rt6_get_pcpu_route(f6i);
+
+ if (!pcpu_rt)
+ pcpu_rt = rt6_make_pcpu_route(net, f6i);
+
local_bh_enable();
rcu_read_unlock();
- trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
return pcpu_rt;
}
}
@@ -1868,7 +1981,7 @@ out:
} else {
keys->addrs.v6addrs.src = key_iph->saddr;
keys->addrs.v6addrs.dst = key_iph->daddr;
- keys->tags.flow_label = ip6_flowinfo(key_iph);
+ keys->tags.flow_label = ip6_flowlabel(key_iph);
keys->basic.ip_proto = key_iph->nexthdr;
}
}
@@ -1889,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
} else {
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
- hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
break;
@@ -2020,7 +2133,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
rt->rt6i_idev = in6_dev_get(loopback_dev);
rt->rt6i_gateway = ort->rt6i_gateway;
rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
- rt->rt6i_metric = 0;
memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
@@ -2036,18 +2148,27 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
* Destination cache support functions
*/
-static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
- if (rt->from &&
- dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
- dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
+ u32 rt_cookie = 0;
+
+ if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
+ return false;
+
+ if (fib6_check_expired(f6i))
+ return false;
+
+ return true;
}
-static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
{
u32 rt_cookie = 0;
- if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
+ if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
+ rt_cookie != cookie)
return NULL;
if (rt6_check_expired(rt))
@@ -2056,11 +2177,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
return &rt->dst;
}
-static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
+ struct fib6_info *from,
+ u32 cookie)
{
if (!__rt6_check_expired(rt) &&
rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
- rt6_check(rt->from, cookie))
+ fib6_check(from, cookie))
return &rt->dst;
else
return NULL;
@@ -2068,22 +2191,30 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
+ struct dst_entry *dst_ret;
+ struct fib6_info *from;
struct rt6_info *rt;
- rt = (struct rt6_info *) dst;
+ rt = container_of(dst, struct rt6_info, dst);
+
+ rcu_read_lock();
/* All IPV6 dsts are created with ->obsolete set to the value
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
* into this function always.
*/
- rt6_dst_from_metrics_check(rt);
+ from = rcu_dereference(rt->from);
- if (rt->rt6i_flags & RTF_PCPU ||
- (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
- return rt6_dst_from_check(rt, cookie);
+ if (from && (rt->rt6i_flags & RTF_PCPU ||
+ unlikely(!list_empty(&rt->rt6i_uncached))))
+ dst_ret = rt6_dst_from_check(rt, from, cookie);
else
- return rt6_check(rt, cookie);
+ dst_ret = rt6_check(rt, from, cookie);
+
+ rcu_read_unlock();
+
+ return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -2092,10 +2223,12 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
if (rt) {
if (rt->rt6i_flags & RTF_CACHE) {
+ rcu_read_lock();
if (rt6_check_expired(rt)) {
- ip6_del_rt(rt);
+ rt6_remove_exception_rt(rt);
dst = NULL;
}
+ rcu_read_unlock();
} else {
dst_release(dst);
dst = NULL;
@@ -2112,35 +2245,60 @@ static void ip6_link_failure(struct sk_buff *skb)
rt = (struct rt6_info *) skb_dst(skb);
if (rt) {
+ rcu_read_lock();
if (rt->rt6i_flags & RTF_CACHE) {
if (dst_hold_safe(&rt->dst))
- ip6_del_rt(rt);
+ rt6_remove_exception_rt(rt);
} else {
+ struct fib6_info *from;
struct fib6_node *fn;
- rcu_read_lock();
- fn = rcu_dereference(rt->rt6i_node);
- if (fn && (rt->rt6i_flags & RTF_DEFAULT))
- fn->fn_sernum = -1;
- rcu_read_unlock();
+ from = rcu_dereference(rt->from);
+ if (from) {
+ fn = rcu_dereference(from->fib6_node);
+ if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+ fn->fn_sernum = -1;
+ }
}
+ rcu_read_unlock();
}
}
+static void rt6_update_expires(struct rt6_info *rt0, int timeout)
+{
+ if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
+ struct fib6_info *from;
+
+ rcu_read_lock();
+ from = rcu_dereference(rt0->from);
+ if (from)
+ rt0->dst.expires = from->expires;
+ rcu_read_unlock();
+ }
+
+ dst_set_expires(&rt0->dst, timeout);
+ rt0->rt6i_flags |= RTF_EXPIRES;
+}
+
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
struct net *net = dev_net(rt->dst.dev);
+ dst_metric_set(&rt->dst, RTAX_MTU, mtu);
rt->rt6i_flags |= RTF_MODIFIED;
- rt->rt6i_pmtu = mtu;
rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
+ bool from_set;
+
+ rcu_read_lock();
+ from_set = !!rcu_dereference(rt->from);
+ rcu_read_unlock();
+
return !(rt->rt6i_flags & RTF_CACHE) &&
- (rt->rt6i_flags & RTF_PCPU ||
- rcu_access_pointer(rt->rt6i_node));
+ (rt->rt6i_flags & RTF_PCPU || from_set);
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -2176,14 +2334,21 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6->rt6i_flags & RTF_CACHE)
rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
+ struct fib6_info *from;
struct rt6_info *nrt6;
- nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+ rcu_read_lock();
+ from = rcu_dereference(rt6->from);
+ /* rt6->from may be NULL by the time it is read here; without a
+ * parent fib6_info there is nothing to clone, so skip the exception.
+ */
+ nrt6 = from ? ip6_rt_cache_alloc(from, daddr, saddr) : NULL;
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
- if (rt6_insert_exception(nrt6, rt6))
+ if (rt6_insert_exception(nrt6, from))
dst_release_immediate(&nrt6->dst);
}
+ rcu_read_unlock();
}
}
@@ -2264,7 +2426,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
- struct rt6_info *rt, *rt_cache;
+ struct rt6_info *ret = NULL, *rt_cache;
+ struct fib6_info *rt;
struct fib6_node *fn;
/* Get the "current" route for this destination and
@@ -2278,32 +2441,32 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
*/
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for_each_fib6_node_rt_rcu(fn) {
- if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
continue;
- if (rt6_check_expired(rt))
+ if (fib6_check_expired(rt))
continue;
- if (rt->dst.error)
+ if (rt->fib6_flags & RTF_REJECT)
break;
- if (!(rt->rt6i_flags & RTF_GATEWAY))
+ if (!(rt->fib6_flags & RTF_GATEWAY))
continue;
- if (fl6->flowi6_oif != rt->dst.dev->ifindex)
+ if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
continue;
/* rt_cache's gateway might be different from its 'parent'
* in the case of an ip redirect.
* So we keep searching in the exception table if the gateway
* is different.
*/
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+ if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
rt_cache = rt6_find_cached_rt(rt,
&fl6->daddr,
&fl6->saddr);
if (rt_cache &&
ipv6_addr_equal(&rdfl->gateway,
&rt_cache->rt6i_gateway)) {
- rt = rt_cache;
+ ret = rt_cache;
break;
}
continue;
@@ -2312,25 +2475,32 @@ restart:
}
if (!rt)
- rt = net->ipv6.ip6_null_entry;
- else if (rt->dst.error) {
- rt = net->ipv6.ip6_null_entry;
+ rt = net->ipv6.fib6_null_entry;
+ else if (rt->fib6_flags & RTF_REJECT) {
+ ret = net->ipv6.ip6_null_entry;
goto out;
}
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
}
out:
- ip6_hold_safe(net, &rt, true);
+ /* ret may be an exception-table entry that was only looked up
+ * under RCU; take the reference with ip6_hold_safe(), as the code
+ * did before this change, rather than with a plain dst_hold().
+ */
+ if (ret)
+ ip6_hold_safe(net, &ret, true);
+ else
+ ret = ip6_create_rt_rcu(rt);
rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table, fl6);
- return rt;
+ return ret;
};
static struct dst_entry *ip6_route_redirect(struct net *net,
@@ -2422,12 +2588,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
- const struct rt6_info *rt = (const struct rt6_info *)dst;
- unsigned int mtu = rt->rt6i_pmtu;
struct inet6_dev *idev;
-
- if (mtu)
- goto out;
+ unsigned int mtu;
mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu)
@@ -2447,6 +2609,54 @@ out:
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ *
+ * based on ip6_dst_mtu_forward and exception logic of
+ * rt6_find_cached_rt; called with rcu_read_lock
+ */
+u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+ struct in6_addr *saddr)
+{
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ struct in6_addr *src_key;
+ struct inet6_dev *idev;
+ u32 mtu = 0;
+
+ if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
+ mtu = f6i->fib6_pmtu;
+ if (mtu)
+ goto out;
+ }
+
+ src_key = NULL;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (f6i->fib6_src.plen)
+ src_key = saddr;
+#endif
+
+ bucket = rcu_dereference(f6i->rt6i_exception_bucket);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+ if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+ mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
+
+ if (likely(!mtu)) {
+ struct net_device *dev = fib6_info_nh_dev(f6i);
+
+ mtu = IPV6_MIN_MTU;
+ idev = __in6_dev_get(dev);
+ if (idev && idev->cnf.mtu6 > mtu)
+ mtu = idev->cnf.mtu6;
+ }
+
+ mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+out:
+ return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
+}
+
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
struct flowi6 *fl6)
{
@@ -2511,60 +2721,22 @@ out:
return entries > rt_max_size;
}
-static int ip6_convert_metrics(struct mx6_config *mxc,
- const struct fib6_config *cfg)
+static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
+ struct fib6_config *cfg)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
- bool ecn_ca = false;
- struct nlattr *nla;
- int remaining;
- u32 *mp;
+ struct dst_metrics *p;
if (!cfg->fc_mx)
return 0;
- mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
- if (unlikely(!mp))
+ p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
+ if (unlikely(!p))
return -ENOMEM;
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
- u32 val;
-
- if (!type)
- continue;
- if (unlikely(type > RTAX_MAX))
- goto err;
-
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
- goto err;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_HOPLIMIT && val > 255)
- val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
- goto err;
-
- mp[type - 1] = val;
- __set_bit(type - 1, mxc->mx_valid);
- }
-
- if (ecn_ca) {
- __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
- mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
- }
+ refcount_set(&p->refcnt, 1);
+ rt->fib6_metrics = p;
- mxc->mx = mp;
- return 0;
- err:
- kfree(mp);
- return -EINVAL;
+ return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
@@ -2750,11 +2922,12 @@ out:
return err;
}
-static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
+static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
+ gfp_t gfp_flags,
struct netlink_ext_ack *extack)
{
struct net *net = cfg->fc_nlinfo.nl_net;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
struct fib6_table *table;
@@ -2773,6 +2946,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
+ if (cfg->fc_type > RTN_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ goto out;
+ }
+
if (cfg->fc_dst_len > 128) {
NL_SET_ERR_MSG(extack, "Invalid prefix length");
goto out;
@@ -2831,35 +3009,30 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!table)
goto out;
- rt = ip6_dst_alloc(net, NULL,
- (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
+ err = -ENOMEM;
+ rt = fib6_info_alloc(gfp_flags);
+ if (!rt)
+ goto out;
+
+ if (cfg->fc_flags & RTF_ADDRCONF)
+ rt->dst_nocount = true;
- if (!rt) {
- err = -ENOMEM;
+ err = ip6_convert_metrics(net, rt, cfg);
+ if (err < 0)
goto out;
- }
if (cfg->fc_flags & RTF_EXPIRES)
- rt6_set_expires(rt, jiffies +
+ fib6_set_expires(rt, jiffies +
clock_t_to_jiffies(cfg->fc_expires));
else
- rt6_clean_expires(rt);
+ fib6_clean_expires(rt);
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
- rt->rt6i_protocol = cfg->fc_protocol;
+ rt->fib6_protocol = cfg->fc_protocol;
addr_type = ipv6_addr_type(&cfg->fc_dst);
- if (addr_type & IPV6_ADDR_MULTICAST)
- rt->dst.input = ip6_mc_input;
- else if (cfg->fc_flags & RTF_LOCAL)
- rt->dst.input = ip6_input;
- else
- rt->dst.input = ip6_forward;
-
- rt->dst.output = ip6_output;
-
if (cfg->fc_encap) {
struct lwtunnel_state *lwtstate;
@@ -2868,22 +3041,23 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
&lwtstate, extack);
if (err)
goto out;
- rt->dst.lwtstate = lwtstate_get(lwtstate);
- lwtunnel_set_redirect(&rt->dst);
+ rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
}
- ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
- rt->rt6i_dst.plen = cfg->fc_dst_len;
- if (rt->rt6i_dst.plen == 128)
- rt->dst.flags |= DST_HOST;
+ ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+ rt->fib6_dst.plen = cfg->fc_dst_len;
+ if (rt->fib6_dst.plen == 128)
+ rt->dst_host = true;
#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
- rt->rt6i_src.plen = cfg->fc_src_len;
+ ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
+ rt->fib6_src.plen = cfg->fc_src_len;
#endif
- rt->rt6i_metric = cfg->fc_metric;
- rt->rt6i_nh_weight = 1;
+ rt->fib6_metric = cfg->fc_metric;
+ rt->fib6_nh.nh_weight = 1;
+
+ rt->fib6_type = cfg->fc_type;
/* We cannot add true routes via loopback here,
they would result in kernel looping; promote them to reject routes
@@ -2906,28 +3080,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
}
}
- rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
- switch (cfg->fc_type) {
- case RTN_BLACKHOLE:
- rt->dst.error = -EINVAL;
- rt->dst.output = dst_discard_out;
- rt->dst.input = dst_discard;
- break;
- case RTN_PROHIBIT:
- rt->dst.error = -EACCES;
- rt->dst.output = ip6_pkt_prohibit_out;
- rt->dst.input = ip6_pkt_prohibit;
- break;
- case RTN_THROW:
- case RTN_UNREACHABLE:
- default:
- rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
- : (cfg->fc_type == RTN_UNREACHABLE)
- ? -EHOSTUNREACH : -ENETUNREACH;
- rt->dst.output = ip6_pkt_discard_out;
- rt->dst.input = ip6_pkt_discard;
- break;
- }
+ rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
goto install_route;
}
@@ -2936,7 +3089,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
if (err)
goto out;
- rt->rt6i_gateway = cfg->fc_gateway;
+ rt->fib6_nh.nh_gw = cfg->fc_gateway;
}
err = -ENODEV;
@@ -2961,96 +3114,82 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
err = -EINVAL;
goto out;
}
- rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
- rt->rt6i_prefsrc.plen = 128;
+ rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
+ rt->fib6_prefsrc.plen = 128;
} else
- rt->rt6i_prefsrc.plen = 0;
+ rt->fib6_prefsrc.plen = 0;
- rt->rt6i_flags = cfg->fc_flags;
+ rt->fib6_flags = cfg->fc_flags;
install_route:
- if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+ if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
!netif_carrier_ok(dev))
- rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
- rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
- rt->dst.dev = dev;
- rt->rt6i_idev = idev;
- rt->rt6i_table = table;
+ rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
+ rt->fib6_nh.nh_dev = dev;
+ rt->fib6_table = table;
cfg->fc_nlinfo.nl_net = dev_net(dev);
+ if (idev)
+ in6_dev_put(idev);
+
return rt;
out:
if (dev)
dev_put(dev);
if (idev)
in6_dev_put(idev);
- if (rt)
- dst_release_immediate(&rt->dst);
+ fib6_info_release(rt);
return ERR_PTR(err);
}
-int ip6_route_add(struct fib6_config *cfg,
+int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack)
{
- struct mx6_config mxc = { .mx = NULL, };
- struct rt6_info *rt;
+ struct fib6_info *rt;
int err;
- rt = ip6_route_info_create(cfg, extack);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
- goto out;
- }
-
- err = ip6_convert_metrics(&mxc, cfg);
- if (err)
- goto out;
-
- err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
-
- kfree(mxc.mx);
+ rt = ip6_route_info_create(cfg, gfp_flags, extack);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
- return err;
-out:
- if (rt)
- dst_release_immediate(&rt->dst);
+ err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
+ fib6_info_release(rt);
return err;
}
-static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
+static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
- int err;
+ struct net *net = info->nl_net;
struct fib6_table *table;
- struct net *net = dev_net(rt->dst.dev);
+ int err;
- if (rt == net->ipv6.ip6_null_entry) {
+ if (rt == net->ipv6.fib6_null_entry) {
err = -ENOENT;
goto out;
}
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
err = fib6_del(rt, info);
spin_unlock_bh(&table->tb6_lock);
out:
- ip6_rt_put(rt);
+ fib6_info_release(rt);
return err;
}
-int ip6_del_rt(struct rt6_info *rt)
+int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
- struct nl_info info = {
- .nl_net = dev_net(rt->dst.dev),
- };
+ struct nl_info info = { .nl_net = net };
+
return __ip6_del_rt(rt, &info);
}
-static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
struct nl_info *info = &cfg->fc_nlinfo;
struct net *net = info->nl_net;
@@ -3058,20 +3197,20 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
struct fib6_table *table;
int err = -ENOENT;
- if (rt == net->ipv6.ip6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry)
goto out_put;
- table = rt->rt6i_table;
+ table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
- if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
- struct rt6_info *sibling, *next_sibling;
+ if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
+ struct fib6_info *sibling, *next_sibling;
/* prefer to send a single notification with all hops */
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
if (skb) {
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- if (rt6_fill_node(net, skb, rt,
+ if (rt6_fill_node(net, skb, rt, NULL,
NULL, NULL, 0, RTM_DELROUTE,
info->portid, seq, 0) < 0) {
kfree_skb(skb);
@@ -3081,8 +3220,8 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
}
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings,
- rt6i_siblings) {
+ &rt->fib6_siblings,
+ fib6_siblings) {
err = fib6_del(sibling, info);
if (err)
goto out_unlock;
@@ -3093,7 +3232,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
out_unlock:
spin_unlock_bh(&table->tb6_lock);
out_put:
- ip6_rt_put(rt);
+ fib6_info_release(rt);
if (skb) {
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
@@ -3102,11 +3241,28 @@ out_put:
return err;
}
+static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+{
+ int rc = -ESRCH;
+
+ if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
+ goto out;
+
+ if (cfg->fc_flags & RTF_GATEWAY &&
+ !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+ goto out;
+ if (dst_hold_safe(&rt->dst))
+ rc = rt6_remove_exception_rt(rt);
+out:
+ return rc;
+}
+
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt, *rt_cache;
+ struct rt6_info *rt_cache;
struct fib6_table *table;
+ struct fib6_info *rt;
struct fib6_node *fn;
int err = -ESRCH;
@@ -3126,25 +3282,31 @@ static int ip6_route_del(struct fib6_config *cfg,
if (fn) {
for_each_fib6_node_rt_rcu(fn) {
if (cfg->fc_flags & RTF_CACHE) {
+ int rc;
+
rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
&cfg->fc_src);
- if (!rt_cache)
- continue;
- rt = rt_cache;
+ if (rt_cache) {
+ rc = ip6_del_cached_rt(rt_cache, cfg);
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
+ }
+ }
+ continue;
}
if (cfg->fc_ifindex &&
- (!rt->dst.dev ||
- rt->dst.dev->ifindex != cfg->fc_ifindex))
+ (!rt->fib6_nh.nh_dev ||
+ rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
continue;
if (cfg->fc_flags & RTF_GATEWAY &&
- !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+ !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+ if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
continue;
- if (!dst_hold_safe(&rt->dst))
- break;
+ fib6_info_hold(rt);
rcu_read_unlock();
/* if gateway was specified only delete the one hop */
@@ -3166,6 +3328,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
struct ndisc_options ndopts;
struct inet6_dev *in6_dev;
struct neighbour *neigh;
+ struct fib6_info *from;
struct rd_msg *msg;
int optlen, on_link;
u8 *lladdr;
@@ -3247,7 +3410,19 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NEIGH_UPDATE_F_ISROUTER)),
NDISC_REDIRECT, &ndopts);
- nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+ if (from)
+ fib6_info_hold(from);
+ rcu_read_unlock();
+
+ /* rt->from may be NULL; with no parent fib6_info there is nothing
+ * to clone the redirect exception from, so only drop the neighbour.
+ */
+ if (!from)
+ goto out;
+
+ nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -3255,14 +3423,13 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (on_link)
nrt->rt6i_flags &= ~RTF_GATEWAY;
- nrt->rt6i_protocol = RTPROT_REDIRECT;
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
/* No need to remove rt from the exception table if rt is
* a cached route because rt6_insert_exception() will
* takes care of it
*/
- if (rt6_insert_exception(nrt, rt)) {
+ if (rt6_insert_exception(nrt, from)) {
dst_release_immediate(&nrt->dst);
goto out;
}
@@ -3274,47 +3441,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
out:
+ fib6_info_release(from);
neigh_release(neigh);
}
-/*
- * Misc support functions
- */
-
-static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
-{
- BUG_ON(from->from);
-
- rt->rt6i_flags &= ~RTF_EXPIRES;
- dst_hold(&from->dst);
- rt->from = from;
- dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
-}
-
-static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
-{
- rt->dst.input = ort->dst.input;
- rt->dst.output = ort->dst.output;
- rt->rt6i_dst = ort->rt6i_dst;
- rt->dst.error = ort->dst.error;
- rt->rt6i_idev = ort->rt6i_idev;
- if (rt->rt6i_idev)
- in6_dev_hold(rt->rt6i_idev);
- rt->dst.lastuse = jiffies;
- rt->rt6i_gateway = ort->rt6i_gateway;
- rt->rt6i_flags = ort->rt6i_flags;
- rt6_set_from(rt, ort);
- rt->rt6i_metric = ort->rt6i_metric;
-#ifdef CONFIG_IPV6_SUBTREES
- rt->rt6i_src = ort->rt6i_src;
-#endif
- rt->rt6i_prefsrc = ort->rt6i_prefsrc;
- rt->rt6i_table = ort->rt6i_table;
- rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
-}
-
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_get_route_info(struct net *net,
+static struct fib6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev)
@@ -3322,7 +3454,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
int ifindex = dev->ifindex;
struct fib6_node *fn;
- struct rt6_info *rt = NULL;
+ struct fib6_info *rt = NULL;
struct fib6_table *table;
table = fib6_get_table(net, tb_id);
@@ -3335,13 +3467,13 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->dst.dev->ifindex != ifindex)
+ if (rt->fib6_nh.nh_dev->ifindex != ifindex)
continue;
- if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+ if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
- if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
continue;
- ip6_hold_safe(NULL, &rt, false);
+ fib6_info_hold(rt);
break;
}
out:
@@ -3349,7 +3481,7 @@ out:
return rt;
}
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct fib6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev,
@@ -3362,6 +3494,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
RTF_UP | RTF_PREF(pref),
.fc_protocol = RTPROT_RA,
+ .fc_type = RTN_UNICAST,
.fc_nlinfo.portid = 0,
.fc_nlinfo.nlh = NULL,
.fc_nlinfo.nl_net = net,
@@ -3375,36 +3508,39 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
if (!prefixlen)
cfg.fc_flags |= RTF_DEFAULT;
- ip6_route_add(&cfg, NULL);
+ ip6_route_add(&cfg, GFP_ATOMIC, NULL);
return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif
-struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
+struct fib6_info *rt6_get_dflt_router(struct net *net,
+ const struct in6_addr *addr,
+ struct net_device *dev)
{
u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
- struct rt6_info *rt;
+ struct fib6_info *rt;
struct fib6_table *table;
- table = fib6_get_table(dev_net(dev), tb_id);
+ table = fib6_get_table(net, tb_id);
if (!table)
return NULL;
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- if (dev == rt->dst.dev &&
- ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
- ipv6_addr_equal(&rt->rt6i_gateway, addr))
+ if (dev == rt->fib6_nh.nh_dev &&
+ ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+ ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
break;
}
if (rt)
- ip6_hold_safe(NULL, &rt, false);
+ fib6_info_hold(rt);
rcu_read_unlock();
return rt;
}
-struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
+struct fib6_info *rt6_add_dflt_router(struct net *net,
+ const struct in6_addr *gwaddr,
struct net_device *dev,
unsigned int pref)
{
@@ -3415,14 +3551,15 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
.fc_protocol = RTPROT_RA,
+ .fc_type = RTN_UNICAST,
.fc_nlinfo.portid = 0,
.fc_nlinfo.nlh = NULL,
- .fc_nlinfo.nl_net = dev_net(dev),
+ .fc_nlinfo.nl_net = net,
};
cfg.fc_gateway = *gwaddr;
- if (!ip6_route_add(&cfg, NULL)) {
+ if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
struct fib6_table *table;
table = fib6_get_table(dev_net(dev), cfg.fc_table);
@@ -3430,24 +3567,25 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
}
- return rt6_get_dflt_router(gwaddr, dev);
+ return rt6_get_dflt_router(net, gwaddr, dev);
}
-static void __rt6_purge_dflt_routers(struct fib6_table *table)
+static void __rt6_purge_dflt_routers(struct net *net,
+ struct fib6_table *table)
{
- struct rt6_info *rt;
+ struct fib6_info *rt;
restart:
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
- if (dst_hold_safe(&rt->dst)) {
- rcu_read_unlock();
- ip6_del_rt(rt);
- } else {
- rcu_read_unlock();
- }
+ struct net_device *dev = fib6_info_nh_dev(rt);
+ struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
+
+ if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!idev || idev->cnf.accept_ra != 2)) {
+ fib6_info_hold(rt);
+ rcu_read_unlock();
+ ip6_del_rt(net, rt);
goto restart;
}
}
@@ -3468,7 +3606,7 @@ void rt6_purge_dflt_routers(struct net *net)
head = &net->ipv6.fib_table_hash[h];
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
- __rt6_purge_dflt_routers(table);
+ __rt6_purge_dflt_routers(net, table);
}
}
@@ -3489,6 +3627,7 @@ static void rtmsg_to_fib6_config(struct net *net,
cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
cfg->fc_src_len = rtmsg->rtmsg_src_len;
cfg->fc_flags = rtmsg->rtmsg_flags;
+ cfg->fc_type = rtmsg->rtmsg_type;
cfg->fc_nlinfo.nl_net = net;
@@ -3518,7 +3657,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
rtnl_lock();
switch (cmd) {
case SIOCADDRT:
- err = ip6_route_add(&cfg, NULL);
+ err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
break;
case SIOCDELRT:
err = ip6_route_del(&cfg, NULL);
@@ -3546,7 +3685,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
case IPSTATS_MIB_INNOROUTES:
type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
if (type == IPV6_ADDR_ANY) {
- IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ IP6_INC_STATS(dev_net(dst->dev),
+ __in6_dev_get_safely(skb->dev),
IPSTATS_MIB_INADDRERRORS);
break;
}
@@ -3587,40 +3727,40 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
* Allocate a dst for local (unicast / anycast) address.
*/
-struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
- const struct in6_addr *addr,
- bool anycast)
+struct fib6_info *addrconf_f6i_alloc(struct net *net,
+ struct inet6_dev *idev,
+ const struct in6_addr *addr,
+ bool anycast, gfp_t gfp_flags)
{
u32 tb_id;
- struct net *net = dev_net(idev->dev);
struct net_device *dev = idev->dev;
- struct rt6_info *rt;
+ struct fib6_info *f6i;
- rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
- if (!rt)
+ f6i = fib6_info_alloc(gfp_flags);
+ if (!f6i)
return ERR_PTR(-ENOMEM);
- in6_dev_hold(idev);
-
- rt->dst.flags |= DST_HOST;
- rt->dst.input = ip6_input;
- rt->dst.output = ip6_output;
- rt->rt6i_idev = idev;
-
- rt->rt6i_protocol = RTPROT_KERNEL;
- rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
- if (anycast)
- rt->rt6i_flags |= RTF_ANYCAST;
- else
- rt->rt6i_flags |= RTF_LOCAL;
+ f6i->dst_nocount = true;
+ f6i->dst_host = true;
+ f6i->fib6_protocol = RTPROT_KERNEL;
+ f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
+ if (anycast) {
+ f6i->fib6_type = RTN_ANYCAST;
+ f6i->fib6_flags |= RTF_ANYCAST;
+ } else {
+ f6i->fib6_type = RTN_LOCAL;
+ f6i->fib6_flags |= RTF_LOCAL;
+ }
- rt->rt6i_gateway = *addr;
- rt->rt6i_dst.addr = *addr;
- rt->rt6i_dst.plen = 128;
+ f6i->fib6_nh.nh_gw = *addr;
+ dev_hold(dev);
+ f6i->fib6_nh.nh_dev = dev;
+ f6i->fib6_dst.addr = *addr;
+ f6i->fib6_dst.plen = 128;
tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
- rt->rt6i_table = fib6_get_table(net, tb_id);
+ f6i->fib6_table = fib6_get_table(net, tb_id);
- return rt;
+ return f6i;
}
/* remove deleted ip from prefsrc entries */
@@ -3630,18 +3770,18 @@ struct arg_dev_net_ip {
struct in6_addr *addr;
};
-static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
+static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->dst.dev == dev || !dev) &&
- rt != net->ipv6.ip6_null_entry &&
- ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+ if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
+ rt != net->ipv6.fib6_null_entry &&
+ ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
spin_lock_bh(&rt6_exception_lock);
/* remove prefsrc entry */
- rt->rt6i_prefsrc.plen = 0;
+ rt->fib6_prefsrc.plen = 0;
/* need to update cache as well */
rt6_exceptions_remove_prefsrc(rt);
spin_unlock_bh(&rt6_exception_lock);
@@ -3663,12 +3803,12 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
/* Remove routers and update dst entries when gateway turn into host. */
-static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
+static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
- if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+ if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+ ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
return -1;
}
@@ -3694,85 +3834,85 @@ struct arg_netdev_event {
};
};
-static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
+static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
struct fib6_node *fn;
- fn = rcu_dereference_protected(rt->rt6i_node,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
iter = rcu_dereference_protected(fn->leaf,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
- if (iter->rt6i_metric == rt->rt6i_metric &&
- rt6_qualify_for_ecmp(iter))
+ if (iter->fib6_metric == rt->fib6_metric &&
+ iter->fib6_nsiblings)
return iter;
- iter = rcu_dereference_protected(iter->rt6_next,
- lockdep_is_held(&rt->rt6i_table->tb6_lock));
+ iter = rcu_dereference_protected(iter->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
return NULL;
}
-static bool rt6_is_dead(const struct rt6_info *rt)
+static bool rt6_is_dead(const struct fib6_info *rt)
{
- if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
- (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
- rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+ if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
+ fib6_ignore_linkdown(rt)))
return true;
return false;
}
-static int rt6_multipath_total_weight(const struct rt6_info *rt)
+static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
int total = 0;
if (!rt6_is_dead(rt))
- total += rt->rt6i_nh_weight;
+ total += rt->fib6_nh.nh_weight;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
- total += iter->rt6i_nh_weight;
+ total += iter->fib6_nh.nh_weight;
}
return total;
}
-static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
+static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
- *weight += rt->rt6i_nh_weight;
+ *weight += rt->fib6_nh.nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
- atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
+ atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
-static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
+static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
int weight = 0;
rt6_upper_bound_set(rt, &weight, total);
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
rt6_upper_bound_set(iter, &weight, total);
}
-void rt6_multipath_rebalance(struct rt6_info *rt)
+void rt6_multipath_rebalance(struct fib6_info *rt)
{
- struct rt6_info *first;
+ struct fib6_info *first;
int total;
/* In case the entire multipath route was marked for flushing,
* then there is no need to rebalance upon the removal of every
* sibling route.
*/
- if (!rt->rt6i_nsiblings || rt->should_flush)
+ if (!rt->fib6_nsiblings || rt->should_flush)
return;
/* During lookup routes are evaluated in order, so we need to
@@ -3787,14 +3927,14 @@ void rt6_multipath_rebalance(struct rt6_info *rt)
rt6_multipath_upper_bound_set(first, total);
}
-static int fib6_ifup(struct rt6_info *rt, void *p_arg)
+static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
const struct arg_netdev_event *arg = p_arg;
- const struct net *net = dev_net(arg->dev);
+ struct net *net = dev_net(arg->dev);
- if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
- rt->rt6i_nh_flags &= ~arg->nh_flags;
- fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+ if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
+ rt->fib6_nh.nh_flags &= ~arg->nh_flags;
+ fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@@ -3816,95 +3956,96 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
-static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
+static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
const struct net_device *dev)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
- if (rt->dst.dev == dev)
+ if (rt->fib6_nh.nh_dev == dev)
return true;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
- if (iter->dst.dev == dev)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh.nh_dev == dev)
return true;
return false;
}
-static void rt6_multipath_flush(struct rt6_info *rt)
+static void rt6_multipath_flush(struct fib6_info *rt)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
rt->should_flush = 1;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
iter->should_flush = 1;
}
-static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
+static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
const struct net_device *down_dev)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
unsigned int dead = 0;
- if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh.nh_dev == down_dev ||
+ rt->fib6_nh.nh_flags & RTNH_F_DEAD)
dead++;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
- if (iter->dst.dev == down_dev ||
- iter->rt6i_nh_flags & RTNH_F_DEAD)
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh.nh_dev == down_dev ||
+ iter->fib6_nh.nh_flags & RTNH_F_DEAD)
dead++;
return dead;
}
-static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
+static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
const struct net_device *dev,
unsigned int nh_flags)
{
- struct rt6_info *iter;
+ struct fib6_info *iter;
- if (rt->dst.dev == dev)
- rt->rt6i_nh_flags |= nh_flags;
- list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
- if (iter->dst.dev == dev)
- iter->rt6i_nh_flags |= nh_flags;
+ if (rt->fib6_nh.nh_dev == dev)
+ rt->fib6_nh.nh_flags |= nh_flags;
+ list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
+ if (iter->fib6_nh.nh_dev == dev)
+ iter->fib6_nh.nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
-static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
+static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
const struct arg_netdev_event *arg = p_arg;
const struct net_device *dev = arg->dev;
- const struct net *net = dev_net(dev);
+ struct net *net = dev_net(dev);
- if (rt == net->ipv6.ip6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry)
return 0;
switch (arg->event) {
case NETDEV_UNREGISTER:
- return rt->dst.dev == dev ? -1 : 0;
+ return rt->fib6_nh.nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
- if (!rt->rt6i_nsiblings)
- return rt->dst.dev == dev ? -1 : 0;
+ if (!rt->fib6_nsiblings)
+ return rt->fib6_nh.nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
count = rt6_multipath_dead_count(rt, dev);
- if (rt->rt6i_nsiblings + 1 == count) {
+ if (rt->fib6_nsiblings + 1 == count) {
rt6_multipath_flush(rt);
return -1;
}
rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
RTNH_F_LINKDOWN);
- fib6_update_sernum(rt);
+ fib6_update_sernum(net, rt);
rt6_multipath_rebalance(rt);
}
return -2;
case NETDEV_CHANGE:
- if (rt->dst.dev != dev ||
- rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+ if (rt->fib6_nh.nh_dev != dev ||
+ rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
- rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@@ -3936,7 +4077,7 @@ struct rt6_mtu_change_arg {
unsigned int mtu;
};
-static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
+static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -3956,12 +4097,15 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
Since RFC 1981 doesn't include administrative MTU increase
update PMTU increase is a MUST. (i.e. jumbo frame)
*/
- if (rt->dst.dev == arg->dev &&
- !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+ if (rt->fib6_nh.nh_dev == arg->dev &&
+ !fib6_metric_locked(rt, RTAX_MTU)) {
+ u32 mtu = rt->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+
spin_lock_bh(&rt6_exception_lock);
- if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
- rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
- dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
spin_unlock_bh(&rt6_exception_lock);
}
@@ -3993,6 +4137,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_UID] = { .type = NLA_U32 },
[RTA_MARK] = { .type = NLA_U32 },
[RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_IP_PROTO] = { .type = NLA_U8 },
+ [RTA_SPORT] = { .type = NLA_U16 },
+ [RTA_DPORT] = { .type = NLA_U16 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4122,9 +4269,8 @@ errout:
}
struct rt6_nh {
- struct rt6_info *rt6_info;
+ struct fib6_info *fib6_info;
struct fib6_config r_cfg;
- struct mx6_config mxc;
struct list_head next;
};
@@ -4139,23 +4285,25 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
}
}
-static int ip6_route_info_append(struct list_head *rt6_nh_list,
- struct rt6_info *rt, struct fib6_config *r_cfg)
+static int ip6_route_info_append(struct net *net,
+ struct list_head *rt6_nh_list,
+ struct fib6_info *rt,
+ struct fib6_config *r_cfg)
{
struct rt6_nh *nh;
int err = -EEXIST;
list_for_each_entry(nh, rt6_nh_list, next) {
- /* check if rt6_info already exists */
- if (rt6_duplicate_nexthop(nh->rt6_info, rt))
+ /* check if fib6_info already exists */
+ if (rt6_duplicate_nexthop(nh->fib6_info, rt))
return err;
}
nh = kzalloc(sizeof(*nh), GFP_KERNEL);
if (!nh)
return -ENOMEM;
- nh->rt6_info = rt;
- err = ip6_convert_metrics(&nh->mxc, r_cfg);
+ nh->fib6_info = rt;
+ err = ip6_convert_metrics(net, rt, r_cfg);
if (err) {
kfree(nh);
return err;
@@ -4166,8 +4314,8 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
return 0;
}
-static void ip6_route_mpath_notify(struct rt6_info *rt,
- struct rt6_info *rt_last,
+static void ip6_route_mpath_notify(struct fib6_info *rt,
+ struct fib6_info *rt_last,
struct nl_info *info,
__u16 nlflags)
{
@@ -4177,10 +4325,10 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
- if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
- rt = list_first_entry(&rt_last->rt6i_siblings,
- struct rt6_info,
- rt6i_siblings);
+ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
+ rt = list_first_entry(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
if (rt)
@@ -4190,11 +4338,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
static int ip6_route_multipath_add(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt_notif = NULL, *rt_last = NULL;
+ struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
- struct rt6_info *rt;
+ struct fib6_info *rt;
struct rt6_nh *err_nh;
struct rt6_nh *nh, *nh_safe;
__u16 nlflags;
@@ -4214,7 +4362,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = (struct rtnexthop *)cfg->fc_mp;
/* Parse a Multipath Entry and build a list (rt6_nh_list) of
- * rt6_info structs per nexthop
+ * fib6_info structs per nexthop
*/
while (rtnh_ok(rtnh, remaining)) {
memcpy(&r_cfg, cfg, sizeof(*cfg));
@@ -4237,18 +4385,19 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
}
r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
- rt = ip6_route_info_create(&r_cfg, extack);
+ rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
goto cleanup;
}
- rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
+ rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
- err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
+ err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
+ rt, &r_cfg);
if (err) {
- dst_release_immediate(&rt->dst);
+ fib6_info_release(rt);
goto cleanup;
}
@@ -4263,14 +4412,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
err_nh = NULL;
list_for_each_entry(nh, &rt6_nh_list, next) {
- rt_last = nh->rt6_info;
- err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
- /* save reference to first route for notification */
- if (!rt_notif && !err)
- rt_notif = nh->rt6_info;
-
- /* nh->rt6_info is used or freed at this point, reset to NULL*/
- nh->rt6_info = NULL;
+ err = __ip6_ins_rt(nh->fib6_info, info, extack);
+ fib6_info_release(nh->fib6_info);
+
+ if (!err) {
+ /* save reference to last route successfully inserted */
+ rt_last = nh->fib6_info;
+
+ /* save reference to first route for notification */
+ if (!rt_notif)
+ rt_notif = nh->fib6_info;
+ }
+
+ /* nh->fib6_info is used or freed at this point, reset to NULL*/
+ nh->fib6_info = NULL;
if (err) {
if (replace && nhn)
ip6_print_replace_route_err(&rt6_nh_list);
@@ -4287,6 +4442,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
nhn++;
}
@@ -4311,9 +4467,8 @@ add_errout:
cleanup:
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
- if (nh->rt6_info)
- dst_release_immediate(&nh->rt6_info->dst);
- kfree(nh->mxc.mx);
+ if (nh->fib6_info)
+ fib6_info_release(nh->fib6_info);
list_del(&nh->next);
kfree(nh);
}
@@ -4390,20 +4545,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (cfg.fc_mp)
return ip6_route_multipath_add(&cfg, extack);
else
- return ip6_route_add(&cfg, extack);
+ return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
int nexthop_len = 0;
- if (rt->rt6i_nsiblings) {
+ if (rt->fib6_nsiblings) {
nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
+ NLA_ALIGN(sizeof(struct rtnexthop))
+ nla_total_size(16) /* RTA_GATEWAY */
- + lwtunnel_get_encap_size(rt->dst.lwtstate);
+ + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
- nexthop_len *= rt->rt6i_nsiblings;
+ nexthop_len *= rt->fib6_nsiblings;
}
return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4419,38 +4574,41 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->dst.lwtstate)
+ + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
+ nexthop_len;
}
-static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
unsigned int *flags, bool skip_oif)
{
- if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
*flags |= RTNH_F_DEAD;
- if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
+ if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
*flags |= RTNH_F_LINKDOWN;
- if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+
+ rcu_read_lock();
+ if (fib6_ignore_linkdown(rt))
*flags |= RTNH_F_DEAD;
+ rcu_read_unlock();
}
- if (rt->rt6i_flags & RTF_GATEWAY) {
- if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+ if (rt->fib6_flags & RTF_GATEWAY) {
+ if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
goto nla_put_failure;
}
- *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
- if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
+ *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
+ if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
*flags |= RTNH_F_OFFLOAD;
/* not needed for multipath encoding b/c it has a rtnexthop struct */
- if (!skip_oif && rt->dst.dev &&
- nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ if (!skip_oif && rt->fib6_nh.nh_dev &&
+ nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
goto nla_put_failure;
- if (rt->dst.lwtstate &&
- lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+ if (rt->fib6_nh.nh_lwtstate &&
+ lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
goto nla_put_failure;
return 0;
@@ -4460,8 +4618,9 @@ nla_put_failure:
}
/* add multipath next hop */
-static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
+ const struct net_device *dev = rt->fib6_nh.nh_dev;
struct rtnexthop *rtnh;
unsigned int flags = 0;
@@ -4469,8 +4628,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
if (!rtnh)
goto nla_put_failure;
- rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
- rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
+ rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
+ rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
goto nla_put_failure;
@@ -4486,16 +4645,16 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int rt6_fill_node(struct net *net,
- struct sk_buff *skb, struct rt6_info *rt,
- struct in6_addr *dst, struct in6_addr *src,
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+ struct fib6_info *rt, struct dst_entry *dst,
+ struct in6_addr *dest, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags)
{
- u32 metrics[RTAX_MAX];
struct rtmsg *rtm;
struct nlmsghdr *nlh;
- long expires;
+ long expires = 0;
+ u32 *pmetrics;
u32 table;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
@@ -4504,53 +4663,31 @@ static int rt6_fill_node(struct net *net,
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
- rtm->rtm_dst_len = rt->rt6i_dst.plen;
- rtm->rtm_src_len = rt->rt6i_src.plen;
+ rtm->rtm_dst_len = rt->fib6_dst.plen;
+ rtm->rtm_src_len = rt->fib6_src.plen;
rtm->rtm_tos = 0;
- if (rt->rt6i_table)
- table = rt->rt6i_table->tb6_id;
+ if (rt->fib6_table)
+ table = rt->fib6_table->tb6_id;
else
table = RT6_TABLE_UNSPEC;
rtm->rtm_table = table;
if (nla_put_u32(skb, RTA_TABLE, table))
goto nla_put_failure;
- if (rt->rt6i_flags & RTF_REJECT) {
- switch (rt->dst.error) {
- case -EINVAL:
- rtm->rtm_type = RTN_BLACKHOLE;
- break;
- case -EACCES:
- rtm->rtm_type = RTN_PROHIBIT;
- break;
- case -EAGAIN:
- rtm->rtm_type = RTN_THROW;
- break;
- default:
- rtm->rtm_type = RTN_UNREACHABLE;
- break;
- }
- }
- else if (rt->rt6i_flags & RTF_LOCAL)
- rtm->rtm_type = RTN_LOCAL;
- else if (rt->rt6i_flags & RTF_ANYCAST)
- rtm->rtm_type = RTN_ANYCAST;
- else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
- rtm->rtm_type = RTN_LOCAL;
- else
- rtm->rtm_type = RTN_UNICAST;
+
+ rtm->rtm_type = rt->fib6_type;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = rt->rt6i_protocol;
+ rtm->rtm_protocol = rt->fib6_protocol;
- if (rt->rt6i_flags & RTF_CACHE)
+ if (rt->fib6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
- if (dst) {
- if (nla_put_in6_addr(skb, RTA_DST, dst))
+ if (dest) {
+ if (nla_put_in6_addr(skb, RTA_DST, dest))
goto nla_put_failure;
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
- if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
+ if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
if (src) {
@@ -4558,12 +4695,12 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len &&
- nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
+ nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
goto nla_put_failure;
#endif
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
- if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+ if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
int err = ip6mr_get_route(net, skb, rtm, portid);
if (err == 0)
@@ -4574,34 +4711,32 @@ static int rt6_fill_node(struct net *net,
#endif
if (nla_put_u32(skb, RTA_IIF, iif))
goto nla_put_failure;
- } else if (dst) {
+ } else if (dest) {
struct in6_addr saddr_buf;
- if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
+ if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
goto nla_put_failure;
}
- if (rt->rt6i_prefsrc.plen) {
+ if (rt->fib6_prefsrc.plen) {
struct in6_addr saddr_buf;
- saddr_buf = rt->rt6i_prefsrc.addr;
+ saddr_buf = rt->fib6_prefsrc.addr;
if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
goto nla_put_failure;
}
- memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
- if (rt->rt6i_pmtu)
- metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
- if (rtnetlink_put_metrics(skb, metrics) < 0)
+ pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
+ if (rtnetlink_put_metrics(skb, pmetrics) < 0)
goto nla_put_failure;
- if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
+ if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
goto nla_put_failure;
/* For multipath routes, walk the siblings list and add
* each as a nexthop within RTA_MULTIPATH.
*/
- if (rt->rt6i_nsiblings) {
- struct rt6_info *sibling, *next_sibling;
+ if (rt->fib6_nsiblings) {
+ struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
mp = nla_nest_start(skb, RTA_MULTIPATH);
@@ -4612,7 +4747,7 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
- &rt->rt6i_siblings, rt6i_siblings) {
+ &rt->fib6_siblings, fib6_siblings) {
if (rt6_add_nexthop(skb, sibling) < 0)
goto nla_put_failure;
}
@@ -4623,12 +4758,15 @@ static int rt6_fill_node(struct net *net,
goto nla_put_failure;
}
- expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
+ if (rt->fib6_flags & RTF_EXPIRES) {
+ expires = dst ? dst->expires : rt->expires;
+ expires -= jiffies;
+ }
- if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
+ if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
- if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
+ if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
goto nla_put_failure;
@@ -4640,12 +4778,12 @@ nla_put_failure:
return -EMSGSIZE;
}
-int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
struct net *net = arg->net;
- if (rt == net->ipv6.ip6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry)
return 0;
if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
@@ -4653,16 +4791,15 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
/* user wants prefix routes only */
if (rtm->rtm_flags & RTM_F_PREFIX &&
- !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
/* success since this is not a prefix route */
return 1;
}
}
- return rt6_fill_node(net,
- arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
- NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
+ return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4671,6 +4808,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[RTA_MAX+1];
int err, iif = 0, oif = 0;
+ struct fib6_info *from;
struct dst_entry *dst;
struct rt6_info *rt;
struct sk_buff *skb;
@@ -4718,6 +4856,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
else
fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+ if (tb[RTA_SPORT])
+ fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+ if (tb[RTA_DPORT])
+ fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &fl6.flowi6_proto, extack);
+ if (err)
+ goto errout;
+ }
+
if (iif) {
struct net_device *dev;
int flags = 0;
@@ -4759,14 +4910,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
goto errout;
}
- if (fibmatch && rt->from) {
- struct rt6_info *ort = rt->from;
-
- dst_hold(&ort->dst);
- ip6_rt_put(rt);
- rt = ort;
- }
-
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
ip6_rt_put(rt);
@@ -4775,14 +4918,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
skb_dst_set(skb, &rt->dst);
+
+ rcu_read_lock();
+ from = rcu_dereference(rt->from);
+
if (fibmatch)
- err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
+ err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0);
else
- err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
- RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0);
+ err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
+ &fl6.saddr, iif, RTM_NEWROUTE,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ 0);
+ rcu_read_unlock();
+
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -4793,7 +4943,7 @@ errout:
return err;
}
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
unsigned int nlm_flags)
{
struct sk_buff *skb;
@@ -4808,8 +4958,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
if (!skb)
goto errout;
- err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
- event, info->portid, seq, nlm_flags);
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ event, info->portid, seq, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -4834,6 +4984,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
+ net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5010,11 +5161,17 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
+ net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.fib6_null_entry)
+ goto out_ip6_dst_entries;
+
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
GFP_KERNEL);
if (!net->ipv6.ip6_null_entry)
- goto out_ip6_dst_entries;
+ goto out_fib6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
@@ -5061,6 +5218,8 @@ out_ip6_prohibit_entry:
out_ip6_null_entry:
kfree(net->ipv6.ip6_null_entry);
#endif
+out_fib6_null_entry:
+ kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
@@ -5069,6 +5228,7 @@ out_ip6_dst_ops:
static void __net_exit ip6_route_net_exit(struct net *net)
{
+ kfree(net->ipv6.fib6_null_entry);
kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
kfree(net->ipv6.ip6_prohibit_entry);
@@ -5141,6 +5301,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
+ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 7f5621d09571..0fdf2a55e746 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -226,7 +226,6 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
nla_put_failure:
rcu_read_unlock();
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -ENOMEM;
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index bf4763fd68c2..19ccf0dc996c 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,6 +91,24 @@ static void set_tun_src(struct net *net, struct net_device *dev,
rcu_read_unlock();
}
+/* Compute flowlabel for outer IPv6 header.
+ *
+ * The per-netns seg6_flowlabel sysctl selects the behaviour:
+ * > 0: derive the label from the skb flow hash
+ * == 0: copy the inner header's label when the inner packet is IPv6
+ * otherwise (negative, or zero with a non-IPv6 inner packet): label is 0
+ *
+ * Returns the flow label, already masked with IPV6_FLOWLABEL_MASK in the
+ * hash case.
+ */
+static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
+ struct ipv6hdr *inner_hdr)
+{
+ int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
+ __be32 flowlabel = 0;
+ u32 hash;
+
+ if (do_flowlabel > 0) {
+ hash = skb_get_hash(skb);
+ /* rol32() returns the rotated word; it does not modify its argument */
+ hash = rol32(hash, 16);
+ flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+ } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
+ flowlabel = ip6_flowlabel(inner_hdr);
+ }
+ return flowlabel;
+}
+
/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
{
@@ -99,6 +117,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
struct ipv6hdr *hdr, *inner_hdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, tot_len, err;
+ __be32 flowlabel;
hdrlen = (osrh->hdrlen + 1) << 3;
tot_len = hdrlen + sizeof(*hdr);
@@ -108,6 +127,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
return err;
inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
skb_push(skb, tot_len);
skb_reset_network_header(skb);
@@ -121,10 +141,10 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
if (skb->protocol == htons(ETH_P_IPV6)) {
ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
- ip6_flowlabel(inner_hdr));
+ flowlabel);
hdr->hop_limit = inner_hdr->hop_limit;
} else {
- ip6_flow_hdr(hdr, 0, 0);
+ ip6_flow_hdr(hdr, 0, flowlabel);
hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
}
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 45722327375a..cd6e4cab63f6 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,8 +1,9 @@
/*
* SR-IPv6 implementation
*
- * Author:
+ * Authors:
* David Lebrun <david.lebrun@uclouvain.be>
+ * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
*
*
* This program is free software; you can redistribute it and/or
@@ -30,7 +31,9 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <net/seg6_local.h>
#include <linux/etherdevice.h>
+#include <linux/bpf.h>
struct seg6_local_lwt;
@@ -41,6 +44,11 @@ struct seg6_action_desc {
int static_headroom;
};
+/* BPF program attached to a SEG6_LOCAL_ACTION_END_BPF route */
+struct bpf_lwt_prog {
+ struct bpf_prog *prog; /* program run by input_action_end_bpf() */
+ char *name; /* copy of the netlink-supplied name; kfree()d on destroy */
+};
+
struct seg6_local_lwt {
int action;
struct ipv6_sr_hdr *srh;
@@ -49,6 +57,7 @@ struct seg6_local_lwt {
struct in6_addr nh6;
int iif;
int oif;
+ struct bpf_lwt_prog bpf;
int headroom;
struct seg6_action_desc *desc;
@@ -140,8 +149,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
*daddr = *addr;
}
-static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
- u32 tbl_id)
+int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+ u32 tbl_id)
{
struct net *net = dev_net(skb->dev);
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -187,6 +196,7 @@ out:
skb_dst_drop(skb);
skb_dst_set(skb, dst);
+ return dst->error;
}
/* regular endpoint function */
@@ -200,7 +210,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- lookup_nexthop(skb, NULL, 0);
+ seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
@@ -220,7 +230,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- lookup_nexthop(skb, &slwt->nh6, 0);
+ seg6_lookup_nexthop(skb, &slwt->nh6, 0);
return dst_input(skb);
@@ -239,7 +249,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- lookup_nexthop(skb, NULL, slwt->table);
+ seg6_lookup_nexthop(skb, NULL, slwt->table);
return dst_input(skb);
@@ -331,7 +341,7 @@ static int input_action_end_dx6(struct sk_buff *skb,
if (!ipv6_addr_any(&slwt->nh6))
nhaddr = &slwt->nh6;
- lookup_nexthop(skb, nhaddr, 0);
+ seg6_lookup_nexthop(skb, nhaddr, 0);
return dst_input(skb);
drop:
@@ -380,7 +390,7 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- lookup_nexthop(skb, NULL, slwt->table);
+ seg6_lookup_nexthop(skb, NULL, slwt->table);
return dst_input(skb);
@@ -406,7 +416,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
- lookup_nexthop(skb, NULL, 0);
+ seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
@@ -438,7 +448,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
- lookup_nexthop(skb, NULL, 0);
+ seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
@@ -447,6 +457,71 @@ drop:
return err;
}
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
+/* Input handler for SEG6_LOCAL_ACTION_END_BPF.
+ *
+ * Validates the SRH, advances to the next segment, runs the BPF program
+ * attached to @slwt on @skb and then forwards or drops the packet
+ * according to the program's return code.
+ *
+ * Returns the result of dst_input() on success. On any failure the skb
+ * is freed and -EINVAL is returned.
+ */
+static int input_action_end_bpf(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct seg6_bpf_srh_state local_srh_state;
+ struct ipv6_sr_hdr *srh;
+ int srhoff = 0;
+ int ret;
+
+ srh = get_and_validate_srh(skb);
+ if (!srh)
+ goto drop;
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ /* preempt_disable is needed to protect the per-CPU buffer srh_state,
+ * which is also accessed by the bpf_lwt_seg6_* helpers
+ */
+ preempt_disable();
+ /* seed the per-CPU state: srh->hdrlen scaled by 8, marked valid */
+ srh_state->hdrlen = srh->hdrlen << 3;
+ srh_state->valid = 1;
+
+ rcu_read_lock();
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
+ rcu_read_unlock();
+
+ /* snapshot the per-CPU state before preemption is re-enabled */
+ local_srh_state = *srh_state;
+ preempt_enable();
+
+ /* only BPF_OK and BPF_REDIRECT let the packet continue */
+ switch (ret) {
+ case BPF_OK:
+ case BPF_REDIRECT:
+ break;
+ case BPF_DROP:
+ goto drop;
+ default:
+ pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
+ goto drop;
+ }
+
+ /* the length left in the state must be a multiple of 8 */
+ if (unlikely((local_srh_state.hdrlen & 7) != 0))
+ goto drop;
+
+ /* locate the routing header again and store the recorded length in it */
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ goto drop;
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
+
+ /* re-validate the SRH only when the state is no longer marked valid */
+ if (!local_srh_state.valid &&
+ unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+ goto drop;
+
+ /* NOTE(review): the lookup is skipped on BPF_REDIRECT, presumably
+ * because the program already set a dst on the skb; confirm against
+ * the bpf_lwt_seg6_* helpers.
+ */
+ if (ret != BPF_REDIRECT)
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
static struct seg6_action_desc seg6_action_table[] = {
{
.action = SEG6_LOCAL_ACTION_END,
@@ -493,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
.attrs = (1 << SEG6_LOCAL_SRH),
.input = input_action_end_b6_encap,
.static_headroom = sizeof(struct ipv6hdr),
- }
+ },
+ {
+ .action = SEG6_LOCAL_ACTION_END_BPF,
+ .attrs = (1 << SEG6_LOCAL_BPF),
+ .input = input_action_end_bpf,
+ },
+
};
static struct seg6_action_desc *__get_action_desc(int action)
@@ -538,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
.len = sizeof(struct in6_addr) },
[SEG6_LOCAL_IIF] = { .type = NLA_U32 },
[SEG6_LOCAL_OIF] = { .type = NLA_U32 },
+ [SEG6_LOCAL_BPF] = { .type = NLA_NESTED },
};
static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -715,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
+/* Upper bound used for the program name attribute */
+#define MAX_PROG_NAME 256
+/* Netlink policy for the attributes nested inside SEG6_LOCAL_BPF */
+static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
+ [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, },
+ [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+/* Parse the nested SEG6_LOCAL_BPF attribute into @slwt->bpf.
+ *
+ * Both SEG6_LOCAL_BPF_PROG (a u32 passed to bpf_prog_get_type() as a file
+ * descriptor) and SEG6_LOCAL_BPF_PROG_NAME must be present. On success
+ * @slwt holds a duplicated name and a BPF_PROG_TYPE_LWT_SEG6LOCAL program.
+ *
+ * Returns 0 on success or a negative errno: the nla_parse_nested() error,
+ * -EINVAL for a missing attribute, -ENOMEM if the name cannot be
+ * duplicated, or the error from bpf_prog_get_type().
+ */
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+ struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX,
+ attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
+ return -EINVAL;
+
+ slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
+ if (!slwt->bpf.name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
+ p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
+ if (IS_ERR(p)) {
+ /* the name was already duplicated above; release it on failure */
+ kfree(slwt->bpf.name);
+ return PTR_ERR(p);
+ }
+
+ slwt->bpf.prog = p;
+ return 0;
+}
+
+/* Dump the BPF settings of @slwt as a nested SEG6_LOCAL_BPF attribute.
+ *
+ * Nothing is emitted (and 0 is returned) when no program is attached.
+ * Returns -EMSGSIZE when the nest cannot be started or an attribute
+ * cannot be added, otherwise the value of nla_nest_end().
+ */
+static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct bpf_lwt_prog *bpf = &slwt->bpf;
+ struct nlattr *nest;
+
+ if (!bpf->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
+ if (!nest)
+ return -EMSGSIZE;
+
+ /* program id first, then the optional name; stop at the first failure */
+ if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, bpf->prog->aux->id) ||
+ (bpf->name &&
+ nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, bpf->name)))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+/* Compare the BPF program names of two seg6local states.
+ *
+ * Returns 0 when both names are NULL, 1 when exactly one is NULL, and the
+ * strcmp() result when both are set.
+ */
+static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ const char *lhs = a->bpf.name;
+ const char *rhs = b->bpf.name;
+
+ if (lhs && rhs)
+ return strcmp(lhs, rhs);
+
+ /* at least one name is missing: equal only if both are */
+ return (lhs || rhs) ? 1 : 0;
+}
+
struct seg6_action_param {
int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -745,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_OIF] = { .parse = parse_nla_oif,
.put = put_nla_oif,
.cmp = cmp_nla_oif },
+
+ [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
+ .put = put_nla_bpf,
+ .cmp = cmp_nla_bpf },
+
};
static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -830,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
kfree(slwt->srh);
+
+ if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
+ kfree(slwt->bpf.name);
+ bpf_prog_put(slwt->bpf.prog);
+ }
+
+ return;
}
static int seg6_local_fill_encap(struct sk_buff *skb,
@@ -882,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
if (attrs & (1 << SEG6_LOCAL_OIF))
nlsize += nla_total_size(4);
+ if (attrs & (1 << SEG6_LOCAL_BPF))
+ nlsize += nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) +
+ nla_total_size(4);
+
return nlsize;
}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 6fbdef630152..e15cd37024fd 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -152,6 +152,13 @@ static struct ctl_table ipv6_table_template[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "seg6_flowlabel",
+ .data = &init_net.ipv6.sysctl.seg6_flowlabel,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -217,6 +224,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
+ ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d2ce66b23430..b620d9b72e59 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
unsigned int tot_len = sizeof(struct tcphdr);
struct dst_entry *dst;
__be32 *topt;
+ __u32 mark = 0;
if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif;
}
- fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
+ if (sk)
+ mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
@@ -1318,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb));
+ tcp_rcv_established(sk, skb);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 00e2112da26d..164afd31aebf 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -285,9 +285,7 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
-#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
- IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
- IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
@@ -546,10 +544,10 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
-static struct static_key udpv6_encap_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void)
{
- static_key_enable(&udpv6_encap_needed);
+ static_branch_enable(&udpv6_encap_needed_key);
}
EXPORT_SYMBOL(udpv6_encap_enable);
@@ -561,7 +559,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
- if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -1023,7 +1021,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
* Sending
*/
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+ struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct udphdr *uh;
@@ -1042,12 +1041,32 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
uh->len = htons(len);
uh->check = 0;
+ if (cork->gso_size) {
+ const int hlen = skb_network_header_len(skb) +
+ sizeof(struct udphdr);
+
+ if (hlen + cork->gso_size > cork->fragsize)
+ return -EINVAL;
+ if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ return -EINVAL;
+ if (udp_sk(sk)->no_check6_tx)
+ return -EINVAL;
+ if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+ dst_xfrm(skb_dst(skb)))
+ return -EIO;
+
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ goto csum_partial;
+ }
+
if (is_udplite)
csum = udplite_csum(skb);
else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
goto send;
} else
@@ -1093,7 +1112,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
out:
up->len = 0;
@@ -1127,6 +1146,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.hlimit = -1;
ipc6.tclass = -1;
ipc6.dontfrag = -1;
+ ipc6.gso_size = up->gso_size;
sockc.tsflags = sk->sk_tsflags;
/* destination address check */
@@ -1259,7 +1279,10 @@ do_udp_sendmsg:
opt->tot_len = sizeof(*opt);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+ err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
+ if (err > 0)
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ &ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1291,6 +1314,29 @@ do_udp_sendmsg:
fl6.saddr = np->saddr;
fl6.fl6_sport = inet->inet_sport;
+ if (cgroup_bpf_enabled && !connected) {
+ err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+ (struct sockaddr *)sin6, &fl6.saddr);
+ if (err)
+ goto out_no_dst;
+ if (sin6) {
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+ /* BPF program rewrote IPv6-only by IPv4-mapped
+ * IPv6. It's currently unsupported.
+ */
+ err = -ENOTSUPP;
+ goto out_no_dst;
+ }
+ if (sin6->sin6_port == 0) {
+ /* BPF program set invalid port. Reject it. */
+ err = -EINVAL;
+ goto out_no_dst;
+ }
+ fl6.fl6_dport = sin6->sin6_port;
+ fl6.daddr = sin6->sin6_addr;
+ }
+ }
+
final_p = fl6_update_dst(&fl6, opt, &final);
if (final_p)
connected = false;
@@ -1324,15 +1370,16 @@ back_from_confirm:
/* Lockless fast path for the non-corking case */
if (!corkreq) {
+ struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc);
+ msg->msg_flags, &cork, &sockc);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6);
+ err = udp_v6_send_skb(skb, &fl6, &cork.base);
goto out;
}
@@ -1369,6 +1416,7 @@ do_append_data:
out:
dst_release(dst);
+out_no_dst:
fl6_sock_release(flowlabel);
txopt_put(opt_to_free);
if (!err)
@@ -1402,7 +1450,7 @@ void udpv6_destroy_sock(struct sock *sk)
udp_v6_flush_pending_frames(sk);
release_sock(sk);
- if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+ if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
void (*encap_destroy)(struct sock *sk);
encap_destroy = READ_ONCE(up->encap_destroy);
if (encap_destroy)
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a04dc9c781b..03a2ff3fe1e6 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -42,12 +42,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
const struct ipv6hdr *ipv6h;
struct udphdr *uh;
- if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
goto out;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+ return __udp_gso_segment(skb, features);
+
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
*/
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 86dba282a147..ef3defaf43b9 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
* it was magically lost, so this code needs audit */
xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
RTF_LOCAL);
- xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
- xdst->u.rt6.rt6i_node = rt->rt6i_node;
xdst->route_cookie = rt6_get_cookie(rt);
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 16f434791763..5bdca3d5d6b7 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -60,11 +60,9 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
static int
__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
{
- int i;
+ int count[XFRM_MAX_DEPTH] = { };
int class[XFRM_MAX_DEPTH];
- int count[maxclass];
-
- memset(count, 0, sizeof(count));
+ int i;
for (i = 0; i < n; i++) {
int c;
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 7f1e842ef05a..e87686f7d63c 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -57,6 +57,10 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->session)
+ l2tp_session_dec_refcount(pd->session);
+
pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
pd->session_idx++;
@@ -105,11 +109,16 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
if (!pd || pd == SEQ_START_TOKEN)
return;
- /* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */
+ /* Drop reference taken by last invocation of l2tp_dfs_next_session()
+ * or l2tp_dfs_next_tunnel().
+ */
+ if (pd->session) {
+ l2tp_session_dec_refcount(pd->session);
+ pd->session = NULL;
+ }
if (pd->tunnel) {
l2tp_tunnel_dec_refcount(pd->tunnel);
pd->tunnel = NULL;
- pd->session = NULL;
}
}
@@ -250,13 +259,10 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
goto out;
}
- /* Show the tunnel or session context */
- if (!pd->session) {
+ if (!pd->session)
l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
- } else {
+ else
l2tp_dfs_seq_session_show(m, pd->session);
- l2tp_session_dec_refcount(pd->session);
- }
out:
return 0;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 3d8ca1231f8f..b56cb1df4fc0 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -428,16 +428,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
*/
static void pppol2tp_session_close(struct l2tp_session *session)
{
- struct pppol2tp_session *ps;
-
- ps = l2tp_session_priv(session);
- mutex_lock(&ps->sk_lock);
- ps->__sk = rcu_dereference_protected(ps->sk,
- lockdep_is_held(&ps->sk_lock));
- RCU_INIT_POINTER(ps->sk, NULL);
- if (ps->__sk)
- call_rcu(&ps->rcu, pppol2tp_put_sk);
- mutex_unlock(&ps->sk_lock);
}
/* Really kill the session socket. (Called from sock_put() if
@@ -480,15 +470,24 @@ static int pppol2tp_release(struct socket *sock)
sock_orphan(sk);
sock->sk = NULL;
- /* If the socket is associated with a session,
- * l2tp_session_delete will call pppol2tp_session_close which
- * will drop the session's ref on the socket.
- */
session = pppol2tp_sock_to_session(sk);
if (session) {
+ struct pppol2tp_session *ps;
+
l2tp_session_delete(session);
- /* drop the ref obtained by pppol2tp_sock_to_session */
- sock_put(sk);
+
+ ps = l2tp_session_priv(session);
+ mutex_lock(&ps->sk_lock);
+ ps->__sk = rcu_dereference_protected(ps->sk,
+ lockdep_is_held(&ps->sk_lock));
+ RCU_INIT_POINTER(ps->sk, NULL);
+ mutex_unlock(&ps->sk_lock);
+ call_rcu(&ps->rcu, pppol2tp_put_sk);
+
+ /* Rely on the sock_put() call at the end of the function for
+ * dropping the reference held by pppol2tp_sock_to_session().
+ * The last reference will be dropped by pppol2tp_put_sk().
+ */
}
release_sock(sk);
@@ -742,7 +741,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
*/
mutex_lock(&ps->sk_lock);
if (rcu_dereference_protected(ps->sk,
- lockdep_is_held(&ps->sk_lock))) {
+ lockdep_is_held(&ps->sk_lock)) ||
+ ps->__sk) {
mutex_unlock(&ps->sk_lock);
error = -EEXIST;
goto end;
@@ -803,7 +803,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
out_no_ppp:
/* This is how we get the session context from the socket. */
- sock_hold(sk);
sk->sk_user_data = session;
rcu_assign_pointer(ps->sk, sk);
mutex_unlock(&ps->sk_lock);
@@ -1576,6 +1575,10 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
{
+ /* Drop reference taken during previous invocation */
+ if (pd->session)
+ l2tp_session_dec_refcount(pd->session);
+
pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
pd->session_idx++;
@@ -1624,11 +1627,16 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v)
if (!pd || pd == SEQ_START_TOKEN)
return;
- /* Drop reference taken by last invocation of pppol2tp_next_tunnel() */
+ /* Drop reference taken by last invocation of pppol2tp_next_session()
+ * or pppol2tp_next_tunnel().
+ */
+ if (pd->session) {
+ l2tp_session_dec_refcount(pd->session);
+ pd->session = NULL;
+ }
if (pd->tunnel) {
l2tp_tunnel_dec_refcount(pd->tunnel);
pd->tunnel = NULL;
- pd->session = NULL;
}
}
@@ -1723,14 +1731,10 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
goto out;
}
- /* Show the tunnel or session context.
- */
- if (!pd->session) {
+ if (!pd->session)
pppol2tp_seq_tunnel_show(m, pd->tunnel);
- } else {
+ else
pppol2tp_seq_session_show(m, pd->session);
- l2tp_session_dec_refcount(pd->session);
- }
out:
return 0;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 85dbaa891059..bdf6fa78d0d2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -695,7 +695,7 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
if (sta) {
ret = 0;
memcpy(mac, sta->sta.addr, ETH_ALEN);
- sta_set_sinfo(sta, sinfo);
+ sta_set_sinfo(sta, sinfo, true);
}
mutex_unlock(&local->sta_mtx);
@@ -724,7 +724,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
sta = sta_info_get_bss(sdata, mac);
if (sta) {
ret = 0;
- sta_set_sinfo(sta, sinfo);
+ sta_set_sinfo(sta, sinfo, true);
}
mutex_unlock(&local->sta_mtx);
@@ -2376,6 +2376,11 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
(WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
+ if (changed & (WIPHY_PARAM_TXQ_LIMIT |
+ WIPHY_PARAM_TXQ_MEMORY_LIMIT |
+ WIPHY_PARAM_TXQ_QUANTUM))
+ ieee80211_txq_set_params(local);
+
return 0;
}
@@ -3705,6 +3710,99 @@ static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
return 0;
}
+/* Copy one counter into @stats unless its NL80211_TXQ_STATS_* bit is
+ * already set in ->filled; the bit is set when the copy happens.
+ */
+#define TXQ_STAT_FILL_ONCE(stats, attr, member, value) \
+ do { \
+ if (!((stats)->filled & BIT(NL80211_TXQ_STATS_##attr))) { \
+ (stats)->filled |= BIT(NL80211_TXQ_STATS_##attr); \
+ (stats)->member = (value); \
+ } \
+ } while (0)
+
+/* Fill @txqstats from @txqi. Counters whose bit is already present in
+ * txqstats->filled are left untouched; every other counter handled here
+ * is copied from txqi->tin or txqi->cstats and flagged as filled.
+ */
+void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
+ struct txq_info *txqi)
+{
+ TXQ_STAT_FILL_ONCE(txqstats, BACKLOG_BYTES, backlog_bytes,
+ txqi->tin.backlog_bytes);
+ TXQ_STAT_FILL_ONCE(txqstats, BACKLOG_PACKETS, backlog_packets,
+ txqi->tin.backlog_packets);
+ TXQ_STAT_FILL_ONCE(txqstats, FLOWS, flows, txqi->tin.flows);
+ TXQ_STAT_FILL_ONCE(txqstats, DROPS, drops, txqi->cstats.drop_count);
+ TXQ_STAT_FILL_ONCE(txqstats, ECN_MARKS, ecn_marks,
+ txqi->cstats.ecn_mark);
+ TXQ_STAT_FILL_ONCE(txqstats, OVERLIMIT, overlimit,
+ txqi->tin.overlimit);
+ TXQ_STAT_FILL_ONCE(txqstats, COLLISIONS, collisions,
+ txqi->tin.collisions);
+ TXQ_STAT_FILL_ONCE(txqstats, TX_BYTES, tx_bytes, txqi->tin.tx_bytes);
+ TXQ_STAT_FILL_ONCE(txqstats, TX_PACKETS, tx_packets,
+ txqi->tin.tx_packets);
+}
+
+#undef TXQ_STAT_FILL_ONCE
+
+/* cfg80211 .get_txq_stats handler.
+ *
+ * With @wdev set, reports the statistics of that interface's vif txq via
+ * ieee80211_fill_txq_stats(); with @wdev NULL, reports the counters kept
+ * in local->fq.
+ *
+ * Returns 0 on success and 1 when the driver has no wake_tx_queue op or
+ * the interface has no vif txq.
+ */
+static int ieee80211_get_txq_stats(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_txq_stats *txqstats)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata;
+ int ret = 0;
+
+ if (!local->ops->wake_tx_queue)
+ return 1;
+
+ /* counters are read with fq.lock held (BH disabled) inside an RCU
+ * read-side section; both are released at the out label
+ */
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ if (wdev) {
+ sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ if (!sdata->vif.txq) {
+ ret = 1;
+ goto out;
+ }
+ ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq));
+ } else {
+ /* phy stats */
+ txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) |
+ BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) |
+ BIT(NL80211_TXQ_STATS_OVERLIMIT) |
+ BIT(NL80211_TXQ_STATS_OVERMEMORY) |
+ BIT(NL80211_TXQ_STATS_COLLISIONS) |
+ BIT(NL80211_TXQ_STATS_MAX_FLOWS);
+ txqstats->backlog_packets = local->fq.backlog;
+ txqstats->backlog_bytes = local->fq.memory_usage;
+ txqstats->overlimit = local->fq.overlimit;
+ txqstats->overmemory = local->fq.overmemory;
+ txqstats->collisions = local->fq.collisions;
+ txqstats->max_flows = local->fq.flows_cnt;
+ }
+
+out:
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+
+ return ret;
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3798,4 +3896,5 @@ const struct cfg80211_ops mac80211_config_ops = {
.del_nan_func = ieee80211_del_nan_func,
.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
.tx_control_port = ieee80211_tx_control_port,
+ .get_txq_stats = ieee80211_get_txq_stats,
};
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 4d82fe7d627c..8f6998091d26 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -2,6 +2,7 @@
/*
* Portions of this file
* Copyright(c) 2016 Intel Deutschland GmbH
+* Copyright (C) 2018 Intel Corporation
*/
#ifndef __MAC80211_DRIVER_OPS
@@ -813,7 +814,8 @@ drv_allow_buffered_frames(struct ieee80211_local *local,
}
static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
+ struct ieee80211_sub_if_data *sdata,
+ u16 duration)
{
might_sleep();
@@ -821,9 +823,9 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
return;
WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
- trace_drv_mgd_prepare_tx(local, sdata);
+ trace_drv_mgd_prepare_tx(local, sdata, duration);
if (local->ops->mgd_prepare_tx)
- local->ops->mgd_prepare_tx(&local->hw, &sdata->vif);
+ local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration);
trace_drv_return_void(local);
}
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 9cc986deda61..690c142a7a44 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -4,6 +4,7 @@
* Copied from cfg.c - originally
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2014 Intel Corporation (Author: Johannes Berg)
+ * Copyright (C) 2018 Intel Corporation
*
* This file is GPLv2 as found in COPYING.
*/
@@ -106,8 +107,8 @@ static void ieee80211_get_stats(struct net_device *dev,
if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
goto do_survey;
- sinfo.filled = 0;
- sta_set_sinfo(sta, &sinfo);
+ memset(&sinfo, 0, sizeof(sinfo));
+ sta_set_sinfo(sta, &sinfo, false);
i = 0;
ADD_STA_STATS(sta);
@@ -116,11 +117,11 @@ static void ieee80211_get_stats(struct net_device *dev,
if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
- data[i] = 100000 *
+ data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.txrate);
i++;
if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
- data[i] = 100000 *
+ data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.rxrate);
i++;
@@ -133,8 +134,8 @@ static void ieee80211_get_stats(struct net_device *dev,
if (sta->sdata->dev != dev)
continue;
- sinfo.filled = 0;
- sta_set_sinfo(sta, &sinfo);
+ memset(&sinfo, 0, sizeof(sinfo));
+ sta_set_sinfo(sta, &sinfo, false);
i = 0;
ADD_STA_STATS(sta);
}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c78036a0ac94..26a7ba3b698f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -301,26 +301,27 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
___ieee80211_stop_tx_ba_session(sta, i, reason);
mutex_unlock(&sta->ampdu_mlme.mtx);
- /* stopping might queue the work again - so cancel only afterwards */
- cancel_work_sync(&sta->ampdu_mlme.work);
-
/*
* In case the tear down is part of a reconfigure due to HW restart
* request, it is possible that the low level driver requested to stop
* the BA session, so handle it to properly clean tid_tx data.
*/
- mutex_lock(&sta->ampdu_mlme.mtx);
- for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
- struct tid_ampdu_tx *tid_tx =
- rcu_dereference_protected_tid_tx(sta, i);
+ if(reason == AGG_STOP_DESTROY_STA) {
+ cancel_work_sync(&sta->ampdu_mlme.work);
- if (!tid_tx)
- continue;
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ struct tid_ampdu_tx *tid_tx =
+ rcu_dereference_protected_tid_tx(sta, i);
- if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
- ieee80211_stop_tx_ba_cb(sta, i, tid_tx);
+ if (!tid_tx)
+ continue;
+
+ if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
+ ieee80211_stop_tx_ba_cb(sta, i, tid_tx);
+ }
+ mutex_unlock(&sta->ampdu_mlme.mtx);
}
- mutex_unlock(&sta->ampdu_mlme.mtx);
}
void ieee80211_ba_session_work(struct work_struct *work)
@@ -328,16 +329,11 @@ void ieee80211_ba_session_work(struct work_struct *work)
struct sta_info *sta =
container_of(work, struct sta_info, ampdu_mlme.work);
struct tid_ampdu_tx *tid_tx;
+ bool blocked;
int tid;
- /*
- * When this flag is set, new sessions should be
- * blocked, and existing sessions will be torn
- * down by the code that set the flag, so this
- * need not run.
- */
- if (test_sta_flag(sta, WLAN_STA_BLOCK_BA))
- return;
+ /* When this flag is set, new sessions should be blocked. */
+ blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA);
mutex_lock(&sta->ampdu_mlme.mtx);
for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
@@ -352,7 +348,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
sta, tid, WLAN_BACK_RECIPIENT,
WLAN_REASON_UNSPECIFIED, true);
- if (test_and_clear_bit(tid,
+ if (!blocked &&
+ test_and_clear_bit(tid,
sta->ampdu_mlme.tid_rx_manage_offl))
___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
IEEE80211_MAX_AMPDU_BUF,
@@ -367,7 +364,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
spin_lock_bh(&sta->lock);
tid_tx = sta->ampdu_mlme.tid_start_tx[tid];
- if (tid_tx) {
+ if (!blocked && tid_tx) {
/*
* Assign it over to the normal tid_tx array
* where it "goes live".
@@ -390,7 +387,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
if (!tid_tx)
continue;
- if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
+ if (!blocked &&
+ test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
ieee80211_start_tx_ba_cb(sta, tid, tid_tx);
if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state))
___ieee80211_stop_tx_ba_session(sta, tid,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6372dbdadf53..d1978aa1c15d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2012,6 +2012,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
}
int ieee80211_txq_setup_flows(struct ieee80211_local *local);
+void ieee80211_txq_set_params(struct ieee80211_local *local);
void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
@@ -2020,6 +2021,8 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
struct txq_info *txqi);
void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
+void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
+ struct txq_info *txqi);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
u16 transaction, u16 auth_alg, u16 status,
const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 9ea17afaa237..4d2e797e3f16 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -565,6 +565,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
if (!ops->set_key)
wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
+ if (ops->wake_tx_queue)
+ wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
+
wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 233068756502..a59187c016e0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -864,7 +864,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
return;
}
- drv_mgd_prepare_tx(local, sdata);
+ drv_mgd_prepare_tx(local, sdata, 0);
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
@@ -2022,7 +2022,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
*/
if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) &&
!ifmgd->have_beacon)
- drv_mgd_prepare_tx(sdata->local, sdata);
+ drv_mgd_prepare_tx(sdata->local, sdata, 0);
ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
reason, tx, frame_buf);
@@ -2560,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
if (!elems.challenge)
return;
auth_data->expected_transaction = 4;
- drv_mgd_prepare_tx(sdata->local, sdata);
+ drv_mgd_prepare_tx(sdata->local, sdata, 0);
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
IEEE80211_TX_INTFL_MLME_CONN_TX;
@@ -3769,6 +3769,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
u32 tx_flags = 0;
u16 trans = 1;
u16 status = 0;
+ u16 prepare_tx_duration = 0;
sdata_assert_lock(sdata);
@@ -3790,7 +3791,11 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
return -ETIMEDOUT;
}
- drv_mgd_prepare_tx(local, sdata);
+ if (auth_data->algorithm == WLAN_AUTH_SAE)
+ prepare_tx_duration =
+ jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE);
+
+ drv_mgd_prepare_tx(local, sdata, prepare_tx_duration);
sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
auth_data->bss->bssid, auth_data->tries,
@@ -4994,7 +4999,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
req->bssid, req->reason_code,
ieee80211_get_reason_code_string(req->reason_code));
- drv_mgd_prepare_tx(sdata->local, sdata);
+ drv_mgd_prepare_tx(sdata->local, sdata, 0);
ieee80211_send_deauth_disassoc(sdata, req->bssid,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
@@ -5014,7 +5019,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
req->bssid, req->reason_code,
ieee80211_get_reason_code_string(req->reason_code));
- drv_mgd_prepare_tx(sdata->local, sdata);
+ drv_mgd_prepare_tx(sdata->local, sdata, 0);
ieee80211_send_deauth_disassoc(sdata, req->bssid,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 03102aff0953..0a38cc1cbebc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -5,6 +5,7 @@
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -97,27 +98,27 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
*/
static void remove_monitor_info(struct sk_buff *skb,
unsigned int present_fcs_len,
- unsigned int rtap_vendor_space)
+ unsigned int rtap_space)
{
if (present_fcs_len)
__pskb_trim(skb, skb->len - present_fcs_len);
- __pskb_pull(skb, rtap_vendor_space);
+ __pskb_pull(skb, rtap_space);
}
static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
- unsigned int rtap_vendor_space)
+ unsigned int rtap_space)
{
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
struct ieee80211_hdr *hdr;
- hdr = (void *)(skb->data + rtap_vendor_space);
+ hdr = (void *)(skb->data + rtap_space);
if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
RX_FLAG_FAILED_PLCP_CRC |
RX_FLAG_ONLY_MONITOR))
return true;
- if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
+ if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
return true;
if (ieee80211_is_ctl(hdr->frame_control) &&
@@ -199,7 +200,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
- int rtap_vendor_space)
+ int rtap_space)
{
struct {
struct ieee80211_hdr_3addr hdr;
@@ -212,14 +213,14 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
- if (skb->len < rtap_vendor_space + sizeof(action) +
+ if (skb->len < rtap_space + sizeof(action) +
VHT_MUMIMO_GROUPS_DATA_LEN)
return;
if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
return;
- skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
+ skb_copy_bits(skb, rtap_space, &action, sizeof(action));
if (!ieee80211_is_action(action.hdr.frame_control))
return;
@@ -545,7 +546,7 @@ static struct sk_buff *
ieee80211_make_monitor_skb(struct ieee80211_local *local,
struct sk_buff **origskb,
struct ieee80211_rate *rate,
- int rtap_vendor_space, bool use_origskb)
+ int rtap_space, bool use_origskb)
{
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb);
int rt_hdrlen, needed_headroom;
@@ -553,7 +554,7 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local,
/* room for the radiotap header based on driver features */
rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb);
- needed_headroom = rt_hdrlen - rtap_vendor_space;
+ needed_headroom = rt_hdrlen - rtap_space;
if (use_origskb) {
/* only need to expand headroom if necessary */
@@ -607,7 +608,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
struct ieee80211_sub_if_data *sdata;
struct sk_buff *monskb = NULL;
int present_fcs_len = 0;
- unsigned int rtap_vendor_space = 0;
+ unsigned int rtap_space = 0;
struct ieee80211_sub_if_data *monitor_sdata =
rcu_dereference(local->monitor_sdata);
bool only_monitor = false;
@@ -615,7 +616,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
- rtap_vendor_space = sizeof(*rtap) + rtap->len + rtap->pad;
+ rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
}
/*
@@ -638,13 +639,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
}
/* ensure hdr->frame_control and vendor radiotap data are in skb head */
- if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) {
+ if (!pskb_may_pull(origskb, 2 + rtap_space)) {
dev_kfree_skb(origskb);
return NULL;
}
- only_monitor = should_drop_frame(origskb, present_fcs_len,
- rtap_vendor_space);
+ only_monitor = should_drop_frame(origskb, present_fcs_len, rtap_space);
if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
if (only_monitor) {
@@ -652,12 +652,11 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
return NULL;
}
- remove_monitor_info(origskb, present_fcs_len,
- rtap_vendor_space);
+ remove_monitor_info(origskb, present_fcs_len, rtap_space);
return origskb;
}
- ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
+ ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space);
list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) {
bool last_monitor = list_is_last(&sdata->u.mntr.list,
@@ -665,8 +664,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (!monskb)
monskb = ieee80211_make_monitor_skb(local, &origskb,
- rate,
- rtap_vendor_space,
+ rate, rtap_space,
only_monitor &&
last_monitor);
@@ -698,7 +696,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
if (!origskb)
return NULL;
- remove_monitor_info(origskb, present_fcs_len, rtap_vendor_space);
+ remove_monitor_info(origskb, present_fcs_len, rtap_space);
return origskb;
}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 655c3d8b0d80..6428f1ac37b6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -3,6 +3,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -357,6 +358,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
sta->last_connected = ktime_get_seconds();
ewma_signal_init(&sta->rx_stats_avg.signal);
+ ewma_avg_signal_init(&sta->status_stats.avg_ack_signal);
for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++)
ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]);
@@ -1006,7 +1008,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
if (sinfo)
- sta_set_sinfo(sta, sinfo);
+ sta_set_sinfo(sta, sinfo, true);
cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
kfree(sinfo);
@@ -1992,7 +1994,6 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
int band = STA_STATS_GET(LEGACY_BAND, rate);
int rate_idx = STA_STATS_GET(LEGACY_IDX, rate);
- rinfo->flags = 0;
sband = local->hw.wiphy->bands[band];
brate = sband->bitrates[rate_idx].bitrate;
if (rinfo->bw == RATE_INFO_BW_5)
@@ -2051,6 +2052,18 @@ static void sta_set_tidstats(struct sta_info *sta,
tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid];
}
+
+ if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) {
+ spin_lock_bh(&local->fq.lock);
+ rcu_read_lock();
+
+ tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
+ ieee80211_fill_txq_stats(&tidstats->txq_stats,
+ to_txq_info(sta->sta.txq[tid]));
+
+ rcu_read_unlock();
+ spin_unlock_bh(&local->fq.lock);
+ }
}
static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
@@ -2066,7 +2079,8 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
return value;
}
-void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
+ bool tidstats)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
@@ -2220,11 +2234,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
}
- sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
- for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
- struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
+ if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
+ struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
- sta_set_tidstats(sta, tidstats, i);
+ sta_set_tidstats(sta, tidstats, i);
+ }
}
if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -2294,6 +2309,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->ack_signal = sta->status_stats.last_ack_signal;
sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
}
+
+ if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) &&
+ !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) {
+ sinfo->avg_ack_signal =
+ -(s8)ewma_avg_signal_read(
+ &sta->status_stats.avg_ack_signal);
+ sinfo->filled |=
+ BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG);
+ }
}
u32 sta_get_expected_throughput(struct sta_info *sta)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index f64eb86ca64b..81b35f623792 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -119,6 +119,7 @@ enum ieee80211_sta_info_flags {
#define HT_AGG_STATE_START_CB 6
#define HT_AGG_STATE_STOP_CB 7
+DECLARE_EWMA(avg_signal, 10, 8)
enum ieee80211_agg_stop_reason {
AGG_STOP_DECLINED,
AGG_STOP_LOCAL_REQUEST,
@@ -550,6 +551,7 @@ struct sta_info {
unsigned long last_ack;
s8 last_ack_signal;
bool ack_signal_filled;
+ struct ewma_avg_signal avg_ack_signal;
} status_stats;
/* Updated from TX path only, no locking requirements */
@@ -742,7 +744,8 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
void sta_set_rate_info_tx(struct sta_info *sta,
const struct ieee80211_tx_rate *rate,
struct rate_info *rinfo);
-void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
+ bool tidstats);
u32 sta_get_expected_throughput(struct sta_info *sta);
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 743e89c5926c..9a6d7208bf4f 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -195,6 +195,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
sta->status_stats.last_ack_signal =
(s8)txinfo->status.ack_signal;
sta->status_stats.ack_signal_filled = true;
+ ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
+ -txinfo->status.ack_signal);
}
}
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 591ad02e1fa4..80a7edf8d314 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2,6 +2,7 @@
/*
* Portions of this file
* Copyright(c) 2016 Intel Deutschland GmbH
+* Copyright (C) 2018 Intel Corporation
*/
#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
@@ -1413,11 +1414,29 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
TP_ARGS(local, sta, tids, num_frames, reason, more_data)
);
-DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx,
+TRACE_EVENT(drv_mgd_prepare_tx,
TP_PROTO(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata),
+ struct ieee80211_sub_if_data *sdata,
+ u16 duration),
- TP_ARGS(local, sdata)
+ TP_ARGS(local, sdata, duration),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, duration)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->duration = duration;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " duration: %u",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration
+ )
);
DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 05a265cd573d..44b5dfe8727d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1460,6 +1460,24 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
}
+void ieee80211_txq_set_params(struct ieee80211_local *local)
+{ /* Sync fq limit, memory_limit and quantum with the wiphy txq_* fields, one parameter at a time. */
+ if (local->hw.wiphy->txq_limit) /* non-zero wiphy value overrides the fq value */
+ local->fq.limit = local->hw.wiphy->txq_limit;
+ else /* zero wiphy value: publish the current fq value back to the wiphy */
+ local->hw.wiphy->txq_limit = local->fq.limit;
+
+ if (local->hw.wiphy->txq_memory_limit) /* same rule as for txq_limit */
+ local->fq.memory_limit = local->hw.wiphy->txq_memory_limit;
+ else
+ local->hw.wiphy->txq_memory_limit = local->fq.memory_limit;
+
+ if (local->hw.wiphy->txq_quantum) /* same rule as for txq_limit */
+ local->fq.quantum = local->hw.wiphy->txq_quantum;
+ else
+ local->hw.wiphy->txq_quantum = local->fq.quantum;
+}
+
int ieee80211_txq_setup_flows(struct ieee80211_local *local)
{
struct fq *fq = &local->fq;
@@ -1509,6 +1527,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
for (i = 0; i < fq->flows_cnt; i++)
codel_vars_init(&local->cvars[i]);
+ ieee80211_txq_set_params(local);
+
return 0;
}
@@ -4085,6 +4105,31 @@ unlock:
}
EXPORT_SYMBOL(ieee80211_csa_update_counter);
+void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
+{ /* Lower the csa_current_counter of @vif's beacon to @counter; the value is never raised. */
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct beacon_data *beacon = NULL;
+
+ rcu_read_lock(); /* the beacon pointers below are read with rcu_dereference() */
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ beacon = rcu_dereference(sdata->u.ap.beacon);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ beacon = rcu_dereference(sdata->u.ibss.presp);
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ beacon = rcu_dereference(sdata->u.mesh.beacon);
+
+ if (!beacon) /* vif type not handled above, or no beacon data is set */
+ goto unlock;
+
+ if (counter < beacon->csa_current_counter) /* ignore values that are not smaller than the current one */
+ beacon->csa_current_counter = counter;
+
+unlock:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_csa_set_counter);
+
bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 11f9cfc016d9..2d82c88efd0b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2793,12 +2793,13 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
memset(&ri, 0, sizeof(ri));
+ ri.bw = status->bw;
+
/* Fill cfg80211 rate info */
switch (status->encoding) {
case RX_ENC_HT:
ri.mcs = status->rate_idx;
ri.flags |= RATE_INFO_FLAGS_MCS;
- ri.bw = status->bw;
if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
break;
@@ -2806,7 +2807,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
ri.mcs = status->rate_idx;
ri.nss = status->nss;
- ri.bw = status->bw;
if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
break;
@@ -2818,8 +2818,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
int shift = 0;
int bitrate;
- ri.bw = status->bw;
-
switch (status->bw) {
case RATE_INFO_BW_10:
shift = 1;
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8da84312cd3b..8055e3965cef 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,15 +68,6 @@ enum {
NCSI_MODE_MAX
};
-enum {
- NCSI_FILTER_BASE = 0,
- NCSI_FILTER_VLAN = 0,
- NCSI_FILTER_UC,
- NCSI_FILTER_MC,
- NCSI_FILTER_MIXED,
- NCSI_FILTER_MAX
-};
-
struct ncsi_channel_version {
u32 version; /* Supported BCD encoded NCSI version */
u32 alpha2; /* Supported BCD encoded NCSI version */
@@ -98,11 +89,18 @@ struct ncsi_channel_mode {
u32 data[8]; /* Data entries */
};
-struct ncsi_channel_filter {
- u32 index; /* Index of channel filters */
- u32 total; /* Total entries in the filter table */
- u64 bitmap; /* Bitmap of valid entries */
- u32 data[]; /* Data for the valid entries */
+struct ncsi_channel_mac_filter { /* per-channel MAC filter state; embedded in struct ncsi_channel */
+ u8 n_uc; /* presumably the number of unicast filter entries -- TODO confirm */
+ u8 n_mc; /* presumably the number of multicast filter entries -- TODO confirm */
+ u8 n_mixed; /* presumably the number of mixed filter entries -- TODO confirm */
+ u64 bitmap; /* NOTE(review): likely one bit per used addrs slot, as in the VLAN filter -- confirm */
+ unsigned char *addrs; /* address storage; kfree()d in ncsi_remove_channel() */
+};
+
+struct ncsi_channel_vlan_filter { /* per-channel VLAN filter state; embedded in struct ncsi_channel */
+ u8 n_vids; /* number of slots in vids[]; upper bound for the bitmap scans */
+ u64 bitmap; /* bit i set => vids[i] holds an active VLAN ID */
+ u16 *vids; /* VLAN ID per slot, zeroed when cleared; kfree()d in ncsi_remove_channel() */
 };
struct ncsi_channel_stats {
@@ -186,7 +184,9 @@ struct ncsi_channel {
struct ncsi_channel_version version;
struct ncsi_channel_cap caps[NCSI_CAP_MAX];
struct ncsi_channel_mode modes[NCSI_MODE_MAX];
- struct ncsi_channel_filter *filters[NCSI_FILTER_MAX];
+ /* Filtering Settings */
+ struct ncsi_channel_mac_filter mac_filter;
+ struct ncsi_channel_vlan_filter vlan_filter;
struct ncsi_channel_stats stats;
struct {
struct timer_list timer;
@@ -320,10 +320,6 @@ extern spinlock_t ncsi_dev_lock;
list_for_each_entry_rcu(nc, &np->channels, node)
/* Resources */
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index);
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
void ncsi_start_channel_monitor(struct ncsi_channel *nc);
void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c3695ba0cf94..5561e221b71f 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -27,125 +27,6 @@
LIST_HEAD(ncsi_dev_list);
DEFINE_SPINLOCK(ncsi_dev_lock);
-static inline int ncsi_filter_size(int table)
-{
- int sizes[] = { 2, 6, 6, 6 };
-
- BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
- if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
- return -EINVAL;
-
- return sizes[table];
-}
-
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
-{
- struct ncsi_channel_filter *ncf;
- int size;
-
- ncf = nc->filters[table];
- if (!ncf)
- return NULL;
-
- size = ncsi_filter_size(table);
- if (size < 0)
- return NULL;
-
- return ncf->data + size * index;
-}
-
-/* Find the first active filter in a filter table that matches the given
- * data parameter. If data is NULL, this returns the first active filter.
- */
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
-{
- struct ncsi_channel_filter *ncf;
- void *bitmap;
- int index, size;
- unsigned long flags;
-
- ncf = nc->filters[table];
- if (!ncf)
- return -ENXIO;
-
- size = ncsi_filter_size(table);
- if (size < 0)
- return size;
-
- spin_lock_irqsave(&nc->lock, flags);
- bitmap = (void *)&ncf->bitmap;
- index = -1;
- while ((index = find_next_bit(bitmap, ncf->total, index + 1))
- < ncf->total) {
- if (!data || !memcmp(ncf->data + size * index, data, size)) {
- spin_unlock_irqrestore(&nc->lock, flags);
- return index;
- }
- }
- spin_unlock_irqrestore(&nc->lock, flags);
-
- return -ENOENT;
-}
-
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
-{
- struct ncsi_channel_filter *ncf;
- int index, size;
- void *bitmap;
- unsigned long flags;
-
- size = ncsi_filter_size(table);
- if (size < 0)
- return size;
-
- index = ncsi_find_filter(nc, table, data);
- if (index >= 0)
- return index;
-
- ncf = nc->filters[table];
- if (!ncf)
- return -ENODEV;
-
- spin_lock_irqsave(&nc->lock, flags);
- bitmap = (void *)&ncf->bitmap;
- do {
- index = find_next_zero_bit(bitmap, ncf->total, 0);
- if (index >= ncf->total) {
- spin_unlock_irqrestore(&nc->lock, flags);
- return -ENOSPC;
- }
- } while (test_and_set_bit(index, bitmap));
-
- memcpy(ncf->data + size * index, data, size);
- spin_unlock_irqrestore(&nc->lock, flags);
-
- return index;
-}
-
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
-{
- struct ncsi_channel_filter *ncf;
- int size;
- void *bitmap;
- unsigned long flags;
-
- size = ncsi_filter_size(table);
- if (size < 0)
- return size;
-
- ncf = nc->filters[table];
- if (!ncf || index >= ncf->total)
- return -ENODEV;
-
- spin_lock_irqsave(&nc->lock, flags);
- bitmap = (void *)&ncf->bitmap;
- if (test_and_clear_bit(index, bitmap))
- memset(ncf->data + size * index, 0, size);
- spin_unlock_irqrestore(&nc->lock, flags);
-
- return 0;
-}
-
static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
{
struct ncsi_dev *nd = &ndp->ndev;
@@ -339,20 +220,13 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
static void ncsi_remove_channel(struct ncsi_channel *nc)
{
struct ncsi_package *np = nc->package;
- struct ncsi_channel_filter *ncf;
unsigned long flags;
- int i;
- /* Release filters */
spin_lock_irqsave(&nc->lock, flags);
- for (i = 0; i < NCSI_FILTER_MAX; i++) {
- ncf = nc->filters[i];
- if (!ncf)
- continue;
- nc->filters[i] = NULL;
- kfree(ncf);
- }
+ /* Release filters */
+ kfree(nc->mac_filter.addrs);
+ kfree(nc->vlan_filter.vids);
nc->state = NCSI_CHANNEL_INACTIVE;
spin_unlock_irqrestore(&nc->lock, flags);
@@ -670,32 +544,26 @@ error:
static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
struct ncsi_cmd_arg *nca)
{
+ struct ncsi_channel_vlan_filter *ncf;
+ unsigned long flags;
+ void *bitmap;
int index;
- u32 *data;
u16 vid;
- index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL);
- if (index < 0) {
- /* Filter table empty */
- return -1;
- }
+ ncf = &nc->vlan_filter;
+ bitmap = &ncf->bitmap;
- data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index);
- if (!data) {
- netdev_err(ndp->ndev.dev,
- "NCSI: failed to retrieve filter %d\n", index);
- /* Set the VLAN id to 0 - this will still disable the entry in
- * the filter table, but we won't know what it was.
- */
- vid = 0;
- } else {
- vid = *(u16 *)data;
+ spin_lock_irqsave(&nc->lock, flags);
+ index = find_next_bit(bitmap, ncf->n_vids, 0);
+ if (index >= ncf->n_vids) {
+ spin_unlock_irqrestore(&nc->lock, flags);
+ return -1;
}
+ vid = ncf->vids[index];
- netdev_printk(KERN_DEBUG, ndp->ndev.dev,
- "NCSI: removed vlan tag %u at index %d\n",
- vid, index + 1);
- ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index);
+ clear_bit(index, bitmap);
+ ncf->vids[index] = 0;
+ spin_unlock_irqrestore(&nc->lock, flags);
nca->type = NCSI_PKT_CMD_SVF;
nca->words[1] = vid;
@@ -711,45 +579,55 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
struct ncsi_cmd_arg *nca)
{
+ struct ncsi_channel_vlan_filter *ncf;
struct vlan_vid *vlan = NULL;
- int index = 0;
+ unsigned long flags;
+ int i, index;
+ void *bitmap;
+ u16 vid;
+ if (list_empty(&ndp->vlan_vids))
+ return -1;
+
+ ncf = &nc->vlan_filter;
+ bitmap = &ncf->bitmap;
+
+ spin_lock_irqsave(&nc->lock, flags);
+
+ rcu_read_lock();
list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) {
- index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
- if (index < 0) {
- /* New tag to add */
- netdev_printk(KERN_DEBUG, ndp->ndev.dev,
- "NCSI: new vlan id to set: %u\n",
- vlan->vid);
+ vid = vlan->vid;
+ for (i = 0; i < ncf->n_vids; i++)
+ if (ncf->vids[i] == vid) {
+ vid = 0;
+ break;
+ }
+ if (vid)
break;
- }
- netdev_printk(KERN_DEBUG, ndp->ndev.dev,
- "vid %u already at filter pos %d\n",
- vlan->vid, index);
}
+ rcu_read_unlock();
- if (!vlan || index >= 0) {
- netdev_printk(KERN_DEBUG, ndp->ndev.dev,
- "no vlan ids left to set\n");
+ if (!vid) {
+ /* No VLAN ID left to set */
+ spin_unlock_irqrestore(&nc->lock, flags);
return -1;
}
- index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
- if (index < 0) {
+ index = find_next_zero_bit(bitmap, ncf->n_vids, 0);
+ if (index < 0 || index >= ncf->n_vids) {
netdev_err(ndp->ndev.dev,
- "Failed to add new VLAN tag, error %d\n", index);
- if (index == -ENOSPC)
- netdev_err(ndp->ndev.dev,
- "Channel %u already has all VLAN filters set\n",
- nc->id);
+ "Channel %u already has all VLAN filters set\n",
+ nc->id);
+ spin_unlock_irqrestore(&nc->lock, flags);
return -1;
}
- netdev_printk(KERN_DEBUG, ndp->ndev.dev,
- "NCSI: set vid %u in packet, index %u\n",
- vlan->vid, index + 1);
+ ncf->vids[index] = vid;
+ set_bit(index, bitmap);
+ spin_unlock_irqrestore(&nc->lock, flags);
+
nca->type = NCSI_PKT_CMD_SVF;
- nca->words[1] = vlan->vid;
+ nca->words[1] = vid;
/* HW filter index starts at 1 */
nca->bytes[6] = index + 1;
nca->bytes[7] = 0x01;
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 41cede4041d3..82e6edf9c5d9 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -58,10 +58,9 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
struct ncsi_dev_priv *ndp,
struct ncsi_channel *nc)
{
- struct nlattr *vid_nest;
- struct ncsi_channel_filter *ncf;
+ struct ncsi_channel_vlan_filter *ncf;
struct ncsi_channel_mode *m;
- u32 *data;
+ struct nlattr *vid_nest;
int i;
nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id);
@@ -79,18 +78,13 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST);
if (!vid_nest)
return -ENOMEM;
- ncf = nc->filters[NCSI_FILTER_VLAN];
+ ncf = &nc->vlan_filter;
i = -1;
- if (ncf) {
- while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total,
- i + 1)) < ncf->total) {
- data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i);
- /* Uninitialised channels will have 'zero' vlan ids */
- if (!data || !*data)
- continue;
+ while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids,
+ i + 1)) < ncf->n_vids) {
+ if (ncf->vids[i])
nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID,
- *(u16 *)data);
- }
+ ncf->vids[i]);
}
nla_nest_end(skb, vid_nest);
@@ -207,7 +201,6 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
return genlmsg_reply(skb, info);
err:
- genlmsg_cancel(skb, hdr);
kfree_skb(skb);
return rc;
}
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index efd933ff5570..930c1d3796f0 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -334,9 +334,9 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
struct ncsi_rsp_pkt *rsp;
struct ncsi_dev_priv *ndp = nr->ndp;
struct ncsi_channel *nc;
- struct ncsi_channel_filter *ncf;
- unsigned short vlan;
- int ret;
+ struct ncsi_channel_vlan_filter *ncf;
+ unsigned long flags;
+ void *bitmap;
/* Find the package and channel */
rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -346,22 +346,23 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
return -ENODEV;
cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
- ncf = nc->filters[NCSI_FILTER_VLAN];
- if (!ncf)
- return -ENOENT;
- if (cmd->index >= ncf->total)
+ ncf = &nc->vlan_filter;
+ if (cmd->index == 0 || cmd->index > ncf->n_vids)
return -ERANGE;
- /* Add or remove the VLAN filter */
+ /* Add or remove the VLAN filter. Remember HW indexes from 1 */
+ spin_lock_irqsave(&nc->lock, flags);
+ bitmap = &ncf->bitmap;
if (!(cmd->enable & 0x1)) {
- /* HW indexes from 1 */
- ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index - 1);
+ if (test_and_clear_bit(cmd->index - 1, bitmap))
+ ncf->vids[cmd->index - 1] = 0;
} else {
- vlan = ntohs(cmd->vlan);
- ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+ set_bit(cmd->index - 1, bitmap);
+ ncf->vids[cmd->index - 1] = ntohs(cmd->vlan);
}
+ spin_unlock_irqrestore(&nc->lock, flags);
- return ret;
+ return 0;
}
static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
@@ -422,8 +423,12 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
struct ncsi_rsp_pkt *rsp;
struct ncsi_dev_priv *ndp = nr->ndp;
struct ncsi_channel *nc;
- struct ncsi_channel_filter *ncf;
+ struct ncsi_channel_mac_filter *ncf;
+ unsigned long flags;
void *bitmap;
+ bool enabled;
+ int index;
+
/* Find the package and channel */
rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -436,31 +441,24 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
* isn't supported yet.
*/
cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd);
- switch (cmd->at_e >> 5) {
- case 0x0: /* UC address */
- ncf = nc->filters[NCSI_FILTER_UC];
- break;
- case 0x1: /* MC address */
- ncf = nc->filters[NCSI_FILTER_MC];
- break;
- default:
- return -EINVAL;
- }
+ enabled = cmd->at_e & 0x1;
+ ncf = &nc->mac_filter;
+ bitmap = &ncf->bitmap;
- /* Sanity check on the filter */
- if (!ncf)
- return -ENOENT;
- else if (cmd->index >= ncf->total)
+ if (cmd->index == 0 ||
+ cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
return -ERANGE;
- bitmap = &ncf->bitmap;
- if (cmd->at_e & 0x1) {
- set_bit(cmd->index, bitmap);
- memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6);
+ index = (cmd->index - 1) * ETH_ALEN;
+ spin_lock_irqsave(&nc->lock, flags);
+ if (enabled) {
+ set_bit(cmd->index - 1, bitmap);
+ memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN);
} else {
- clear_bit(cmd->index, bitmap);
- memset(ncf->data + 6 * cmd->index, 0, 6);
+ clear_bit(cmd->index - 1, bitmap);
+ memset(&ncf->addrs[index], 0, ETH_ALEN);
}
+ spin_unlock_irqrestore(&nc->lock, flags);
return 0;
}
@@ -631,9 +629,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
struct ncsi_rsp_gc_pkt *rsp;
struct ncsi_dev_priv *ndp = nr->ndp;
struct ncsi_channel *nc;
- struct ncsi_channel_filter *ncf;
- size_t size, entry_size;
- int cnt, i;
+ size_t size;
/* Find the channel */
rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp);
@@ -655,64 +651,40 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode &
NCSI_CAP_VLAN_MASK;
- /* Build filters */
- for (i = 0; i < NCSI_FILTER_MAX; i++) {
- switch (i) {
- case NCSI_FILTER_VLAN:
- cnt = rsp->vlan_cnt;
- entry_size = 2;
- break;
- case NCSI_FILTER_MIXED:
- cnt = rsp->mixed_cnt;
- entry_size = 6;
- break;
- case NCSI_FILTER_MC:
- cnt = rsp->mc_cnt;
- entry_size = 6;
- break;
- case NCSI_FILTER_UC:
- cnt = rsp->uc_cnt;
- entry_size = 6;
- break;
- default:
- continue;
- }
-
- if (!cnt || nc->filters[i])
- continue;
-
- size = sizeof(*ncf) + cnt * entry_size;
- ncf = kzalloc(size, GFP_ATOMIC);
- if (!ncf) {
- pr_warn("%s: Cannot alloc filter table (%d)\n",
- __func__, i);
- return -ENOMEM;
- }
-
- ncf->index = i;
- ncf->total = cnt;
- if (i == NCSI_FILTER_VLAN) {
- /* Set VLAN filters active so they are cleared in
- * first configuration state
- */
- ncf->bitmap = U64_MAX;
- } else {
- ncf->bitmap = 0x0ul;
- }
- nc->filters[i] = ncf;
- }
+ size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN;
+ nc->mac_filter.addrs = kzalloc(size, GFP_ATOMIC);
+ if (!nc->mac_filter.addrs)
+ return -ENOMEM;
+ nc->mac_filter.n_uc = rsp->uc_cnt;
+ nc->mac_filter.n_mc = rsp->mc_cnt;
+ nc->mac_filter.n_mixed = rsp->mixed_cnt;
+
+ nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt,
+ sizeof(*nc->vlan_filter.vids),
+ GFP_ATOMIC);
+ if (!nc->vlan_filter.vids)
+ return -ENOMEM;
+ /* Set VLAN filters active so they are cleared in the first
+ * configuration state
+ */
+ nc->vlan_filter.bitmap = U64_MAX;
+ nc->vlan_filter.n_vids = rsp->vlan_cnt;
return 0;
}
static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
{
- struct ncsi_rsp_gp_pkt *rsp;
+ struct ncsi_channel_vlan_filter *ncvf;
+ struct ncsi_channel_mac_filter *ncmf;
struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_rsp_gp_pkt *rsp;
struct ncsi_channel *nc;
- unsigned short enable, vlan;
+ unsigned short enable;
unsigned char *pdata;
- int table, i;
+ unsigned long flags;
+ void *bitmap;
+ int i;
/* Find the channel */
rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp);
@@ -746,36 +718,33 @@ static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
/* MAC addresses filter table */
pdata = (unsigned char *)rsp + 48;
enable = rsp->mac_enable;
+ ncmf = &nc->mac_filter;
+ spin_lock_irqsave(&nc->lock, flags);
+ bitmap = &ncmf->bitmap;
for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) {
- if (i >= (nc->filters[NCSI_FILTER_UC]->total +
- nc->filters[NCSI_FILTER_MC]->total))
- table = NCSI_FILTER_MIXED;
- else if (i >= nc->filters[NCSI_FILTER_UC]->total)
- table = NCSI_FILTER_MC;
- else
- table = NCSI_FILTER_UC;
-
if (!(enable & (0x1 << i)))
- continue;
-
- if (ncsi_find_filter(nc, table, pdata) >= 0)
- continue;
+ clear_bit(i, bitmap);
+ else
+ set_bit(i, bitmap);
- ncsi_add_filter(nc, table, pdata);
+ memcpy(&ncmf->addrs[i * ETH_ALEN], pdata, ETH_ALEN);
}
+ spin_unlock_irqrestore(&nc->lock, flags);
/* VLAN filter table */
enable = ntohs(rsp->vlan_enable);
+ ncvf = &nc->vlan_filter;
+ bitmap = &ncvf->bitmap;
+ spin_lock_irqsave(&nc->lock, flags);
for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) {
if (!(enable & (0x1 << i)))
- continue;
-
- vlan = ntohs(*(__be16 *)pdata);
- if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0)
- continue;
+ clear_bit(i, bitmap);
+ else
+ set_bit(i, bitmap);
- ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+ ncvf->vids[i] = ntohs(*(__be16 *)pdata);
}
+ spin_unlock_irqrestore(&nc->lock, flags);
return 0;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 44d8a55e9721..dbd7d1fad277 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -433,17 +433,16 @@ config NF_NAT_TFTP
default NF_NAT && NF_CONNTRACK_TFTP
config NF_NAT_REDIRECT
- tristate "IPv4/IPv6 redirect support"
- depends on NF_NAT
- help
- This is the kernel functionality to redirect packets to local
- machine through NAT.
+ bool
config NETFILTER_SYNPROXY
tristate
endif # NF_CONNTRACK
+config NF_OSF
+ tristate
+
config NF_TABLES
select NETFILTER_NETLINK
tristate "Netfilter nf_tables support"
@@ -474,24 +473,6 @@ config NF_TABLES_NETDEV
help
This option enables support for the "netdev" table.
-config NFT_EXTHDR
- tristate "Netfilter nf_tables exthdr module"
- help
- This option adds the "exthdr" expression that you can use to match
- IPv6 extension headers and tcp options.
-
-config NFT_META
- tristate "Netfilter nf_tables meta module"
- help
- This option adds the "meta" expression that you can use to match and
- to set packet metainformation such as the packet mark.
-
-config NFT_RT
- tristate "Netfilter nf_tables routing module"
- help
- This option adds the "rt" expression that you can use to match
- packet routing information such as the packet nexthop.
-
config NFT_NUMGEN
tristate "Netfilter nf_tables number generator module"
help
@@ -536,6 +517,15 @@ config NFT_COUNTER
This option adds the "counter" expression that you can use to
include packet and byte counters in a rule.
+config NFT_CONNLIMIT
+ tristate "Netfilter nf_tables connlimit module"
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_CONNCOUNT
+ help
+ This option adds the "connlimit" expression that you can use to
+ rate-limit rule matches per connection.
+
config NFT_LOG
tristate "Netfilter nf_tables log module"
help
@@ -632,6 +622,15 @@ config NFT_FIB_INET
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_SOCKET
+ tristate "Netfilter nf_tables socket match support"
+ depends on IPV6 || IPV6=n
+ select NF_SOCKET_IPV4
+ select NF_SOCKET_IPV6 if IPV6
+ help
+ This option allows matching for the presence or absence of a
+ corresponding socket and its attributes.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -667,8 +666,7 @@ endif # NF_TABLES
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
- depends on NF_FLOW_TABLE_IPV4
- depends on NF_FLOW_TABLE_IPV6
+ depends on NF_FLOW_TABLE
help
This option adds the flow table mixed IPv4/IPv6 support.
@@ -1000,6 +998,8 @@ config NETFILTER_XT_TARGET_TPROXY
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
+ select NF_TPROXY_IPV4
+ select NF_TPROXY_IPV6 if IP6_NF_IPTABLES
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -1378,6 +1378,7 @@ config NETFILTER_XT_MATCH_NFACCT
config NETFILTER_XT_MATCH_OSF
tristate '"osf" Passive OS fingerprint match'
depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+ select NF_OSF
help
This option selects the Passive OS Fingerprinting match module
that allows to passively match the remote operating system by
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fd32bd2c9521..44449389e527 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
-obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -76,13 +76,11 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
-obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
-obj-$(CONFIG_NFT_META) += nft_meta.o
-obj-$(CONFIG_NFT_RT) += nft_rt.o
+obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
@@ -104,6 +102,8 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
obj-$(CONFIG_NFT_FIB) += nft_fib.o
obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
+obj-$(CONFIG_NF_OSF) += nf_osf.o
+obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
@@ -111,6 +111,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
+
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
# generic X tables
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 206fb2c4c319..168af54db975 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -138,11 +138,6 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
continue;
}
- if (reg->nat_hook && orig_ops[i]->nat_hook) {
- kvfree(new);
- return ERR_PTR(-EBUSY);
- }
-
if (inserted || reg->priority > orig_ops[i]->priority) {
new_ops[nhooks] = (void *)orig_ops[i];
new->hooks[nhooks] = old->hooks[i];
@@ -186,9 +181,31 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
#endif
}
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg)
+{
+ struct nf_hook_entries *new_hooks;
+ struct nf_hook_entries *p;
+
+ p = rcu_dereference_raw(*pp);
+ new_hooks = nf_hook_entries_grow(p, reg);
+ if (IS_ERR(new_hooks))
+ return PTR_ERR(new_hooks);
+
+ hooks_validate(new_hooks);
+
+ rcu_assign_pointer(*pp, new_hooks);
+
+ BUG_ON(p == new_hooks);
+ nf_hook_entries_free(p);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);
+
/*
* __nf_hook_entries_try_shrink - try to shrink hook array
*
+ * @old -- current hook blob at @pp
* @pp -- location of hook blob
*
* Hook unregistration must always succeed, so to-be-removed hooks
@@ -201,14 +218,14 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
*
* Returns address to free, or NULL.
*/
-static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
+static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
+ struct nf_hook_entries __rcu **pp)
{
- struct nf_hook_entries *old, *new = NULL;
unsigned int i, j, skip = 0, hook_entries;
+ struct nf_hook_entries *new = NULL;
struct nf_hook_ops **orig_ops;
struct nf_hook_ops **new_ops;
- old = nf_entry_dereference(*pp);
if (WARN_ON_ONCE(!old))
return NULL;
@@ -347,11 +364,10 @@ static int __nf_register_net_hook(struct net *net, int pf,
* This cannot fail, hook unregistration must always succeed.
* Therefore replace the to-be-removed hook with a dummy hook.
*/
-static void nf_remove_net_hook(struct nf_hook_entries *old,
- const struct nf_hook_ops *unreg, int pf)
+static bool nf_remove_net_hook(struct nf_hook_entries *old,
+ const struct nf_hook_ops *unreg)
{
struct nf_hook_ops **orig_ops;
- bool found = false;
unsigned int i;
orig_ops = nf_hook_entries_get_hook_ops(old);
@@ -360,21 +376,10 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
continue;
WRITE_ONCE(old->hooks[i].hook, accept_all);
WRITE_ONCE(orig_ops[i], &dummy_ops);
- found = true;
- break;
+ return true;
}
- if (found) {
-#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
- net_dec_ingress_queue();
-#endif
-#ifdef HAVE_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]);
-#endif
- } else {
- WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum);
- }
+ return false;
}
static void __nf_unregister_net_hook(struct net *net, int pf,
@@ -395,9 +400,19 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
return;
}
- nf_remove_net_hook(p, reg, pf);
+ if (nf_remove_net_hook(p, reg)) {
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ net_dec_ingress_queue();
+#endif
+#ifdef HAVE_JUMP_LABEL
+ static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#endif
+ } else {
+ WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
+ }
- p = __nf_hook_entries_try_shrink(pp);
+ p = __nf_hook_entries_try_shrink(p, pp);
mutex_unlock(&nf_hook_mutex);
if (!p)
return;
@@ -417,6 +432,19 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
}
EXPORT_SYMBOL(nf_unregister_net_hook);
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg)
+{
+ struct nf_hook_entries *p;
+
+ p = rcu_dereference_raw(*pp);
+ if (nf_remove_net_hook(p, reg)) {
+ p = __nf_hook_entries_try_shrink(p, pp);
+ nf_hook_entries_free(p);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);
+
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
int err;
@@ -535,6 +563,9 @@ EXPORT_SYMBOL(skb_make_writable);
struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
+struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
@@ -543,6 +574,9 @@ void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
__rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
+struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_hook);
+
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
void (*attach)(struct sk_buff *, const struct sk_buff *);
@@ -557,17 +591,14 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
-EXPORT_SYMBOL(nf_ct_destroy);
-
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- void (*destroy)(struct nf_conntrack *);
+ struct nf_ct_hook *ct_hook;
rcu_read_lock();
- destroy = rcu_dereference(nf_ct_destroy);
- BUG_ON(destroy == NULL);
- destroy(nfct);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ BUG_ON(ct_hook == NULL);
+ ct_hook->destroy(nfct);
rcu_read_unlock();
}
EXPORT_SYMBOL(nf_conntrack_destroy);
@@ -580,11 +611,6 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = {
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */
-#ifdef CONFIG_NF_NAT_NEEDED
-void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(nf_nat_decode_session_hook);
-#endif
-
static void __net_init
__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
{
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config IP_VS_SH
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_MH
+ tristate "maglev hashing scheduling"
+ ---help---
+ The maglev consistent hashing scheduling algorithm provides
+ Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
+ network connections to the servers through looking up a statically
+ assigned special hash table called the lookup table. Maglev hashing
+ assigns a preference list of all the lookup table positions
+ to each destination.
+
+ Through this operation, the maglev hashing gives an almost equal
+ share of the lookup table to each of the destinations and provides
+ minimal disruption by using the lookup table. When the set of
+ destinations changes, a connection will likely be sent to the same
+ destination as it was before.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
config IP_VS_SED
tristate "shortest expected delay scheduling"
---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
needs to be large enough to effectively fit all the destinations
multiplied by their respective weights.
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+ int "IPVS maglev hashing table index of size (the prime numbers)"
+ range 8 17
+ default 12
+ ---help---
+ The maglev hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is assigned by a preference
+ list of the positions to each destination until all slots in
+ the table are filled. The index determines the prime for size of
+ the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+ 65521 or 131071. When using weights to allow destinations to
+ receive more connections, the table is assigned an amount
+ proportional to the weights specified. The table needs to be large
+ enough to effectively fit all the destinations multiplied by their
+ respective weights.
+
comment 'IPVS application helper'
config IP_VS_FTP
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index c3db074fc1f7..7588aeaa605f 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_out == NULL)
return 1;
- if (!app->pkt_out(app, cp, skb, &diff))
+ if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns false if it can't handle packet (oom)
*/
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_out(cp, skb, app);
+ return app_tcp_pkt_out(cp, skb, app, ipvsh);
/*
* Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_out == NULL)
return 1;
- return app->pkt_out(app, cp, skb, NULL);
+ return app->pkt_out(app, cp, skb, NULL, ipvsh);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_in == NULL)
return 1;
- if (!app->pkt_in(app, cp, skb, &diff))
+ if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns false if can't handle packet (oom).
*/
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_in(cp, skb, app);
+ return app_tcp_pkt_in(cp, skb, app, ipvsh);
/*
* Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_in == NULL)
return 1;
- return app->pkt_in(app, cp, skb, NULL);
+ return app->pkt_in(app, cp, skb, NULL, ipvsh);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 141b1509c948..0c03c0e16a96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
+ /* keep the last_weight with latest non-0 weight */
+ if (add || udest->weight != 0)
+ atomic_set(&dest->last_weight, udest->weight);
+
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/hash.h>
#include <net/ip_vs.h>
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
@@ -44,9 +46,18 @@
#include <net/ip_vs.h>
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
+enum {
+ IP_VS_FTP_ACTIVE = 0,
+ IP_VS_FTP_PORT = 0,
+ IP_VS_FTP_PASV,
+ IP_VS_FTP_EPRT,
+ IP_VS_FTP_EPSV,
+};
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-/* Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+ struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+ if ((th->doff << 2) < sizeof(struct tcphdr))
+ return NULL;
+ return (char *)th + (th->doff << 2);
+}
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
}
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
const char *pattern, size_t plen,
- char skip, char term,
- __be32 *addr, __be16 *port,
- char **start, char **end)
+ char skip, bool ext, int mode,
+ union nf_inet_addr *addr, __be16 *port,
+ __u16 af, char **start, char **end)
{
char *s, c;
unsigned char p[6];
+ char edelim;
+ __u16 hport;
int i = 0;
if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (s == data_limit)
return -1;
if (!found) {
+ /* "(" is optional for non-extended format,
+ * so catch the start of IPv4 address
+ */
+ if (!ext && isdigit(*s))
+ break;
if (*s == skip)
found = 1;
} else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
}
}
+ /* Old IPv4-only format? */
+ if (!ext) {
+ p[0] = 0;
+ for (data = s; ; data++) {
+ if (data == data_limit)
+ return -1;
+ c = *data;
+ if (isdigit(c)) {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
+ i++;
+ p[i] = 0;
+ } else {
+ /* unexpected character or terminator */
+ break;
+ }
+ }
- for (data = s; ; data++) {
- if (data == data_limit)
+ if (i != 5)
return -1;
- if (*data == term)
- break;
+
+ *start = s;
+ *end = data;
+ addr->ip = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
+ return 1;
}
- *end = data;
+ if (s == data_limit)
+ return -1;
+ *start = s;
+ edelim = *s++;
+ if (edelim < 33 || edelim > 126)
+ return -1;
+ if (s == data_limit)
+ return -1;
+ if (*s == edelim) {
+ /* Address family is usually missing for EPSV response */
+ if (mode != IP_VS_FTP_EPSV)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ /* Then address should be missing too */
+ if (*s != edelim)
+ return -1;
+ /* Caller can pre-set addr, if needed */
+ s++;
+ } else {
+ const char *ep;
- memset(p, 0, sizeof(p));
- for (data = s; ; data++) {
- c = *data;
- if (c == term)
- break;
- if (c >= '0' && c <= '9') {
- p[i] = p[i]*10 + c - '0';
- } else if (c == ',' && i < 5) {
- i++;
- } else {
- /* unexpected character */
+ /* We allow address only from same family */
+ if (af == AF_INET6 && *s != '2')
return -1;
+ if (af == AF_INET && *s != '1')
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (af == AF_INET6) {
+ if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
+ } else {
+ if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
}
+ s = (char *) ep;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
}
-
- if (i != 5)
+ for (hport = 0; ; s++)
+ {
+ if (s == data_limit)
+ return -1;
+ if (!isdigit(*s))
+ break;
+ hport = hport * 10 + *s - '0';
+ }
+ if (s == data_limit || !hport || *s != edelim)
return -1;
-
- *start = s;
- *addr = get_unaligned((__be32 *) p);
- *port = get_unaligned((__be16 *) (p + 4));
+ s++;
+ *end = s;
+ *port = htons(hport);
return 1;
}
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
* client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
* The outgoing packet should be something like
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for the EPSV response usually provides only the port:
+ * "229 Entering Extended Passive Mode (|||ppp|)"
*/
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_limit;
char *start, *end;
union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- if (cp->app_data == &ip_vs_ftp_pasv) {
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)th + (th->doff << 2);
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
+
if (ip_vs_ftp_get_addrport(data, data_limit,
- SERVER_STRING,
- sizeof(SERVER_STRING)-1,
- '(', ')',
- &from.ip, &port,
+ SERVER_STRING_PASV,
+ sizeof(SERVER_STRING_PASV)-1,
+ '(', false, IP_VS_FTP_PASV,
+ &from, &port, cp->af,
&start, &end) != 1)
return 1;
- IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
&from.ip, ntohs(port), &cp->caddr.ip, 0);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
+ data_limit = skb_tail_pointer(skb);
- /*
- * Now update or create an connection entry for it
+ if (!data || data >= data_limit)
+ return 1;
+
+ /* Usually, data address is not specified but
+ * we support different address, so pre-set it.
*/
- {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &from, port,
- &cp->caddr, 0, &p);
- n_cp = ip_vs_conn_out_get(&p);
- }
- if (!n_cp) {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs,
- AF_INET, IPPROTO_TCP, &cp->caddr,
- 0, &cp->vaddr, port, &p);
- /* As above, this is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
- IP_VS_CONN_F_NO_CPORT |
- IP_VS_CONN_F_NFCT,
- cp->dest, skb->mark);
- if (!n_cp)
- return 0;
+ from = cp->daddr;
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING_EPSV,
+ sizeof(SERVER_STRING_EPSV)-1,
+ '(', true, IP_VS_FTP_EPSV,
+ &from, &port, cp->af,
+ &start, &end) != 1)
+ return 1;
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
- }
+ IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+ } else {
+ return 1;
+ }
- /*
- * Replace the old passive address with the new one
- */
+ /* Now update or create a connection entry for it */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs,
+ cp->af, ipvsh->protocol, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /* Replace the old passive address with the new one */
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
from.ip = n_cp->vaddr.ip;
port = n_cp->vport;
snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
((unsigned char *)&from.ip)[3],
ntohs(port) >> 8,
ntohs(port) & 0xFF);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ from = n_cp->vaddr;
+ port = n_cp->vport;
+ /* Only port, client will use VIP for the data connection */
+ snprintf(buf, sizeof(buf), "|||%u|",
+ ntohs(port));
+ } else {
+ *buf = 0;
+ }
+ buf_len = strlen(buf);
- buf_len = strlen(buf);
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct) {
- bool mangled;
-
- /* If mangling fails this function will return 0
- * which will cause the packet to be dropped.
- * Mangling can only fail under memory pressure,
- * hopefully it will succeed on the retransmitted
- * packet.
- */
- mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
- iph->ihl * 4,
- start - data,
- end - start,
- buf, buf_len);
- if (mangled) {
- ip_vs_nfct_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, 0, 0);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- /* csum is updated */
- ret = 1;
- }
- }
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ bool mangled;
- /*
- * Not setting 'diff' is intentional, otherwise the sequence
- * would be adjusted twice.
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
*/
-
- cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
- ip_vs_conn_put(n_cp);
- return ret;
+ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ ipvsh->len,
+ start - data,
+ end - start,
+ buf, buf_len);
+ if (mangled) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ ipvsh->protocol, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
}
- return 1;
+
+ /* Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
}
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
* (outside-to-inside).
*
* The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* In this case, we create a connection entry using the client address and
* port, so that the active ftp data connection from the server can reach
* the client.
+ * Extended format:
+ * "EPSV\r\n" when client requests server address from same family
+ * "EPSV 1\r\n" when client requests IPv4 server address
+ * "EPSV 2\r\n" when client requests IPv6 server address
+ * "EPSV ALL\r\n" - not supported
+ * EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
*/
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_start, *data_limit;
char *start, *end;
union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/* no diff required for incoming packets */
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- /*
- * Detecting whether it is passive
- */
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /* Since there may be OPTIONS in the TCP packet and the HLEN is
- the length of the header in 32-bit multiples, it is accurate
- to calculate data address by th+HLEN*4 */
- data = data_start = (char *)th + (th->doff << 2);
+ data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
while (data <= data_limit - 6) {
- if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+ if (cp->af == AF_INET &&
+ strncasecmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(7, "got PASV at %td of %td\n",
data - data_start,
data_limit - data_start);
- cp->app_data = &ip_vs_ftp_pasv;
+ cp->app_data = (void *) IP_VS_FTP_PASV;
return 1;
}
+
+ /* EPSV or EPSV<space><net-prt> */
+ if (strncasecmp(data, "EPSV", 4) == 0 &&
+ (data[4] == ' ' || data[4] == '\r')) {
+ if (data[4] == ' ') {
+ char proto = data[5];
+
+ if (data > data_limit - 7 || data[6] != '\r')
+ return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && proto == '2') {
+ } else
+#endif
+ if (cp->af == AF_INET && proto == '1') {
+ } else {
+ return 1;
+ }
+ }
+ /* Extended Passive mode on */
+ IP_VS_DBG(7, "got EPSV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = (void *) IP_VS_FTP_EPSV;
+ return 1;
+ }
+
data++;
}
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
* then create a new connection entry for the coming data
* connection.
*/
- if (ip_vs_ftp_get_addrport(data_start, data_limit,
- CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- ' ', '\r', &to.ip, &port,
- &start, &end) != 1)
+ if (cp->af == AF_INET &&
+ ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_PORT,
+ sizeof(CLIENT_STRING_PORT)-1,
+ ' ', false, IP_VS_FTP_PORT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip,
+ ntohs(cp->vport)-1);
+ } else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_EPRT,
+ sizeof(CLIENT_STRING_EPRT)-1,
+ ' ', true, IP_VS_FTP_EPRT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport)-1);
+ } else {
return 1;
-
- IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+ }
/* Passive mode off */
- cp->app_data = NULL;
-
- /*
- * Now update or create a connection entry for it
- */
- IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
- ip_vs_proto_name(iph->protocol),
- &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &to, port, &cp->vaddr,
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &to, port, &cp->vaddr,
htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
- /* This is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+ n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
htons(ntohs(cp->dport)-1),
IP_VS_CONN_F_NFCT, cp->dest,
skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %d\n",
+ pr_info("%s: loaded support on port[%d] = %u\n",
app->name, i, ports[i]);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..b9f375e6dc93 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}
@@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..542c4949937a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
}
@@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS: Maglev Hashing scheduling module
+ *
+ * Authors: Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm assigns each destination a preference list of all
+ * the lookup table positions and populates the table with the
+ * most-preferred positions of the destinations. A destination is then
+ * selected by looking up the hash key of the source IP address in the
+ * lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hashing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup { /* one slot of the MH lookup table */
+ struct ip_vs_dest __rcu *dest; /* real server (cache); a reference is
+				  * held while the pointer is stored here
+				  */
+};
+
+struct ip_vs_mh_dest_setup { /* per-destination scratch data for a table rebuild */
+ unsigned int offset; /* starting offset: hash1 of the dest, mod table size */
+ unsigned int skip; /* step between candidate slots: hash2 mod (size - 1), plus 1 */
+ unsigned int perm; /* next candidate slot (next_offset), starts at offset */
+ int turns; /* (last_weight / gcd) >> rshift, or 1 if that is 0 but weight is not */
+};
+
+/* Available prime numbers for MH table.
+ *
+ * Indexed by IP_VS_MH_TAB_INDEX to obtain the lookup table size. The
+ * table is read-only, so keep it const.
+ */
+static const int primes[] = {251, 509, 1021, 2039, 4093,
+			     8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table.
+ *
+ * CONFIG_IP_VS_MH_TAB_INDEX (12 here unless already defined) selects the
+ * table size: IP_VS_MH_TAB_INDEX = index - 8 picks an entry of primes[],
+ * so the fallback value gives primes[4] = 4093 slots. IP_VS_MH_TAB_BITS
+ * (index / 2) is the bit budget that ip_vs_mh_shift_weight() compares
+ * against the width of weight / gcd.
+ */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX 12
+#endif
+#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state { /* per-service scheduler state, stored in svc->sched_data */
+ struct rcu_head rcu_head; /* used by call_rcu() to free the state */
+ struct ip_vs_mh_lookup *lookup; /* lookup table, IP_VS_MH_TAB_SIZE slots */
+ struct ip_vs_mh_dest_setup *dest_setup; /* only allocated during ip_vs_mh_reassign() */
+ hsiphash_key_t hash1, hash2; /* hash1 keys offset and lookups, hash2 keys skip */
+ int gcd; /* gcd of the positive last_weight values, 0 if there are none */
+ int rshift; /* right shift applied to last_weight / gcd */
+};
+
+/* Load the two hsiphash keys with fixed constants: both words of hash1
+ * get 2654435761 and both words of hash2 get 2654446892.
+ */
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+					hsiphash_key_t *hash2)
+{
+	const unsigned long seed1 = 2654435761UL;
+	const unsigned long seed2 = 2654446892UL;
+
+	hash1->key[0] = seed1;
+	hash1->key[1] = seed1;
+
+	hash2->key[0] = seed2;
+	hash2->key[1] = seed2;
+}
+
+/* Tell whether a real server has to be skipped: either its weight is
+ * not positive or it carries the IP_VS_DEST_F_OVERLOAD flag.
+ */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	if (atomic_read(&dest->weight) <= 0)
+		return true;
+
+	return !!(dest->flags & IP_VS_DEST_F_OVERLOAD);
+}
+
+/* Compute the hash value for an IPVS MH entry.
+ *
+ * The address (all four IPv6 words XOR-folded when af is AF_INET6 and
+ * IPv6 support is built in), the port and the caller supplied offset
+ * are summed in host byte order and the sum is hashed with hsiphash()
+ * under the given key.
+ */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+	__be32 folded;
+	unsigned int input;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	input = offset + ntohs(port) + ntohl(folded);
+	return hsiphash(&input, sizeof(input), key);
+}
+
+/* Empty every slot of the lookup table, dropping the reference held on
+ * each destination that was stored there.
+ */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+	int idx;
+
+	for (idx = 0; idx < IP_VS_MH_TAB_SIZE; idx++) {
+		struct ip_vs_mh_lookup *slot = &s->lookup[idx];
+		struct ip_vs_dest *dest;
+
+		dest = rcu_dereference_protected(slot->dest, 1);
+		if (!dest)
+			continue;
+
+		ip_vs_dest_put(dest);
+		RCU_INIT_POINTER(slot->dest, NULL);
+	}
+}
+
+/* Fill s->dest_setup[] with one entry per destination of the service,
+ * in list order: starting offset, skip, current permutation position
+ * and the number of turns derived from the destination's last_weight.
+ *
+ * Always returns 0.
+ */
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+			      struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_dest_setup *setup;
+	struct ip_vs_dest *dest;
+
+	/* A gcd smaller than 1 means there are no dests or every
+	 * last_weight is zero, so there is nothing to permutate.
+	 */
+	if (s->gcd < 1)
+		return 0;
+
+	setup = s->dest_setup;
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int lw, turns;
+
+		setup->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+						 dest->port, &s->hash1, 0) %
+				IP_VS_MH_TAB_SIZE;
+		setup->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					       dest->port, &s->hash2, 0) %
+			      (IP_VS_MH_TAB_SIZE - 1) + 1;
+		setup->perm = setup->offset;
+
+		/* A non-zero weight always gets at least one turn */
+		lw = atomic_read(&dest->last_weight);
+		turns = (lw / s->gcd) >> s->rshift;
+		setup->turns = turns ? turns : (lw != 0);
+
+		setup++;
+	}
+
+	return 0;
+}
+
+/* Fill the lookup table from the permutations prepared in
+ * s->dest_setup[].
+ *
+ * Destinations are visited in list order, pass after pass. On its turn
+ * a destination claims the next slot of its permutation that is still
+ * free, and it keeps claiming until it has used up its turns for the
+ * pass. Destinations with no turns are skipped. The function stops as
+ * soon as all IP_VS_MH_TAB_SIZE slots are taken.
+ *
+ * Returns 0 on success or -ENOMEM if the temporary bitmap of taken
+ * slots cannot be allocated.
+ */
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	struct list_head *head = &svc->destinations;
+	struct list_head *pos = head;
+	struct ip_vs_mh_dest_setup *ds;
+	unsigned long *taken;
+	int filled = 0;
+	int dt_count = 0;
+	int slot;
+
+	/* A gcd smaller than 1 means there are no dests or every
+	 * last_weight is zero: nothing to populate, just clear the table.
+	 */
+	if (s->gcd < 1) {
+		ip_vs_mh_reset(s);
+		return 0;
+	}
+
+	taken = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!taken)
+		return -ENOMEM;
+
+	while (filled < IP_VS_MH_TAB_SIZE) {
+		/* Each pass restarts from the first destination */
+		if (pos == head)
+			pos = pos->next;
+		ds = &s->dest_setup[0];
+
+		while (pos != head) {
+			/* Servers added with zero weight get no slot */
+			if (ds->turns >= 1) {
+				struct ip_vs_dest *old_dest, *new_dest;
+
+				/* Walk the permutation (add skip, mod
+				 * IP_VS_MH_TAB_SIZE) up to a free slot
+				 */
+				for (slot = ds->perm; test_bit(slot, taken);
+				     slot = ds->perm) {
+					ds->perm += ds->skip;
+					if (ds->perm >= IP_VS_MH_TAB_SIZE)
+						ds->perm -= IP_VS_MH_TAB_SIZE;
+				}
+				__set_bit(slot, taken);
+
+				old_dest = rcu_dereference_protected(s->lookup[slot].dest,
+								     1);
+				new_dest = list_entry(pos, struct ip_vs_dest,
+						      n_list);
+				if (old_dest != new_dest) {
+					if (old_dest)
+						ip_vs_dest_put(old_dest);
+					ip_vs_dest_hold(new_dest);
+					RCU_INIT_POINTER(s->lookup[slot].dest,
+							 new_dest);
+				}
+
+				if (++filled == IP_VS_MH_TAB_SIZE)
+					goto free_bitmap;
+
+				/* Same destination again until its turns
+				 * for this pass are used up
+				 */
+				if (++dt_count < ds->turns)
+					continue;
+				dt_count = 0;
+			}
+
+			pos = pos->next;
+			ds++;
+		}
+	}
+
+free_bitmap:
+	kfree(taken);
+	return 0;
+}
+
+/* Look up the destination for the given address and port.
+ *
+ * Returns the destination stored in the hashed slot, or NULL when the
+ * slot is empty or its destination is unavailable.
+ */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
+{
+	struct ip_vs_dest *dest;
+	unsigned int slot;
+
+	slot = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) %
+	       IP_VS_MH_TAB_SIZE;
+	dest = rcu_dereference(s->lookup[slot].dest);
+	if (dest && !is_unavailable(dest))
+		return dest;
+
+	return NULL;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable.
+ *
+ * The slot selected by the plain hash is tried first. If its destination
+ * is unavailable, up to IP_VS_MH_TAB_SIZE further slots are probed, each
+ * chosen by re-hashing with an offset derived from the initial slot.
+ *
+ * Returns the first available destination found, or NULL when an empty
+ * slot is hit or every probe finds an unavailable destination.
+ */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset, roffset;
+	unsigned int hash, ihash;
+	struct ip_vs_dest *dest;
+
+	/* First try the dest it's supposed to go to */
+	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+	dest = rcu_dereference(s->lookup[ihash].dest);
+	if (!dest)
+		return NULL;
+	if (!is_unavailable(dest))
+		return dest;
+
+	/* Messages are newline terminated like all others in this file */
+	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting\n",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+	/* If the original dest is unavailable, loop around the table
+	 * starting from ihash to find a new dest
+	 */
+	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+					roffset) % IP_VS_MH_TAB_SIZE;
+		dest = rcu_dereference(s->lookup[hash].dest);
+		if (!dest)
+			break;
+		if (!is_unavailable(dest))
+			return dest;
+		IP_VS_DBG_BUF(6,
+			      "MH: selected unavailable server %s:%u (offset %u), reselecting\n",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port), roffset);
+	}
+
+	return NULL;
+}
+
+/* Rebuild the lookup table from the service's current destinations.
+ *
+ * The per-destination setup array only lives for the duration of this
+ * call. Returns -EINVAL if the service has more destinations than
+ * table slots, -ENOMEM on allocation failure, otherwise the result of
+ * ip_vs_mh_populate().
+ */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	int err;
+
+	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+		return -EINVAL;
+
+	if (svc->num_dests >= 1) {
+		s->dest_setup = kcalloc(svc->num_dests,
+					sizeof(struct ip_vs_mh_dest_setup),
+					GFP_KERNEL);
+		if (!s->dest_setup)
+			return -ENOMEM;
+	}
+
+	ip_vs_mh_permutate(s, svc);
+
+	err = ip_vs_mh_populate(s, svc);
+	if (err >= 0)
+		IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port));
+
+	/* The setup array is scratch data, drop it on every path */
+	if (svc->num_dests >= 1) {
+		kfree(s->dest_setup);
+		s->dest_setup = NULL;
+	}
+	return err;
+}
+
+/* Return the greatest common divisor of all positive last_weight values
+ * of the service's destinations, or 0 if no destination has a positive
+ * last_weight.
+ */
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int result = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->last_weight);
+
+		if (w <= 0)
+			continue;
+
+		/* The first positive weight seeds the result */
+		result = (result > 0) ? gcd(w, result) : w;
+	}
+
+	return result;
+}
+
+/* To keep huge weights from dominating the MH table, compute how many
+ * bits weight / gcd has to be shifted right so that the largest value
+ * fits into IP_VS_MH_TAB_BITS bits. Never returns a negative shift.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+	struct ip_vs_dest *dest;
+	int max_weight = 0;
+	int shift;
+
+	/* A gcd smaller than 1 means there are no dests or every
+	 * last_weight is zero, so no shift is needed.
+	 */
+	if (gcd < 1)
+		return 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->last_weight);
+
+		if (w > max_weight)
+			max_weight = w;
+	}
+
+	/* gcd is positive here, hence so is the maximum weight.
+	 * shift = occupied bits of weight/gcd - MH highest bits
+	 */
+	shift = fls(max_weight / gcd) - IP_VS_MH_TAB_BITS;
+	if (shift < 0)
+		shift = 0;
+
+	return shift;
+}
+
+/* Free the scheduler state that embeds the given rcu_head, together
+ * with its lookup table. Used as call_rcu() callback and called
+ * directly on the init error path.
+ */
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+	struct ip_vs_mh_state *state = container_of(head,
+						    struct ip_vs_mh_state,
+						    rcu_head);
+
+	kfree(state->lookup);
+	kfree(state);
+}
+
+/* Set up the MH scheduler for a service: allocate the state and its
+ * lookup table, derive gcd and shift from the current weights, and fill
+ * the table.
+ *
+ * Returns 0 on success with the state attached to svc->sched_data, or a
+ * negative errno with everything released.
+ */
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *s;
+	int err;
+
+	/* Allocate the MH table for this service */
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+			    GFP_KERNEL);
+	if (!s->lookup) {
+		kfree(s);
+		return -ENOMEM;
+	}
+
+	generate_hash_secret(&s->hash1, &s->hash2);
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+	IP_VS_DBG(6,
+		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+	/* Assign the lookup table with current dests */
+	err = ip_vs_mh_reassign(s, svc);
+	if (err >= 0) {
+		/* No more failures, attach state */
+		svc->sched_data = s;
+		return 0;
+	}
+
+	/* Drop any dest references already taken, then free the state */
+	ip_vs_mh_reset(s);
+	ip_vs_mh_state_free(&s->rcu_head);
+	return err;
+}
+
+/* Tear down the MH scheduler for a service: release the destination
+ * references held by the lookup table and hand the state to call_rcu()
+ * for freeing.
+ */
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *state = svc->sched_data;
+
+	/* The lookup entries must be cleaned up here */
+	ip_vs_mh_reset(state);
+
+	call_rcu(&state->rcu_head, ip_vs_mh_state_free);
+	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+/* Common handler for adding, deleting and updating a destination:
+ * recompute gcd and shift from the current weights and rebuild the
+ * lookup table. The dest argument itself is not used.
+ *
+ * Returns the result of ip_vs_mh_reassign().
+ */
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
+{
+	struct ip_vs_mh_state *state = svc->sched_data;
+	int g = ip_vs_mh_gcd_weight(svc);
+
+	state->gcd = g;
+	state->rshift = ip_vs_mh_shift_weight(svc, g);
+
+	/* Assign the lookup table with the updated service */
+	return ip_vs_mh_reassign(state, svc);
+}
+
+/* Fetch the port to hash on from the transport header.
+ *
+ * Only TCP, UDP and SCTP are handled. Their two ports sit at the same
+ * place at the start of the transport header, so they are read without
+ * looking further into the header. The first port is used unless the
+ * IP header is marked inverse, in which case the second one is.
+ * Returns 0 for other protocols or if the ports cannot be read.
+ */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 _ports[2], *ports;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP &&
+	    iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	ports = skb_header_pointer(skb, iph->len, sizeof(_ports), &_ports);
+	if (unlikely(!ports))
+		return 0;
+
+	return likely(!ip_vs_iph_inverse(iph)) ? ports[0] : ports[1];
+}
+
+/* Maglev Hashing scheduling.
+ *
+ * Hashes the source address (the destination address for inverse
+ * headers), plus the port when IP_VS_SVC_F_SCHED_MH_PORT is set, and
+ * picks the destination from the lookup table. The fallback lookup is
+ * used when IP_VS_SVC_F_SCHED_MH_FALLBACK is set. Returns NULL after
+ * reporting a scheduler error if no destination is available.
+ */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	const union nf_inet_addr *hash_addr;
+	struct ip_vs_mh_state *s;
+	struct ip_vs_dest *dest;
+	__be16 port = 0;
+
+	if (ip_vs_iph_inverse(iph))
+		hash_addr = &iph->daddr;
+	else
+		hash_addr = &iph->saddr;
+
+	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+		port = ip_vs_mh_get_port(skb, iph);
+
+	s = (struct ip_vs_mh_state *)svc->sched_data;
+
+	dest = (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) ?
+		ip_vs_mh_get_fallback(svc, s, hash_addr, port) :
+		ip_vs_mh_get(svc, s, hash_addr, port);
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+		      IP_VS_DBG_ADDR(svc->af, hash_addr),
+		      ntohs(port),
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+/* IPVS MH Scheduler structure, registered under the name "mh".
+ *
+ * Adding, deleting and updating a destination all go through
+ * ip_vs_mh_dest_changed(), which rebuilds the lookup table.
+ */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+ .name = "mh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+ .init_service = ip_vs_mh_init_svc,
+ .done_service = ip_vs_mh_done_svc,
+ .add_dest = ip_vs_mh_dest_changed,
+ .del_dest = ip_vs_mh_dest_changed,
+ .upd_dest = ip_vs_mh_dest_changed,
+ .schedule = ip_vs_mh_schedule,
+};
+
+/* Module init: register the MH scheduler with IPVS and hand back the
+ * result of the registration.
+ */
+static int __init ip_vs_mh_init(void)
+{
+	int err;
+
+	err = register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+	return err;
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+ rcu_barrier(); /* wait for call_rcu() callbacks queued by
+		 * ip_vs_mh_done_svc() before the module code goes away
+		 */
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
#include <net/netfilter/nf_conntrack_zones.h>
-#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
- &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE "%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \
+ ntohs((T)->src.u.all), \
+ IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \
+ ntohs((T)->dst.u.all), \
(T)->dst.protonum
-#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
- &((C)->vaddr.ip), ntohs((C)->vport), \
- &((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \
+ ntohs((C)->cport), \
+ IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \
+ ntohs((C)->vport), \
+ IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \
+ ntohs((C)->dport), \
(C)->protocol, (C)->state
void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
new_tuple.dst.protonum != IPPROTO_ICMPV6)
new_tuple.dst.u.tcp.port = cp->vport;
}
- IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
- "ctinfo=%d, old reply=" FMT_TUPLE
- ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
- __func__, ct, ct->status, ctinfo,
- ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
- ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&new_tuple));
nf_conntrack_alter_reply(ct, &new_tuple);
+ IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+ __func__, ct, ARG_CONN(cp));
}
int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct ip_vs_conn_param p;
struct net *net = nf_ct_net(ct);
- if (exp->tuple.src.l3num != PF_INET)
- return;
-
/*
* We assume that no NF locks are held before this callback.
* ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_out_get(&p);
if (cp) {
/* Change reply CLIENT->RS to CLIENT->VS */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.dst.u3 = cp->vaddr;
new_reply.dst.u.tcp.port = cp->vport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
- ", inout cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_in_get(&p);
if (cp) {
/* Change reply VS->CLIENT to RS->CLIENT */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.src.u3 = cp->daddr;
new_reply.src.u.tcp.port = cp->dport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", outin cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
- " - unknown expect\n",
- __func__, ct, ct->status, ARG_TUPLE(orig));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
return;
alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
exp->expectfn = ip_vs_nfct_expect_callback;
- IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&exp->tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
nf_ct_expect_related(exp);
nf_ct_expect_put(exp);
}
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
tuple.dst.u3 = cp->vaddr;
tuple.dst.u.all = cp->vport;
- IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
- " for conn " FMT_CONN "\n",
- __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+ __func__, ARG_CONN(cp));
h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_kill(ct)) {
- IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
} else {
- IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
}
nf_ct_put(ct);
} else {
- IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
- __func__, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
}
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_out(cp, skb);
+ ret = ip_vs_app_pkt_out(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_in(cp, skb);
+ ret = ip_vs_app_pkt_in(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
return tcp_state_active_table[state];
}
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
/*
* Call application helper if needed
*/
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+ IP_VS_SH_TAB_BITS)) &
IP_VS_SH_TAB_MASK;
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..ba0a0fd045c8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
/* check and decrement ttl */
if (ipv6_hdr(skb)->hop_limit <= 1) {
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
+
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 153e690e2893..3b5059a8dcdd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -79,7 +79,7 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
return memcmp(a, b, klen * sizeof(u32));
}
-static bool add_hlist(struct hlist_head *head,
+bool nf_conncount_add(struct hlist_head *head,
const struct nf_conntrack_tuple *tuple)
{
struct nf_conncount_tuple *conn;
@@ -91,12 +91,12 @@ static bool add_hlist(struct hlist_head *head,
hlist_add_head(&conn->node, head);
return true;
}
+EXPORT_SYMBOL_GPL(nf_conncount_add);
-static unsigned int check_hlist(struct net *net,
- struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone,
- bool *addit)
+unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone,
+ bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn;
@@ -141,6 +141,7 @@ static unsigned int check_hlist(struct net *net,
return length;
}
+EXPORT_SYMBOL_GPL(nf_conncount_lookup);
static void tree_nodes_free(struct rb_root *root,
struct nf_conncount_rb *gc_nodes[],
@@ -187,13 +188,15 @@ count_tree(struct net *net, struct rb_root *root,
} else {
/* same source network -> be counted! */
unsigned int count;
- count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+ count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
+ zone, &addit);
tree_nodes_free(root, gc_nodes, gc_count);
if (!addit)
return count;
- if (!add_hlist(&rbconn->hhead, tuple))
+ if (!nf_conncount_add(&rbconn->hhead, tuple))
return 0; /* hotdrop */
return count + 1;
@@ -203,7 +206,7 @@ count_tree(struct net *net, struct rb_root *root,
continue;
/* only used for GC on hhead, retval and 'addit' ignored */
- check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+ nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
if (hlist_empty(&rbconn->hhead))
gc_nodes[gc_count++] = rbconn;
}
@@ -303,11 +306,19 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
}
EXPORT_SYMBOL_GPL(nf_conncount_init);
-static void destroy_tree(struct rb_root *r)
+void nf_conncount_cache_free(struct hlist_head *hhead)
{
struct nf_conncount_tuple *conn;
- struct nf_conncount_rb *rbconn;
struct hlist_node *n;
+
+ hlist_for_each_entry_safe(conn, n, hhead, node)
+ kmem_cache_free(conncount_conn_cachep, conn);
+}
+EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
+
+static void destroy_tree(struct rb_root *r)
+{
+ struct nf_conncount_rb *rbconn;
struct rb_node *node;
while ((node = rb_first(r)) != NULL) {
@@ -315,8 +326,7 @@ static void destroy_tree(struct rb_root *r)
rb_erase(node, r);
- hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
- kmem_cache_free(conncount_conn_cachep, conn);
+ nf_conncount_cache_free(&rbconn->hhead);
kmem_cache_free(conncount_rb_cachep, rbconn);
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 41ff04ee2554..3465da2a98bd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -58,11 +58,6 @@
#include "nf_internals.h"
-int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
- enum nf_nat_manip_type manip,
- const struct nlattr *attr) __read_mostly;
-EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
-
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);
@@ -186,6 +181,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;
@@ -1611,6 +1607,82 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
+static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_hook *nat_hook;
+ unsigned int dataoff, status;
+ struct nf_conn *ct;
+ u16 l3num;
+ u8 l4num;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || nf_ct_is_confirmed(ct))
+ return 0;
+
+ l3num = nf_ct_l3num(ct);
+ l3proto = nf_ct_l3proto_find_get(l3num);
+
+ if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+ &l4num) <= 0)
+ return -1;
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+ l4num, net, &tuple, l3proto, l4proto))
+ return -1;
+
+ if (ct->status & IPS_SRC_NAT) {
+ memcpy(tuple.src.u3.all,
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
+ sizeof(tuple.src.u3.all));
+ tuple.src.u.all =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
+ }
+
+ if (ct->status & IPS_DST_NAT) {
+ memcpy(tuple.dst.u3.all,
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
+ sizeof(tuple.dst.u3.all));
+ tuple.dst.u.all =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
+ }
+
+ h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
+ if (!h)
+ return 0;
+
+ /* Store status bits of the conntrack that is clashing to re-do NAT
+ * mangling according to what it has been done already to this packet.
+ */
+ status = ct->status;
+
+ nf_ct_put(ct);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ nf_ct_set(skb, ct, ctinfo);
+
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (!nat_hook)
+ return 0;
+
+ if (status & IPS_SRC_NAT &&
+ nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
+ IP_CT_DIR_ORIGINAL) == NF_DROP)
+ return -1;
+
+ if (status & IPS_DST_NAT &&
+ nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
+ IP_CT_DIR_ORIGINAL) == NF_DROP)
+ return -1;
+
+ return 0;
+}
+
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -1812,8 +1884,7 @@ void nf_conntrack_cleanup_start(void)
void nf_conntrack_cleanup_end(void)
{
- RCU_INIT_POINTER(nf_ct_destroy, NULL);
-
+ RCU_INIT_POINTER(nf_ct_hook, NULL);
cancel_delayed_work_sync(&conntrack_gc_work.dwork);
nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
@@ -2130,11 +2201,16 @@ err_cachep:
return ret;
}
+static struct nf_ct_hook nf_conntrack_hook = {
+ .update = nf_conntrack_update,
+ .destroy = destroy_conntrack,
+};
+
void nf_conntrack_init_end(void)
{
/* For use by REJECT target */
RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
- RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+ RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
/*
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index f0e9a7511e1a..a11c304fb771 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
.timeout = 5 * 60,
};
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_ftp_fini(void)
+static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
kfree(ftp_buffer);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 5523acce9d69..4099f4d79bae 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,
static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
static struct nf_conntrack_expect_policy irc_exp_policy;
-static void nf_conntrack_irc_fini(void);
-
static int __init nf_conntrack_irc_init(void)
{
int i, ret;
@@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void)
return 0;
}
-/* This function is intentionally _NOT_ defined as __exit, because
- * it is needed by the init function */
-static void nf_conntrack_irc_fini(void)
+static void __exit nf_conntrack_irc_fini(void)
{
nf_conntrack_helpers_unregister(irc, ports_c);
kfree(irc_buffer);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4c1d0c5bc268..39327a42879f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1431,11 +1431,11 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
{
- typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+ struct nf_nat_hook *nat_hook;
int err;
- parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
- if (!parse_nat_setup) {
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (!nat_hook) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
@@ -1446,13 +1446,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
nfnl_lock(NFNL_SUBSYS_CTNETLINK);
rcu_read_lock();
- if (nfnetlink_parse_nat_setup_hook)
+ if (nat_hook->parse_nat_setup)
return -EAGAIN;
#endif
return -EOPNOTSUPP;
}
- err = parse_nat_setup(ct, manip, attr);
+ err = nat_hook->parse_nat_setup(ct, manip, attr);
if (err == -EAGAIN) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
@@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return skb->len;
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index ae457f39d5ce..5072ff96ab33 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
.timeout = 5 * 60,
};
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_sane_fini(void)
+static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
kfree(sane_buffer);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 908e51e2dc2b..c8d2b6688a2a 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1617,7 +1617,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
},
};
-static void nf_conntrack_sip_fini(void)
+static void __exit nf_conntrack_sip_fini(void)
{
nf_conntrack_helpers_unregister(sip, ports_c * 4);
}
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 0ec6779fd5d9..548b673b3625 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
.timeout = 5 * 60,
};
-static void nf_conntrack_tftp_fini(void)
+static void __exit nf_conntrack_tftp_fini(void)
{
nf_conntrack_helpers_unregister(tftp, ports_c * 2);
}
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c
index ec410cae9307..eb0d1658ac05 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -4,6 +4,8 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
@@ -16,6 +18,43 @@ struct flow_offload_entry {
struct rcu_head rcu_head;
};
+static DEFINE_MUTEX(flowtable_lock);
+static LIST_HEAD(flowtables);
+
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+ struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ ft->dir = dir;
+
+ switch (ctt->src.l3num) {
+ case NFPROTO_IPV4:
+ ft->src_v4 = ctt->src.u3.in;
+ ft->dst_v4 = ctt->dst.u3.in;
+ ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
+ break;
+ case NFPROTO_IPV6:
+ ft->src_v6 = ctt->src.u3.in6;
+ ft->dst_v6 = ctt->dst.u3.in6;
+ ft->mtu = ip6_dst_mtu_forward(dst);
+ break;
+ }
+
+ ft->l3proto = ctt->src.l3num;
+ ft->l4proto = ctt->dst.protonum;
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+
+ ft->iifidx = route->tuple[dir].ifindex;
+ ft->oifidx = route->tuple[!dir].ifindex;
+ ft->dst_cache = dst;
+}
+
struct flow_offload *
flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
{
@@ -40,69 +79,12 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
entry->ct = ct;
- switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
- case NFPROTO_IPV4:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
- break;
- case NFPROTO_IPV6:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
- break;
- }
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
- FLOW_OFFLOAD_DIR_ORIGINAL;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
- FLOW_OFFLOAD_DIR_REPLY;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
flow->flags |= FLOW_OFFLOAD_SNAT;
- else if (ct->status & IPS_DST_NAT)
+ if (ct->status & IPS_DST_NAT)
flow->flags |= FLOW_OFFLOAD_DNAT;
return flow;
@@ -118,6 +100,43 @@ err_ct_refcnt:
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
+static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+{
+ tcp->state = TCP_CONNTRACK_ESTABLISHED;
+ tcp->seen[0].td_maxwin = 0;
+ tcp->seen[1].td_maxwin = 0;
+}
+
+static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+{
+ const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ unsigned int *timeouts;
+ unsigned int timeout;
+ int l4num;
+
+ l4num = nf_ct_protonum(ct);
+ if (l4num == IPPROTO_TCP)
+ flow_offload_fixup_tcp(&ct->proto.tcp);
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
+ if (!l4proto)
+ return;
+
+ timeouts = l4proto->get_timeouts(net);
+ if (!timeouts)
+ return;
+
+ if (l4num == IPPROTO_TCP)
+ timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+ else if (l4num == IPPROTO_UDP)
+ timeout = timeouts[UDP_CT_REPLIED];
+ else
+ return;
+
+ ct->timeout = nfct_time_stamp + timeout;
+}
+
void flow_offload_free(struct flow_offload *flow)
{
struct flow_offload_entry *e;
@@ -125,17 +144,46 @@ void flow_offload_free(struct flow_offload *flow)
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
e = container_of(flow, struct flow_offload_entry, flow);
- nf_ct_delete(e->ct, 0, 0);
+ if (flow->flags & FLOW_OFFLOAD_DYING)
+ nf_ct_delete(e->ct, 0, 0);
nf_ct_put(e->ct);
kfree_rcu(e, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
-void flow_offload_dead(struct flow_offload *flow)
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
{
- flow->flags |= FLOW_OFFLOAD_DYING;
+ const struct flow_offload_tuple *tuple = arg->key;
+ const struct flow_offload_tuple_rhash *x = ptr;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ return 1;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+static const struct rhashtable_params nf_flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
@@ -143,10 +191,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -154,22 +202,51 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
+ struct flow_offload_entry *e;
+
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
flow_offload_free(flow);
}
+void flow_offload_teardown(struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ flow_offload_fixup_ct_state(e->ct);
+}
+EXPORT_SYMBOL_GPL(flow_offload_teardown);
+
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload_tuple *tuple)
{
- return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
- *flow_table->type->params);
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload *flow;
+ int dir;
+
+ tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+ nf_flow_offload_rhash_params);
+ if (!tuplehash)
+ return NULL;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+ return NULL;
+
+ return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
@@ -216,11 +293,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
-static inline bool nf_flow_is_dying(const struct flow_offload *flow)
-{
- return flow->flags & FLOW_OFFLOAD_DYING;
-}
-
static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -248,7 +320,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
if (nf_flow_has_expired(flow) ||
- nf_flow_is_dying(flow))
+ (flow->flags & (FLOW_OFFLOAD_DYING |
+ FLOW_OFFLOAD_TEARDOWN)))
flow_offload_del(flow_table, flow);
}
out:
@@ -258,7 +331,7 @@ out:
return 1;
}
-void nf_flow_offload_work_gc(struct work_struct *work)
+static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
@@ -266,42 +339,6 @@ void nf_flow_offload_work_gc(struct work_struct *work)
nf_flow_offload_gc_step(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple *tuple = data;
-
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple_rhash *tuplehash = data;
-
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct flow_offload_tuple *tuple = arg->key;
- const struct flow_offload_tuple_rhash *x = ptr;
-
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
- return 1;
-
- return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
- .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
- .hashfn = flow_offload_hash,
- .obj_hashfn = flow_offload_hash_obj,
- .obj_cmpfn = flow_offload_hash_cmp,
- .automatic_shrinking = true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
@@ -419,33 +456,69 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+int nf_flow_table_init(struct nf_flowtable *flowtable)
+{
+ int err;
+
+ INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+
+ err = rhashtable_init(&flowtable->rhashtable,
+ &nf_flow_offload_rhash_params);
+ if (err < 0)
+ return err;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &flowtable->gc_work, HZ);
+
+ mutex_lock(&flowtable_lock);
+ list_add(&flowtable->list, &flowtables);
+ mutex_unlock(&flowtable_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_init);
+
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
- if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+ if (!dev) {
+ flow_offload_teardown(flow);
return;
+ }
- flow_offload_dead(flow);
+ if (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
+ flow->tuplehash[1].tuple.iifidx == dev->ifindex)
+ flow_offload_dead(flow);
}
static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- void *data)
+ struct net_device *dev)
{
- nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+ nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
}
void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
{
- nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+ struct nf_flowtable *flowtable;
+
+ mutex_lock(&flowtable_lock);
+ list_for_each_entry(flowtable, &flowtables, list)
+ nf_flow_table_iterate_cleanup(flowtable, dev);
+ mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
+ mutex_lock(&flowtable_lock);
+ list_del(&flow_table->list);
+ mutex_unlock(&flowtable_lock);
+ cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
+ rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 375a1881d93d..99771aa7e7ea 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
new file mode 100644
index 000000000000..15ed91309992
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,489 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+/* Decide whether a packet may keep using the offloaded fast path.
+ *
+ * Non-TCP packets always pass (return 0). For TCP, the header at @thoff is
+ * pulled into the linear area; if that fails, or if the segment carries FIN
+ * or RST, -1 is returned. In the FIN/RST case the flow is additionally handed
+ * to flow_offload_teardown().
+ */
+static int nf_flow_state_check(struct flow_offload *flow, int proto,
+ struct sk_buff *skb, unsigned int thoff)
+{
+ struct tcphdr *tcph;
+
+ if (proto != IPPROTO_TCP)
+ return 0;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ if (unlikely(tcph->fin || tcph->rst)) {
+ flow_offload_teardown(flow);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Adjust the TCP checksum for an IPv4 address change from @addr to @new_addr.
+ *
+ * The TCP header at @thoff must be pullable and writable; otherwise -1 is
+ * returned and nothing is modified. Returns 0 once the checksum is updated.
+ */
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+ return 0;
+}
+
+/* Adjust the UDP checksum for an IPv4 address change from @addr to @new_addr.
+ *
+ * Returns -1 if the UDP header at @thoff cannot be pulled or made writable,
+ * 0 otherwise.
+ */
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ /* A zero checksum field is left alone unless the skb is CHECKSUM_PARTIAL. */
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ /* Never leave a computed checksum of zero in the header. */
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+/* Fix up the layer 4 checksum after an IPv4 address rewrite.
+ *
+ * Dispatches on iph->protocol to the TCP or UDP helper; any other protocol
+ * is left untouched. Returns 0 on success and -1 when the helper fails.
+ *
+ * The failure value must be negative: the result is propagated by
+ * nf_flow_snat_ip()/nf_flow_dnat_ip() and tested with "< 0" in
+ * nf_flow_nat_ip(). Returning the verdict NF_DROP (numerically 0) here
+ * would be read as success and the packet would be forwarded with a
+ * stale checksum.
+ */
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+ return -1;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+ return -1;
+ break;
+ }
+
+ return 0;
+}
+
+/* Apply the source-NAT address mapping of @flow to the IPv4 header @iph.
+ *
+ * In the original direction the source address is replaced with the reply
+ * tuple's destination; in the reply direction the destination address is
+ * replaced with the original tuple's source. The IP header checksum is
+ * updated in place and the layer 4 checksum fix-up is delegated to
+ * nf_flow_nat_ip_l4proto(), whose result is returned. An unknown @dir
+ * yields -1.
+ */
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+/* Apply the destination-NAT address mapping of @flow to the IPv4 header @iph.
+ *
+ * In the original direction the destination address is replaced with the
+ * reply tuple's source; in the reply direction the source address is
+ * replaced with the original tuple's destination. The IP header checksum is
+ * updated in place and the layer 4 checksum fix-up is delegated to
+ * nf_flow_nat_ip_l4proto(), whose result is returned. An unknown @dir
+ * yields -1.
+ */
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+/* Apply the source and/or destination NAT recorded in @flow to an IPv4 packet.
+ *
+ * For each NAT kind flagged in flow->flags the ports are rewritten first and
+ * the addresses second. Returns 0 on success, -1 as soon as one step fails.
+ */
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ /* The address helpers end up in pskb_may_pull()/skb_try_make_writable(),
+ * which may move the packet header; the port helpers are defined
+ * elsewhere and are assumed to do the same. Fetch the IP header again
+ * after each of them instead of reusing a pointer taken earlier.
+ */
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+ return -1;
+
+ iph = ip_hdr(skb);
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+/* True when the IPv4 header length @thoff (in bytes) differs from the size of
+ * a bare struct iphdr, i.e. the header carries options.
+ */
+static bool ip_has_options(unsigned int thoff)
+{
+ return thoff != sizeof(struct iphdr);
+}
+
+/* Build the flow table lookup key for an IPv4 packet received on @dev.
+ *
+ * Only unfragmented TCP or UDP packets without IP options are eligible.
+ * On success @tuple is filled with addresses, ports, protocol numbers and
+ * the ingress ifindex and 0 is returned. -1 means the packet is not
+ * eligible or its headers could not be pulled; @tuple is untouched then.
+ */
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return -1;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(ip_has_options(thoff)))
+ return -1;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return -1;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ /* pskb_may_pull() may have reallocated the header; the pointer taken
+ * before the call must not be dereferenced any more.
+ */
+ iph = ip_hdr(skb);
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Based on ip_exceeds_mtu().
+ *
+ * Returns false when skb->len fits within @mtu, or when the skb is GSO and
+ * skb_gso_validate_network_len() accepts it for @mtu; true otherwise.
+ */
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+ return false;
+
+ return true;
+}
+
+/* Netfilter hook implementing the IPv4 flow table fast path.
+ *
+ * @priv is the struct nf_flowtable to look the packet up in. Packets that
+ * are not IPv4, do not produce a lookup tuple, have no matching flow, have
+ * no resolvable output device, exceed the cached MTU with DF set, or fail
+ * the TCP state check are returned to the normal path with NF_ACCEPT.
+ * Packets that cannot be made writable or fail NAT are dropped (NF_DROP).
+ * Everything else is NATed if required, has its TTL decremented, and is
+ * transmitted through neigh_xmit(); the hook then returns NF_STOLEN.
+ */
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct rtable *rt;
+ unsigned int thoff;
+ struct iphdr *iph;
+ __be32 nexthop;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ /* The matched tuple is embedded in the flow; recover the flow from it. */
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+ /* Oversized packets with DF set are left to the normal path. */
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+ (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ thoff = ip_hdr(skb)->ihl * 4;
+ if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+ return NF_ACCEPT;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ return NF_DROP;
+
+ /* Refresh the flow's expiry, then re-read the header after NAT. */
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ iph = ip_hdr(skb);
+ /* NOTE(review): no TTL > 1 check is visible before this decrement -
+ * confirm expired packets are filtered elsewhere.
+ */
+ ip_decrease_ttl(iph);
+
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+/* Adjust the TCP checksum for an IPv6 address change from @addr to @new_addr.
+ *
+ * Returns -1 if the TCP header at @thoff cannot be pulled or made writable,
+ * 0 once the checksum has been updated.
+ */
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+
+ return 0;
+}
+
+/* Adjust the UDP checksum for an IPv6 address change from @addr to @new_addr.
+ *
+ * Returns -1 if the UDP header at @thoff cannot be pulled or made writable,
+ * 0 otherwise.
+ */
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ /* A zero checksum field is left alone unless the skb is CHECKSUM_PARTIAL. */
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+ /* Never leave a computed checksum of zero in the header. */
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+/* Fix up the layer 4 checksum after an IPv6 address rewrite.
+ *
+ * Dispatches on ip6h->nexthdr to the TCP or UDP helper; any other value is
+ * left untouched. Returns 0 on success and -1 when the helper fails.
+ *
+ * The failure value must be negative: the result is propagated by
+ * nf_flow_snat_ipv6()/nf_flow_dnat_ipv6() and tested with "< 0" in
+ * nf_flow_nat_ipv6(). Returning the verdict NF_DROP (numerically 0) here
+ * would be read as success and the packet would be forwarded with a
+ * stale checksum.
+ */
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ switch (ip6h->nexthdr) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
+ return -1;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
+ return -1;
+ break;
+ }
+
+ return 0;
+}
+
+/* Apply the source-NAT address mapping of @flow to the IPv6 header @ip6h.
+ *
+ * In the original direction the source address is replaced with the reply
+ * tuple's destination; in the reply direction the destination address is
+ * replaced with the original tuple's source. The layer 4 checksum fix-up is
+ * delegated to nf_flow_nat_ipv6_l4proto(), whose result is returned. An
+ * unknown @dir yields -1.
+ */
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+/* Apply the destination-NAT address mapping of @flow to the IPv6 header @ip6h.
+ *
+ * In the original direction the destination address is replaced with the
+ * reply tuple's source; in the reply direction the source address is
+ * replaced with the original tuple's destination. The layer 4 checksum
+ * fix-up is delegated to nf_flow_nat_ipv6_l4proto(), whose result is
+ * returned. An unknown @dir yields -1.
+ */
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+/* Apply the source and/or destination NAT recorded in @flow to an IPv6 packet.
+ *
+ * For each NAT kind flagged in flow->flags the ports are rewritten first and
+ * the addresses second. The transport header is taken to start right after
+ * the fixed IPv6 header. Returns 0 on success, -1 as soon as one step fails.
+ */
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ unsigned int thoff = sizeof(*ip6h);
+
+ /* The address helpers end up in pskb_may_pull()/skb_try_make_writable(),
+ * which may move the packet header; the port helpers are defined
+ * elsewhere and are assumed to do the same. Fetch the IPv6 header again
+ * after each of them instead of reusing a pointer taken earlier.
+ */
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+ return -1;
+
+ ip6h = ipv6_hdr(skb);
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+/* Build the flow table lookup key for an IPv6 packet received on @dev.
+ *
+ * Only packets whose next header is directly TCP or UDP are eligible.
+ * On success @tuple is filled with addresses, ports, protocol numbers and
+ * the ingress ifindex and 0 is returned. -1 means the packet is not
+ * eligible or its headers could not be pulled; @tuple is untouched then.
+ */
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+
+ if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ return -1;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_TCP &&
+ ip6h->nexthdr != IPPROTO_UDP)
+ return -1;
+
+ thoff = sizeof(*ip6h);
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ /* pskb_may_pull() may have reallocated the header; the pointer taken
+ * before the call must not be dereferenced any more.
+ */
+ ip6h = ipv6_hdr(skb);
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = ip6h->nexthdr;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Netfilter hook implementing the IPv6 flow table fast path.
+ *
+ * @priv is the struct nf_flowtable to look the packet up in. Packets that
+ * are not IPv6, do not produce a lookup tuple, have no matching flow, have
+ * no resolvable output device, exceed the cached MTU, or fail the TCP state
+ * check are returned to the normal path with NF_ACCEPT. Packets that cannot
+ * be made writable or fail NAT are dropped (NF_DROP). Everything else is
+ * NATed if required, has its hop limit decremented, and is transmitted
+ * through neigh_xmit(); the hook then returns NF_STOLEN.
+ */
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct in6_addr *nexthop;
+ struct ipv6hdr *ip6h;
+ struct rt6_info *rt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ /* The matched tuple is embedded in the flow; recover the flow from it. */
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ return NF_ACCEPT;
+
+ if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
+ sizeof(*ip6h)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ /* Refresh the flow's expiry, then re-read the header after NAT. */
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ ip6h = ipv6_hdr(skb);
+ /* NOTE(review): no hop_limit > 1 check is visible before this
+ * decrement - confirm expired packets are filtered elsewhere.
+ */
+ ip6h->hop_limit--;
+
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 18f6d7ae995b..e15779fd58e3 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -15,4 +15,9 @@ void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
int __init netfilter_log_init(void);
+/* core.c */
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg);
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+ const struct nf_hook_ops *reg);
#endif
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 617693ff9f4c..b7df32a56e7e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -32,6 +32,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
+#include "nf_internals.h"
+
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -39,11 +41,27 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
+static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;
+/* Private data attached to each base NAT hook (nf_hook_ops.priv). */
+struct nf_nat_lookup_hook_priv {
+ /* RCU-read list of lookup hooks; walked by nf_nat_inet_fn() */
+ struct nf_hook_entries __rcu *entries;
+
+ /* used by kfree_rcu() when the base hooks are torn down */
+ struct rcu_head rcu_head;
+};
+
+/* Per-netns, per-protocol-family bookkeeping for the base NAT hooks. */
+struct nf_nat_hooks_net {
+ /* kmemdup()ed copy of the family's base hook ops; NULL while unused */
+ struct nf_hook_ops *nat_hook_ops;
+ /* number of lookup hooks currently inserted via nf_nat_register_fn() */
+ unsigned int users;
+};
+
+/* Per-netns NAT state, fetched with net_generic(net, nat_net_id). */
+struct nat_net {
+ /* indexed by protocol family (nf_hook_ops.pf) */
+ struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
+};
+
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
@@ -157,7 +175,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
static int in_range(const struct nf_nat_l3proto *l3proto,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
* range specified, otherwise let this drag us onto a new src IP.
@@ -194,7 +212,7 @@ find_appropriate_src(struct net *net,
const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
- const struct nf_nat_range *range)
+ const struct nf_nat_range2 *range)
{
unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct;
@@ -224,7 +242,7 @@ find_appropriate_src(struct net *net,
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
const struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
@@ -298,7 +316,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
@@ -349,9 +367,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
/* Only bother mapping if it's not already in range and unique */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
- if (l4proto->in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
+ l4proto->in_range(tuple, maniptype,
+ &range->min_proto,
+ &range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
goto out;
@@ -360,7 +379,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
}
}
- /* Last change: get protocol to try to obtain unique tuple. */
+ /* Last chance: get protocol to try to obtain unique tuple. */
l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
rcu_read_unlock();
@@ -381,7 +400,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
struct net *net = nf_ct_net(ct);
@@ -459,7 +478,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
(manip == NF_NAT_MANIP_SRC ?
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
- struct nf_nat_range range = {
+ struct nf_nat_range2 range = {
.flags = NF_NAT_RANGE_MAP_IPS,
.min_addr = ip,
.max_addr = ip,
@@ -474,17 +493,36 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+/* Rewrite @skb according to the NAT mapping of conntrack entry @ct.
+ *
+ * The target tuple is the inverse of the tuple stored for the opposite
+ * direction (!@dir). The l3/l4 NAT protocol handlers are looked up from that
+ * target and l3proto->manip_pkt() performs the rewrite for manipulation
+ * type @mtype. Returns NF_DROP if manip_pkt() fails, NF_ACCEPT otherwise.
+ */
+static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
+ enum nf_nat_manip_type mtype,
+ enum ip_conntrack_dir dir)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+
+ /* We are aiming to look like inverse of other direction. */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+ l3proto = __nf_nat_l3proto_find(target.src.l3num);
+ l4proto = __nf_nat_l4proto_find(target.src.l3num,
+ target.dst.protonum);
+ if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+ return NF_DROP;
+
+ return NF_ACCEPT;
+}
+
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum,
struct sk_buff *skb)
{
- const struct nf_nat_l3proto *l3proto;
- const struct nf_nat_l4proto *l4proto;
+ enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int verdict = NF_ACCEPT;
unsigned long statusbit;
- enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
if (mtype == NF_NAT_MANIP_SRC)
statusbit = IPS_SRC_NAT;
@@ -496,21 +534,87 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
statusbit ^= IPS_NAT_MASK;
/* Non-atomic: these bits don't change. */
- if (ct->status & statusbit) {
- struct nf_conntrack_tuple target;
+ if (ct->status & statusbit)
+ verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
+
+ return verdict;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+unsigned int
+nf_nat_inet_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibility to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
- /* We are aiming to look like inverse of other direction. */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ nat = nfct_nat(ct);
- l3proto = __nf_nat_l3proto_find(target.src.l3num);
- l4proto = __nf_nat_l4proto_find(target.src.l3num,
- target.dst.protonum);
- if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
- return NF_DROP;
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ struct nf_nat_lookup_hook_priv *lpriv = priv;
+ struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
+ unsigned int ret;
+ int i;
+
+ if (!e)
+ goto null_bind;
+
+ for (i = 0; i < e->num_hook_entries; i++) {
+ ret = e->hooks[i].hook(e->hooks[i].priv, skb,
+ state);
+ if (ret != NF_ACCEPT)
+ return ret;
+ if (nf_nat_initialized(ct, maniptype))
+ goto do_nat;
+ }
+null_bind:
+ ret = nf_nat_alloc_null_binding(ct, state->hook);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct, ct->status);
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat,
+ state->out))
+ goto oif_changed;
+ }
+ break;
+ default:
+ /* ESTABLISHED */
+ WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
+ goto oif_changed;
}
- return NF_ACCEPT;
+do_nat:
+ return nf_nat_packet(ct, ctinfo, state->hook, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
}
-EXPORT_SYMBOL_GPL(nf_nat_packet);
+EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
struct nf_nat_proto_clean {
u8 l3proto;
@@ -702,7 +806,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
const struct nf_conn *ct,
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
struct nlattr *tb[CTA_PROTONAT_MAX+1];
const struct nf_nat_l4proto *l4proto;
@@ -730,7 +834,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
static int
nfnetlink_parse_nat(const struct nlattr *nat,
- const struct nf_conn *ct, struct nf_nat_range *range,
+ const struct nf_conn *ct, struct nf_nat_range2 *range,
const struct nf_nat_l3proto *l3proto)
{
struct nlattr *tb[CTA_NAT_MAX+1];
@@ -758,7 +862,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
const struct nf_nat_l3proto *l3proto;
int err;
@@ -800,6 +904,146 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
.expectfn = nf_nat_follow_master,
};
+/* Attach the NAT lookup hook @ops to the base NAT hook of its family in @net.
+ *
+ * @orig_nat_ops/@ops_count describe the family's base hooks. On first use
+ * for ops->pf in this netns they are duplicated, each copy gets its own
+ * struct nf_nat_lookup_hook_priv, and the copies are registered with
+ * nf_register_net_hooks(). @ops is then inserted into the entries list of
+ * the base hook whose hooknum equals ops->hooknum, and the user count is
+ * incremented on success.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range or mismatching pf or an
+ * unknown hooknum, -ENOMEM on allocation failure, -EOPNOTSUPP if the base
+ * hook has no priv, or the error from nf_register_net_hooks() /
+ * nf_hook_entries_insert_raw().
+ */
+int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+ const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
+{
+ struct nat_net *nat_net = net_generic(net, nat_net_id);
+ struct nf_nat_hooks_net *nat_proto_net;
+ struct nf_nat_lookup_hook_priv *priv;
+ unsigned int hooknum = ops->hooknum;
+ struct nf_hook_ops *nat_ops;
+ int i, ret;
+
+ if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
+ return -EINVAL;
+
+ nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+ /* Translate the netfilter hook number into an index into the base
+ * ops array; "hooknum" holds that index from here on.
+ */
+ for (i = 0; i < ops_count; i++) {
+ if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
+ return -EINVAL;
+ if (orig_nat_ops[i].hooknum == hooknum) {
+ hooknum = i;
+ break;
+ }
+ }
+
+ if (WARN_ON_ONCE(i == ops_count))
+ return -EINVAL;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ /* First user for this family: set up and register the base hooks. */
+ if (!nat_proto_net->nat_hook_ops) {
+ WARN_ON(nat_proto_net->users != 0);
+
+ nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+ if (!nat_ops) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ops_count; i++) {
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ nat_ops[i].priv = priv;
+ continue;
+ }
+ /* Allocation failed: free the privs allocated so far. */
+ mutex_unlock(&nf_nat_proto_mutex);
+ while (i)
+ kfree(nat_ops[--i].priv);
+ kfree(nat_ops);
+ return -ENOMEM;
+ }
+
+ ret = nf_register_net_hooks(net, nat_ops, ops_count);
+ if (ret < 0) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ for (i = 0; i < ops_count; i++)
+ kfree(nat_ops[i].priv);
+ kfree(nat_ops);
+ return ret;
+ }
+
+ nat_proto_net->nat_hook_ops = nat_ops;
+ }
+
+ nat_ops = nat_proto_net->nat_hook_ops;
+ priv = nat_ops[hooknum].priv;
+ if (WARN_ON_ONCE(!priv)) {
+ mutex_unlock(&nf_nat_proto_mutex);
+ return -EOPNOTSUPP;
+ }
+
+ /* NOTE(review): if this insert fails right after the first-time setup
+ * above, the base hooks stay registered with users == 0 - confirm that
+ * is intended.
+ */
+ ret = nf_hook_entries_insert_raw(&priv->entries, ops);
+ if (ret == 0)
+ nat_proto_net->users++;
+
+ mutex_unlock(&nf_nat_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_register_fn);
+
+/* Detach the NAT lookup hook @ops from the base NAT hook of its family.
+ *
+ * Counterpart of nf_nat_register_fn(). @ops is removed from the entries list
+ * of the base hook matching ops->hooknum. When the last user of the family
+ * goes away the base hooks are unregistered, their privs are released with
+ * kfree_rcu() and the duplicated ops array is freed. Silently returns for an
+ * out-of-range pf; warns and bails out if no user is recorded or the hook
+ * number is unknown.
+ */
+void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+ unsigned int ops_count)
+{
+ struct nat_net *nat_net = net_generic(net, nat_net_id);
+ struct nf_nat_hooks_net *nat_proto_net;
+ struct nf_nat_lookup_hook_priv *priv;
+ struct nf_hook_ops *nat_ops;
+ int hooknum = ops->hooknum;
+ int i;
+
+ if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
+ return;
+
+ nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+ mutex_lock(&nf_nat_proto_mutex);
+ if (WARN_ON(nat_proto_net->users == 0))
+ goto unlock;
+
+ /* NOTE(review): the count is dropped before the hook lookup below; if
+ * the lookup fails, users was decremented without removing anything -
+ * confirm that is acceptable.
+ */
+ nat_proto_net->users--;
+
+ /* Translate the hook number into an index into the base ops array. */
+ nat_ops = nat_proto_net->nat_hook_ops;
+ for (i = 0; i < ops_count; i++) {
+ if (nat_ops[i].hooknum == hooknum) {
+ hooknum = i;
+ break;
+ }
+ }
+ if (WARN_ON_ONCE(i == ops_count))
+ goto unlock;
+ priv = nat_ops[hooknum].priv;
+ nf_hook_entries_delete_raw(&priv->entries, ops);
+
+ /* Last user gone: tear down the base hooks for this family. */
+ if (nat_proto_net->users == 0) {
+ nf_unregister_net_hooks(net, nat_ops, ops_count);
+
+ for (i = 0; i < ops_count; i++) {
+ priv = nat_ops[i].priv;
+ kfree_rcu(priv, rcu_head);
+ }
+
+ nat_proto_net->nat_hook_ops = NULL;
+ kfree(nat_ops);
+ }
+unlock:
+ mutex_unlock(&nf_nat_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
+
+/* Pernet registration for struct nat_net; the id stored in nat_net_id is what
+ * net_generic() is called with in nf_nat_register_fn()/nf_nat_unregister_fn().
+ */
+static struct pernet_operations nat_net_ops = {
+ .id = &nat_net_id,
+ .size = sizeof(struct nat_net),
+};
+
+/* Callbacks published through the nf_nat_hook pointer in nf_nat_init(). */
+static struct nf_nat_hook nat_hook = {
+ .parse_nat_setup = nfnetlink_parse_nat_setup,
+#ifdef CONFIG_XFRM
+ /* only available when XFRM support is built in */
+ .decode_session = __nf_nat_decode_session,
+#endif
+ .manip_pkt = nf_nat_manip_pkt,
+};
+
static int __init nf_nat_init(void)
{
int ret, i;
@@ -823,15 +1067,17 @@ static int __init nf_nat_init(void)
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
+ ret = register_pernet_subsys(&nat_net_ops);
+ if (ret < 0) {
+ nf_ct_extend_unregister(&nat_extend);
+ return ret;
+ }
+
nf_ct_helper_expectfn_register(&follow_master_nat);
- BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
- RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
- nfnetlink_parse_nat_setup);
-#ifdef CONFIG_XFRM
- BUG_ON(nf_nat_decode_session_hook != NULL);
- RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
-#endif
+ WARN_ON(nf_nat_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
+
return 0;
}
@@ -844,16 +1090,15 @@ static void __exit nf_nat_cleanup(void)
nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
- RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-#ifdef CONFIG_XFRM
- RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
-#endif
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+
synchronize_rcu();
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ unregister_pernet_subsys(&nat_net_ops);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 607a373379b4..99606baedda4 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
void nf_nat_follow_master(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 7d7466dbf663..5d849d835561 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct,
u16 *rover)
@@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
: tuple->src.u.all);
} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
off = prandom_u32();
+ } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
+ off = (ntohs(*portptr) - ntohs(range->base_proto.all));
} else {
off = *rover;
}
@@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
*portptr = htons(min + off % range_size);
if (++i != range_size && nf_nat_used_tuple(tuple, ct))
continue;
- if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+ if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
+ NF_NAT_RANGE_PROTO_OFFSET)))
*rover = off;
return;
}
@@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
+ struct nf_nat_range2 *range)
{
if (tb[CTA_PROTONAT_PORT_MIN]) {
range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 269fcd5dc34c..67ea0d83aa5a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover;
static void
dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index c57ee3240b1d..1c5d9b65fbba 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover;
static void
sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 4f8820fc5148..f15fcd475f98 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u16 tcp_port_rover;
static void
tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index edd4a77dc09a..5790f70a83b2 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u16 udp_port_rover;
static void
udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
@@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
static void
udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
index 6e494d584412..c5db3e251232 100644
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 25b06b959118..adee04af8d43 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -15,7 +15,6 @@
#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/types.h>
@@ -36,7 +35,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
__be32 newdst;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
hooknum != NF_INET_LOCAL_OUT);
@@ -82,10 +81,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
struct in6_addr newdst;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 791fac4fd745..1f3086074981 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
static void nf_nat_sip_expected(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
new file mode 100644
index 000000000000..5ba5c7bef2f9
--- /dev/null
+++ b/net/netfilter/nf_osf.c
@@ -0,0 +1,218 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nf_osf.h>
+
+/*
+ * Check the packet's IP TTL against the fingerprint TTL @f_ttl.
+ *
+ * Returns non-zero when the TTL is acceptable, 0 otherwise.
+ *
+ * Without NF_OSF_TTL in info->flags the two TTLs must be equal.  With it,
+ * info->ttl selects the mode: NF_OSF_TTL_TRUE requires equality,
+ * NF_OSF_TTL_NOCHECK accepts anything, and every other value accepts
+ * ip->ttl <= f_ttl before falling through to the per-address check below.
+ */
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+ const struct nf_osf_info *info,
+ unsigned char f_ttl)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+
+ if (info->flags & NF_OSF_TTL) {
+ if (info->ttl == NF_OSF_TTL_TRUE)
+ return ip->ttl == f_ttl;
+ if (info->ttl == NF_OSF_TTL_NOCHECK)
+ return 1;
+ else if (ip->ttl <= f_ttl)
+ return 1;
+ else {
+ /*
+ * NOTE(review): in_dev is walked without a NULL check;
+ * confirm __in_dev_get_rcu() cannot return NULL for skb->dev here.
+ */
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ int ret = 0;
+
+ /*
+ * Look for an address on the receiving device for which
+ * inet_ifa_match() accepts the packet source.
+ * NOTE(review): this branch is only reached with
+ * ip->ttl > f_ttl, so the equality below cannot hold and
+ * ret stays 0 -- confirm whether that is intended.
+ */
+ for_ifa(in_dev) {
+ if (inet_ifa_match(ip->saddr, ifa)) {
+ ret = (ip->ttl == f_ttl);
+ break;
+ }
+ }
+ endfor_ifa(in_dev);
+
+ return ret;
+ }
+ }
+
+ return ip->ttl == f_ttl;
+}
+
+/*
+ * nf_osf_match - match a TCP SYN against a table of passive OS fingerprints.
+ * @skb: packet to inspect; the IP header is taken from ip_hdr(skb)
+ * @family, @hooknum, @in, @out, @net: only forwarded to nf_log_packet()
+ * @info: match configuration (genre to look for, TTL mode, logging flags)
+ * @nf_osf_fingers: array of fingerprint lists, indexed by the packet's DF bit
+ *
+ * A fingerprint matches when the IP total length, the TTL (see nf_osf_ttl()),
+ * the sequence of TCP option kinds/lengths and the window-size rule all
+ * agree with the packet.  With NF_OSF_LOG set every matching fingerprint is
+ * logged (only the first one for NF_OSF_LOGLEVEL_FIRST) regardless of genre,
+ * and a "not known" line is logged when nothing matched.
+ *
+ * Returns true if at least one fingerprint matched; false otherwise,
+ * including when the TCP header or its options cannot be read or the
+ * segment is not a SYN.
+ *
+ * The list is walked with list_for_each_entry_rcu(); the caller is expected
+ * to provide the required RCU protection.
+ */
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+ int hooknum, struct net_device *in, struct net_device *out,
+ const struct nf_osf_info *info, struct net *net,
+ const struct list_head *nf_osf_fingers)
+{
+ const unsigned char *optp = NULL, *_optp = NULL;
+ unsigned int optsize = 0, check_WSS = 0;
+ int fmatch = FMATCH_WRONG, fcount = 0;
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct nf_osf_user_finger *f;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct nf_osf_finger *kf;
+ u16 window, totlen, mss = 0;
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+ bool df;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ if (!tcp)
+ return false;
+
+ if (!tcp->syn)
+ return false;
+
+ totlen = ntohs(ip->tot_len);
+ df = ntohs(ip->frag_off) & IP_DF;
+ window = ntohs(tcp->window);
+
+ if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+ optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+ _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+ sizeof(struct tcphdr), optsize, opts);
+ /* Truncated packet: the advertised options are not there. */
+ if (!optp)
+ return false;
+ }
+
+ list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
+ int foptsize, optnum;
+
+ f = &kf->finger;
+
+ /* When logging, every genre is evaluated, not only the requested one. */
+ if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+ continue;
+
+ /* Restart option parsing for each fingerprint. */
+ optp = _optp;
+ fmatch = FMATCH_WRONG;
+
+ if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
+ continue;
+
+ /*
+ * Should not happen if userspace parser was written correctly.
+ */
+ if (f->wss.wc >= OSF_WSS_MAX)
+ continue;
+
+ /* Check options */
+
+ foptsize = 0;
+ for (optnum = 0; optnum < f->opt_num; ++optnum)
+ foptsize += f->opt[optnum].length;
+
+ if (foptsize > MAX_IPOPTLEN ||
+ optsize > MAX_IPOPTLEN ||
+ optsize != foptsize)
+ continue;
+
+ check_WSS = f->wss.wc;
+
+ for (optnum = 0; optnum < f->opt_num; ++optnum) {
+ /*
+ * Nothing left to compare against (the packet has no
+ * options, or zero-length fingerprint options let the
+ * cursor reach the end): report an option mismatch
+ * instead of dereferencing NULL or reading past the end.
+ */
+ if (!optp || optp >= _optp + optsize) {
+ fmatch = FMATCH_OPT_WRONG;
+ break;
+ }
+
+ if (f->opt[optnum].kind == (*optp)) {
+ __u32 len = f->opt[optnum].length;
+ const __u8 *optend = optp + len;
+
+ fmatch = FMATCH_OK;
+
+ switch (*optp) {
+ case OSFOPT_MSS:
+ /* kind, length, 16-bit value: need 4 bytes */
+ if (len < 4)
+ break;
+ /* MSS is big-endian on the wire; decode it
+ * without depending on host byte order.
+ */
+ mss = ((u16)optp[2] << 8) | optp[3];
+ break;
+ case OSFOPT_TS:
+ break;
+ }
+
+ optp = optend;
+ } else
+ fmatch = FMATCH_OPT_WRONG;
+
+ if (fmatch != FMATCH_OK)
+ break;
+ }
+
+ if (fmatch != FMATCH_OPT_WRONG) {
+ fmatch = FMATCH_WRONG;
+
+ switch (check_WSS) {
+ case OSF_WSS_PLAIN:
+ if (f->wss.val == 0 || window == f->wss.val)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MSS:
+ /*
+ * Some smart modems mangle the MSS down to
+ * SMART_MSS_2, so we check the standard value,
+ * the decreased value and the MSS carried by
+ * the packet itself.
+ */
+#define SMART_MSS_1 1460
+#define SMART_MSS_2 1448
+ if (window == f->wss.val * mss ||
+ window == f->wss.val * SMART_MSS_1 ||
+ window == f->wss.val * SMART_MSS_2)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MTU:
+ if (window == f->wss.val * (mss + 40) ||
+ window == f->wss.val * (SMART_MSS_1 + 40) ||
+ window == f->wss.val * (SMART_MSS_2 + 40))
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MODULO:
+ /* A zero modulus from userspace must not divide. */
+ if (f->wss.val && (window % f->wss.val) == 0)
+ fmatch = FMATCH_OK;
+ break;
+ }
+ }
+
+ if (fmatch != FMATCH_OK)
+ continue;
+
+ fcount++;
+
+ if (info->flags & NF_OSF_LOG)
+ nf_log_packet(net, family, hooknum, skb,
+ in, out, NULL,
+ "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+ f->genre, f->version, f->subtype,
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest),
+ f->ttl - ip->ttl);
+
+ if ((info->flags & NF_OSF_LOG) &&
+ info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+ break;
+ }
+
+ if (!fcount && (info->flags & NF_OSF_LOG))
+ nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+ "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest));
+
+ /* Any earlier hit counts even if the last fingerprint tried failed. */
+ if (fcount)
+ fmatch = FMATCH_OK;
+
+ return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 501e48a7965b..ca4c4d994ddb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -28,6 +28,42 @@ static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static u64 table_handle;
+/* Values stored in net->nft.validate_state by nft_validate_state_update(). */
+enum {
+ NFT_VALIDATE_SKIP = 0,
+ NFT_VALIDATE_NEED,
+ NFT_VALIDATE_DO,
+};
+
+/* Hash/compare callbacks for the per-table chain hash table, defined below. */
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);
+
+/*
+ * rhltable parameters for looking chains up by name: entries are linked
+ * through nft_chain.rhlhead and keyed on nft_chain.name.
+ */
+static const struct rhashtable_params nft_chain_ht_params = {
+ .head_offset = offsetof(struct nft_chain, rhlhead),
+ .key_offset = offsetof(struct nft_chain, name),
+ .hashfn = nft_chain_hash,
+ .obj_hashfn = nft_chain_hash_obj,
+ .obj_cmpfn = nft_chain_hash_cmp,
+ .locks_mul = 1,
+ .automatic_shrinking = true,
+};
+
+/*
+ * Move net->nft.validate_state to @new_validate_state.
+ *
+ * Transitions, keyed on the current state:
+ *  - SKIP: any new state is stored, but going straight to DO triggers a
+ *    one-time warning.
+ *  - NEED: any new state is stored.
+ *  - DO:   a request for NEED is ignored (state stays DO); anything else is
+ *    stored.
+ */
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{
+ switch (net->nft.validate_state) {
+ case NFT_VALIDATE_SKIP:
+ WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+ break;
+ case NFT_VALIDATE_NEED:
+ break;
+ case NFT_VALIDATE_DO:
+ if (new_validate_state == NFT_VALIDATE_NEED)
+ return;
+ }
+
+ net->nft.validate_state = new_validate_state;
+}
+
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
const struct sk_buff *skb,
@@ -74,88 +110,43 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
-/* removal requests are queued in the commit_list, but not acted upon
- * until after all new rules are in place.
- *
- * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending
- * nf_unregister_net_hook().
- *
- * nf_register_net_hook thus fails if a nat hook is already in place
- * even if the conflicting hook is about to be removed.
- *
- * If collision is detected, search commit_log for DELCHAIN matching
- * the new nat hooknum; if we find one collision is temporary:
- *
- * Either transaction is aborted (new/colliding hook is removed), or
- * transaction is committed (old hook is removed).
- */
-static bool nf_tables_allow_nat_conflict(const struct net *net,
- const struct nf_hook_ops *ops)
-{
- const struct nft_trans *trans;
- bool ret = false;
-
- if (!ops->nat_hook)
- return false;
-
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- const struct nf_hook_ops *pending_ops;
- const struct nft_chain *pending;
-
- if (trans->msg_type != NFT_MSG_NEWCHAIN &&
- trans->msg_type != NFT_MSG_DELCHAIN)
- continue;
-
- pending = trans->ctx.chain;
- if (!nft_is_base_chain(pending))
- continue;
-
- pending_ops = &nft_base_chain(pending)->ops;
- if (pending_ops->nat_hook &&
- pending_ops->pf == ops->pf &&
- pending_ops->hooknum == ops->hooknum) {
- /* other hook registration already pending? */
- if (trans->msg_type == NFT_MSG_NEWCHAIN)
- return false;
-
- ret = true;
- }
- }
-
- return ret;
-}
-
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
- struct nf_hook_ops *ops;
- int ret;
+ const struct nft_base_chain *basechain;
+ const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
!nft_is_base_chain(chain))
return 0;
- ops = &nft_base_chain(chain)->ops;
- ret = nf_register_net_hook(net, ops);
- if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
- ops->nat_hook = false;
- ret = nf_register_net_hook(net, ops);
- ops->nat_hook = true;
- }
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
- return ret;
+ if (basechain->type->ops_register)
+ return basechain->type->ops_register(net, ops);
+
+ return nf_register_net_hook(net, ops);
}
static void nf_tables_unregister_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
+ const struct nft_base_chain *basechain;
+ const struct nf_hook_ops *ops;
+
if (table->flags & NFT_TABLE_F_DORMANT ||
!nft_is_base_chain(chain))
return;
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
+
+ if (basechain->type->ops_unregister)
+ return basechain->type->ops_unregister(net, ops);
- nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+ nf_unregister_net_hook(net, ops);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -415,13 +406,17 @@ static struct nft_table *nft_table_lookup(const struct net *net,
{
struct nft_table *table;
- list_for_each_entry(table, &net->nft.tables, list) {
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry_rcu(table, &net->nft.tables, list) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
nft_active_genmask(table, genmask))
return table;
}
- return NULL;
+
+ return ERR_PTR(-ENOENT);
}
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
@@ -435,37 +430,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
nft_active_genmask(table, genmask))
return table;
}
- return NULL;
-}
-
-static struct nft_table *nf_tables_table_lookup(const struct net *net,
- const struct nlattr *nla,
- u8 family, u8 genmask)
-{
- struct nft_table *table;
-
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
- table = nft_table_lookup(net, nla, family, genmask);
- if (table != NULL)
- return table;
-
- return ERR_PTR(-ENOENT);
-}
-
-static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
- const struct nlattr *nla,
- u8 genmask)
-{
- struct nft_table *table;
-
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
- table = nft_table_lookup_byhandle(net, nla, genmask);
- if (table != NULL)
- return table;
return ERR_PTR(-ENOENT);
}
@@ -618,6 +582,24 @@ done:
return skb->len;
}
+/*
+ * Wrapper around netlink_dump_start() for handlers that are entered with
+ * rcu_read_lock held: the RCU read-side section is left for the duration of
+ * the call and re-entered before returning, so the caller's locking state is
+ * unchanged on return.  A reference on this module is held across the window.
+ *
+ * Returns -EINVAL if the module reference cannot be taken, otherwise the
+ * result of netlink_dump_start().
+ */
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *c)
+{
+ int err;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ /* presumably netlink_dump_start() may sleep -- TODO confirm */
+ rcu_read_unlock();
+ err = netlink_dump_start(nlsk, skb, nlh, c);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ return err;
+}
+
+/* called with rcu_read_lock held */
static int nf_tables_gettable(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -633,16 +615,19 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_tables,
+ .module = THIS_MODULE,
};
- return netlink_dump_start(nlsk, skb, nlh, &c);
+
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
return PTR_ERR(table);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -749,6 +734,29 @@ err:
return ret;
}
+/*
+ * Key hash for nft_chain_ht_params: @data is a NUL-terminated chain name.
+ * The hash covers strlen(name) bytes; @len is ignored.
+ */
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
+{
+ const char *name = data;
+
+ return jhash(name, strlen(name), seed);
+}
+
+/*
+ * Object hash for nft_chain_ht_params: @data is a struct nft_chain; hashes
+ * its name with nft_chain_hash() so key and object hashes agree.
+ */
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct nft_chain *chain = data;
+
+ return nft_chain_hash(chain->name, 0, seed);
+}
+
+/*
+ * Compare callback for nft_chain_ht_params: arg->key is the name being
+ * looked up, @ptr the candidate chain.  Returns 0 when the names are equal
+ * (strcmp() result).
+ */
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nft_chain *chain = ptr;
+ const char *name = arg->key;
+
+ return strcmp(chain->name, name);
+}
+
static int nf_tables_newtable(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -756,21 +764,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
- const struct nlattr *name;
- struct nft_table *table;
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
+ struct nft_table *table;
u32 flags = 0;
struct nft_ctx ctx;
int err;
- name = nla[NFTA_TABLE_NAME];
- table = nf_tables_table_lookup(net, name, family, genmask);
+ attr = nla[NFTA_TABLE_NAME];
+ table = nft_table_lookup(net, attr, family, genmask);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
@@ -789,10 +799,14 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(name, GFP_KERNEL);
+ table->name = nla_strdup(attr, GFP_KERNEL);
if (table->name == NULL)
goto err_strdup;
+ err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
+ if (err)
+ goto err_chain_ht;
+
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
INIT_LIST_HEAD(&table->objects);
@@ -809,6 +823,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
list_add_tail_rcu(&table->list, &net->nft.tables);
return 0;
err_trans:
+ rhltable_destroy(&table->chains_ht);
+err_chain_ht:
kfree(table->name);
err_strdup:
kfree(table);
@@ -912,8 +928,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
- struct nft_table *table;
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
+ struct nft_table *table;
struct nft_ctx ctx;
nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
@@ -921,16 +938,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
(!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
- if (nla[NFTA_TABLE_HANDLE])
- table = nf_tables_table_lookup_byhandle(net,
- nla[NFTA_TABLE_HANDLE],
- genmask);
- else
- table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
- family, genmask);
+ if (nla[NFTA_TABLE_HANDLE]) {
+ attr = nla[NFTA_TABLE_HANDLE];
+ table = nft_table_lookup_byhandle(net, attr, genmask);
+ } else {
+ attr = nla[NFTA_TABLE_NAME];
+ table = nft_table_lookup(net, attr, family, genmask);
+ }
- if (IS_ERR(table))
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(table);
+ }
if (nlh->nlmsg_flags & NLM_F_NONREC &&
table->use > 0)
@@ -946,6 +965,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
{
BUG_ON(ctx->table->use > 0);
+ rhltable_destroy(&ctx->table->chains_ht);
kfree(ctx->table->name);
kfree(ctx->table);
}
@@ -978,8 +998,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
*/
static struct nft_chain *
-nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
- u8 genmask)
+nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
{
struct nft_chain *chain;
@@ -992,22 +1011,35 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
return ERR_PTR(-ENOENT);
}
-static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u8 genmask)
+static struct nft_chain *nft_chain_lookup(struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
+ char search[NFT_CHAIN_MAXNAMELEN + 1];
+ struct rhlist_head *tmp, *list;
struct nft_chain *chain;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry(chain, &table->chains, list) {
- if (!nla_strcmp(nla, chain->name) &&
- nft_active_genmask(chain, genmask))
- return chain;
- }
+ nla_strlcpy(search, nla, sizeof(search));
- return ERR_PTR(-ENOENT);
+ WARN_ON(!rcu_read_lock_held() &&
+ !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+ chain = ERR_PTR(-ENOENT);
+ rcu_read_lock();
+ list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
+ if (!list)
+ goto out_unlock;
+
+ rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
+ if (nft_active_genmask(chain, genmask))
+ goto out_unlock;
+ }
+ chain = ERR_PTR(-ENOENT);
+out_unlock:
+ rcu_read_unlock();
+ return chain;
}
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
@@ -1203,6 +1235,7 @@ done:
return skb->len;
}
+/* called with rcu_read_lock held */
static int nf_tables_getchain(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -1210,8 +1243,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_cur(net);
- const struct nft_table *table;
const struct nft_chain *chain;
+ struct nft_table *table;
struct sk_buff *skb2;
int family = nfmsg->nfgen_family;
int err;
@@ -1219,20 +1252,25 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_chains,
+ .module = THIS_MODULE,
};
- return netlink_dump_start(nlsk, skb, nlh, &c);
+
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return PTR_ERR(chain);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -1304,17 +1342,32 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
}
}
+/*
+ * Free the rule pointer arrays attached to @chain: rules_gen_0, rules_gen_1
+ * and (if still set) rules_next.
+ */
+static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
+{
+ struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
+ struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+
+ /* both generations may share one array; free it only once */
+ if (g0 != g1)
+ kvfree(g1);
+ kvfree(g0);
+
+ /* should be NULL either via abort or via successful commit */
+ WARN_ON_ONCE(chain->rules_next);
+ kvfree(chain->rules_next);
+}
+
static void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
BUG_ON(chain->use > 0);
+ /* no concurrent access possible anymore */
+ nf_tables_chain_free_chain_rules(chain);
+
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (basechain->type->free)
- basechain->type->free(ctx);
module_put(basechain->type->owner);
free_percpu(basechain->stats);
if (basechain->stats)
@@ -1404,6 +1457,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
module_put(hook->type->owner);
}
+/*
+ * Bookkeeping record for which nf_tables_chain_alloc_rules() reserves extra
+ * space in every rule array allocation.
+ * NOTE(review): presumably used to free a retired array after an RCU grace
+ * period -- the user is not visible here, confirm.
+ */
+struct nft_rules_old {
+ struct rcu_head h;
+ struct nft_rule **start;
+};
+
+/*
+ * Allocate an array able to hold @alloc rule pointers plus a terminating
+ * NULL entry, with sizeof(struct nft_rules_old) additional bytes.
+ *
+ * Returns NULL if the size computation would exceed INT_MAX or if
+ * kvmalloc() fails.  The memory is not initialised; the caller must write
+ * the entries, including the NULL terminator.  @chain is not used.
+ */
+static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+ unsigned int alloc)
+{
+ if (alloc > INT_MAX)
+ return NULL;
+
+ alloc += 1; /* NULL, ends rules */
+ /* reject sizes where alloc * sizeof(pointer) would exceed INT_MAX */
+ if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+ return NULL;
+
+ alloc *= sizeof(struct nft_rule *);
+ alloc += sizeof(struct nft_rules_old);
+
+ return kvmalloc(alloc, GFP_KERNEL);
+}
+
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
u8 policy, bool create)
{
@@ -1413,6 +1487,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
struct nft_stats __percpu *stats;
struct net *net = ctx->net;
struct nft_chain *chain;
+ struct nft_rule **rules;
int err;
if (table->use == UINT_MAX)
@@ -1447,9 +1522,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
basechain->type = hook.type;
- if (basechain->type->init)
- basechain->type->init(ctx);
-
chain = &basechain->chain;
ops = &basechain->ops;
@@ -1460,9 +1532,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
ops->hook = hook.type->hooks[ops->hooknum];
ops->dev = hook.dev;
- if (basechain->type->type == NFT_CHAIN_T_NAT)
- ops->nat_hook = true;
-
chain->flags |= NFT_BASE_CHAIN;
basechain->policy = policy;
} else {
@@ -1481,13 +1550,31 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
goto err1;
}
+ rules = nf_tables_chain_alloc_rules(chain, 0);
+ if (!rules) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ *rules = NULL;
+ rcu_assign_pointer(chain->rules_gen_0, rules);
+ rcu_assign_pointer(chain->rules_gen_1, rules);
+
err = nf_tables_register_hook(net, table, chain);
if (err < 0)
goto err1;
+ err = rhltable_insert_key(&table->chains_ht, chain->name,
+ &chain->rhlhead, nft_chain_ht_params);
+ if (err)
+ goto err2;
+
err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
- if (err < 0)
+ if (err < 0) {
+ rhltable_remove(&table->chains_ht, &chain->rhlhead,
+ nft_chain_ht_params);
goto err2;
+ }
table->use++;
list_add_tail_rcu(&chain->list, &table->chains);
@@ -1544,8 +1631,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nla[NFTA_CHAIN_NAME]) {
struct nft_chain *chain2;
- chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME],
- genmask);
+ chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
if (!IS_ERR(chain2))
return -EEXIST;
}
@@ -1595,9 +1681,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
struct netlink_ext_ack *extack)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- const struct nlattr * uninitialized_var(name);
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
u8 policy = NF_ACCEPT;
@@ -1607,36 +1693,46 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
chain = NULL;
- name = nla[NFTA_CHAIN_NAME];
+ attr = nla[NFTA_CHAIN_NAME];
if (nla[NFTA_CHAIN_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
- chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup_byhandle(table, handle, genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
return PTR_ERR(chain);
+ }
+ attr = nla[NFTA_CHAIN_HANDLE];
} else {
- chain = nf_tables_chain_lookup(table, name, genmask);
+ chain = nft_chain_lookup(table, attr, genmask);
if (IS_ERR(chain)) {
- if (PTR_ERR(chain) != -ENOENT)
+ if (PTR_ERR(chain) != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
+ }
chain = NULL;
}
}
if (nla[NFTA_CHAIN_POLICY]) {
if (chain != NULL &&
- !nft_is_base_chain(chain))
+ !nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
+ }
if (chain == NULL &&
- nla[NFTA_CHAIN_HOOK] == NULL)
+ nla[NFTA_CHAIN_HOOK] == NULL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
return -EOPNOTSUPP;
+ }
policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
switch (policy) {
@@ -1651,8 +1747,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
if (chain != NULL) {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
@@ -1669,28 +1767,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule;
- int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
u64 handle;
u32 use;
int err;
- table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
+ }
if (nla[NFTA_CHAIN_HANDLE]) {
- handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
- chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+ attr = nla[NFTA_CHAIN_HANDLE];
+ handle = be64_to_cpu(nla_get_be64(attr));
+ chain = nft_chain_lookup_byhandle(table, handle, genmask);
} else {
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ attr = nla[NFTA_CHAIN_NAME];
+ chain = nft_chain_lookup(table, attr, genmask);
}
- if (IS_ERR(chain))
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
+ }
if (nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
@@ -1712,8 +1816,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
/* There are rules and elements that are still holding references to us,
* we cannot do a recursive removal in this case.
*/
- if (use > 0)
+ if (use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
return nft_delchain(&ctx);
}
@@ -1905,19 +2011,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
goto err1;
}
- if (ops->validate) {
- const struct nft_data *data = NULL;
-
- err = ops->validate(ctx, expr, &data);
- if (err < 0)
- goto err2;
- }
-
return 0;
-
-err2:
- if (ops->destroy)
- ops->destroy(ctx, expr);
err1:
expr->ops = NULL;
return err;
@@ -1970,13 +2064,13 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
* Rules
*/
-static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
- u64 handle)
+static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+ u64 handle)
{
struct nft_rule *rule;
// FIXME: this sucks
- list_for_each_entry(rule, &chain->rules, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list) {
if (handle == rule->handle)
return rule;
}
@@ -1984,13 +2078,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
return ERR_PTR(-ENOENT);
}
-static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
- const struct nlattr *nla)
+static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+ const struct nlattr *nla)
{
if (nla == NULL)
return ERR_PTR(-EINVAL);
- return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+ return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2172,6 +2266,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
return 0;
}
+/* called with rcu_read_lock held */
static int nf_tables_getrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2179,9 +2274,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_cur(net);
- const struct nft_table *table;
const struct nft_chain *chain;
const struct nft_rule *rule;
+ struct nft_table *table;
struct sk_buff *skb2;
int family = nfmsg->nfgen_family;
int err;
@@ -2190,18 +2285,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_rules,
.done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
struct nft_rule_dump_ctx *ctx;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
if (!ctx)
return -ENOMEM;
if (nla[NFTA_RULE_TABLE]) {
ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!ctx->table) {
kfree(ctx);
return -ENOMEM;
@@ -2209,7 +2305,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
}
if (nla[NFTA_RULE_CHAIN]) {
ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!ctx->chain) {
kfree(ctx->table);
kfree(ctx);
@@ -2219,23 +2315,28 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
c.data = ctx;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
- rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
- if (IS_ERR(rule))
+ rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -2276,6 +2377,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
nf_tables_rule_destroy(ctx, rule);
}
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+ struct nft_expr *expr, *last;
+ const struct nft_data *data;
+ struct nft_rule *rule;
+ int err;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(ctx->net, rule))
+ continue;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ if (!expr->ops->validate)
+ continue;
+
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+ struct nft_chain *chain;
+ struct nft_ctx ctx = {
+ .net = net,
+ .family = table->family,
+ };
+ int err;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain))
+ continue;
+
+ ctx.chain = chain;
+ err = nft_chain_validate(&ctx, chain);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
#define NFT_RULE_MAXEXPRS 128
static struct nft_expr_info *info;
@@ -2303,23 +2451,30 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
- rule = __nf_tables_rule_lookup(chain, handle);
- if (IS_ERR(rule))
+ rule = __nft_rule_lookup(chain, handle);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
@@ -2338,9 +2493,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return -EOPNOTSUPP;
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nf_tables_rule_lookup(chain, pos_handle);
- if (IS_ERR(old_rule))
+ old_rule = __nft_rule_lookup(chain, pos_handle);
+ if (IS_ERR(old_rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
+ }
}
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
@@ -2394,6 +2551,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
err = nf_tables_newexpr(&ctx, &info[i], expr);
if (err < 0)
goto err2;
+
+ if (info[i].ops->validate)
+ nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
info[i].ops = NULL;
expr = nft_expr_next(expr);
}
@@ -2437,8 +2598,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
}
chain->use++;
- return 0;
+ if (net->nft.validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
+
+ return 0;
err2:
nf_tables_rule_release(&ctx, rule);
err1:
@@ -2478,32 +2642,37 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
int family = nfmsg->nfgen_family, err = 0;
struct nft_ctx ctx;
- table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
+ }
if (nla[NFTA_RULE_CHAIN]) {
- chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN],
- genmask);
- if (IS_ERR(chain))
+ chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
+ }
}
nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
- rule = nf_tables_rule_lookup(chain,
- nla[NFTA_RULE_HANDLE]);
- if (IS_ERR(rule))
+ rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
+ }
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
- if (IS_ERR(rule))
+ if (IS_ERR(rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
+ }
err = nft_delrule(&ctx, rule);
} else {
@@ -2548,14 +2717,12 @@ void nft_unregister_set(struct nft_set_type *type)
EXPORT_SYMBOL_GPL(nft_unregister_set);
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
- NFT_SET_TIMEOUT | NFT_SET_OBJECT)
+ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
+ NFT_SET_EVAL)
-static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags)
+static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
{
- if ((flags & NFT_SET_EVAL) && !ops->update)
- return false;
-
- return (flags & ops->features) == (flags & NFT_SET_FEATURES);
+ return (flags & type->features) == (flags & NFT_SET_FEATURES);
}
/*
@@ -2592,14 +2759,9 @@ nft_select_set_ops(const struct nft_ctx *ctx,
best.space = ~0;
list_for_each_entry(type, &nf_tables_set_types, list) {
- if (!type->select_ops)
- ops = type->ops;
- else
- ops = type->select_ops(ctx, desc, flags);
- if (!ops)
- continue;
+ ops = &type->ops;
- if (!nft_set_ops_candidate(ops, flags))
+ if (!nft_set_ops_candidate(type, flags))
continue;
if (!ops->estimate(desc, flags, &est))
continue;
@@ -2630,7 +2792,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
if (!try_module_get(type->owner))
continue;
if (bops != NULL)
- module_put(bops->type->owner);
+ module_put(to_set_type(bops)->owner);
bops = ops;
best = est;
@@ -2671,6 +2833,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
+ struct netlink_ext_ack *extack,
u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2678,25 +2841,27 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
struct nft_table *table = NULL;
if (nla[NFTA_SET_TABLE] != NULL) {
- table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
+ }
}
nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
return 0;
}
-static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry(set, &table->sets, list) {
+ list_for_each_entry_rcu(set, &table->sets, list) {
if (!nla_strcmp(nla, set->name) &&
nft_active_genmask(set, genmask))
return set;
@@ -2704,14 +2869,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
return ERR_PTR(-ENOENT);
}
-static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla,
+ u8 genmask)
{
struct nft_set *set;
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
list_for_each_entry(set, &table->sets, list) {
if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
nft_active_genmask(set, genmask))
@@ -2720,9 +2883,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab
return ERR_PTR(-ENOENT);
}
-static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
- const struct nlattr *nla,
- u8 genmask)
+static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_trans *trans;
u32 id = ntohl(nla_get_be32(nla));
@@ -2746,12 +2908,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
{
struct nft_set *set;
- set = nf_tables_set_lookup(table, nla_set_name, genmask);
+ set = nft_set_lookup(table, nla_set_name, genmask);
if (IS_ERR(set)) {
if (!nla_set_id)
return set;
- set = nf_tables_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, nla_set_id, genmask);
}
return set;
}
@@ -2811,6 +2973,27 @@ cont:
return 0;
}
+static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+{
+ u64 ms = be64_to_cpu(nla_get_be64(nla));
+ u64 max = (u64)(~((u64)0));
+
+ max = div_u64(max, NSEC_PER_MSEC);
+ if (ms >= max)
+ return -ERANGE;
+
+ ms *= NSEC_PER_MSEC;
+ *result = nsecs_to_jiffies64(ms);
+ return 0;
+}
+
+static __be64 nf_jiffies64_to_msecs(u64 input)
+{
+ u64 ms = jiffies64_to_nsecs(input);
+
+ return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -2858,7 +3041,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (set->timeout &&
nla_put_be64(skb, NFTA_SET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(set->timeout)),
+ nf_jiffies64_to_msecs(set->timeout),
NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
@@ -2983,6 +3166,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
return 0;
}
+/* called with rcu_read_lock held */
static int nf_tables_getset(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2996,7 +3180,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
@@ -3004,17 +3189,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_sets,
.done = nf_tables_dump_sets_done,
+ .module = THIS_MODULE,
};
struct nft_ctx *ctx_dump;
- ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+ ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
if (ctx_dump == NULL)
return -ENOMEM;
*ctx_dump = ctx;
c.data = ctx_dump;
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
/* Only accept unspec with dump */
@@ -3023,11 +3209,11 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
return -ENOMEM;
@@ -3153,8 +3339,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- nla[NFTA_SET_TIMEOUT])));
+
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -3175,22 +3363,28 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
- table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
- set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
- if (PTR_ERR(set) != -ENOENT)
+ if (PTR_ERR(set) != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return -EEXIST;
+ }
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+
return 0;
}
@@ -3233,6 +3427,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
+ set->table = table;
+ write_pnet(&set->net, net);
set->ops = ops;
set->ktype = ktype;
set->klen = desc.klen;
@@ -3267,14 +3463,14 @@ err3:
err2:
kvfree(set);
err1:
- module_put(ops->type->owner);
+ module_put(to_set_type(ops)->owner);
return err;
}
static void nft_set_destroy(struct nft_set *set)
{
set->ops->destroy(set);
- module_put(set->ops->type->owner);
+ module_put(to_set_type(set->ops)->owner);
kfree(set->name);
kvfree(set);
}
@@ -3293,6 +3489,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
int err;
@@ -3302,20 +3499,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- if (nla[NFTA_SET_HANDLE])
- set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
- else
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
- return PTR_ERR(set);
+ if (nla[NFTA_SET_HANDLE]) {
+ attr = nla[NFTA_SET_HANDLE];
+ set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+ } else {
+ attr = nla[NFTA_SET_NAME];
+ set = nft_set_lookup(ctx.table, attr, genmask);
+ }
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return PTR_ERR(set);
+ }
if (!list_empty(&set->bindings) ||
- (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0))
+ (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
return nft_delset(&ctx, set);
}
@@ -3405,8 +3610,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u64),
},
[NFT_SET_EXT_EXPIRATION] = {
- .len = sizeof(unsigned long),
- .align = __alignof__(unsigned long),
+ .len = sizeof(u64),
+ .align = __alignof__(u64),
},
[NFT_SET_EXT_USERDATA] = {
.len = sizeof(struct nft_userdata),
@@ -3443,16 +3648,19 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
+ struct netlink_ext_ack *extack,
u8 genmask)
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
int family = nfmsg->nfgen_family;
struct nft_table *table;
- table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
return PTR_ERR(table);
+ }
nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
return 0;
@@ -3496,22 +3704,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(
- *nft_set_ext_timeout(ext))),
+ nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- unsigned long expires, now = jiffies;
+ u64 expires, now = get_jiffies_64();
expires = *nft_set_ext_expiration(ext);
- if (time_before(now, expires))
+ if (time_before64(now, expires))
expires -= now;
else
expires = 0;
if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- cpu_to_be64(jiffies_to_msecs(expires)),
+ nf_jiffies64_to_msecs(expires),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
}
@@ -3749,7 +3956,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ext = nft_set_elem_ext(set, &elem);
err = -ENOMEM;
- skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb == NULL)
goto err1;
@@ -3771,6 +3978,7 @@ err1:
return err == -EAGAIN ? -ENOBUFS : err;
}
+/* called with rcu_read_lock held */
static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -3782,12 +3990,12 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
@@ -3795,10 +4003,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_set,
.done = nf_tables_dump_set_done,
+ .module = THIS_MODULE,
};
struct nft_set_dump_ctx *dump_ctx;
- dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+ dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
if (!dump_ctx)
return -ENOMEM;
@@ -3806,7 +4015,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
dump_ctx->ctx = ctx;
c.data = dump_ctx;
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -3886,7 +4095,7 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_data(ext), data, set->dlen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
*nft_set_ext_expiration(ext) =
- jiffies + timeout;
+ get_jiffies_64() + timeout;
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
@@ -3897,12 +4106,24 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+ struct nft_ctx ctx = {
+ .net = read_pnet(&set->net),
+ .family = set->table->family,
+ };
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
+ struct nft_expr *expr = nft_set_ext_expr(ext);
+
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(&ctx, expr);
+ module_put(expr->ops->type->owner);
+ } else {
+ nf_tables_expr_destroy(&ctx, expr);
+ }
+ }
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
(*nft_set_ext_obj(ext))->use--;
kfree(elem);
@@ -3912,12 +4133,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
/* Only called from commit path, nft_set_elem_deactivate() already deals with
* the refcounting from the preparation phase.
*/
-static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set, void *elem)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
kfree(elem);
}
@@ -3973,8 +4195,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- nla[NFTA_SET_ELEM_TIMEOUT])));
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
+ &timeout);
+ if (err)
+ return err;
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}
@@ -3999,8 +4223,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -EINVAL;
goto err2;
}
- obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
- set->objtype, genmask);
+ obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err2;
@@ -4035,6 +4259,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
d2.type, d2.len);
if (err < 0)
goto err3;
+
+ if (d2.type == NFT_DATA_VERDICT &&
+ (data.verdict.code == NFT_GOTO ||
+ data.verdict.code == NFT_JUMP))
+ nft_validate_state_update(ctx->net,
+ NFT_VALIDATE_NEED);
}
nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4134,12 +4364,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
- int rem, err = 0;
+ int rem, err;
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
@@ -4154,9 +4385,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
if (err < 0)
- break;
+ return err;
}
- return err;
+
+ if (net->nft.validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, ctx.table);
+
+ return 0;
}
/**
@@ -4327,12 +4562,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+ genmask);
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
+ set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -4420,13 +4655,13 @@ void nft_unregister_obj(struct nft_object_type *obj_type)
}
EXPORT_SYMBOL_GPL(nft_unregister_obj);
-struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u32 objtype, u8 genmask)
+struct nft_object *nft_obj_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u32 objtype,
+ u8 genmask)
{
struct nft_object *obj;
- list_for_each_entry(obj, &table->objects, list) {
+ list_for_each_entry_rcu(obj, &table->objects, list) {
if (!nla_strcmp(nla, obj->name) &&
objtype == obj->ops->type->type &&
nft_active_genmask(obj, genmask))
@@ -4434,11 +4669,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
}
return ERR_PTR(-ENOENT);
}
-EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+EXPORT_SYMBOL_GPL(nft_obj_lookup);
-static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla,
- u32 objtype, u8 genmask)
+static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
{
struct nft_object *obj;
@@ -4582,22 +4817,25 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
!nla[NFTA_OBJ_DATA])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- if (err != -ENOENT)
+ if (err != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return err;
-
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
-
+ }
return 0;
}
@@ -4632,7 +4870,7 @@ err3:
kfree(obj->name);
err2:
if (obj->ops->destroy)
- obj->ops->destroy(obj);
+ obj->ops->destroy(&ctx, obj);
kfree(obj);
err1:
module_put(type->owner);
@@ -4753,12 +4991,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
{
struct nft_obj_filter *filter;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
if (!filter)
return ERR_PTR(-ENOMEM);
if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+ filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
if (!filter->table) {
kfree(filter);
return ERR_PTR(-ENOMEM);
@@ -4770,6 +5008,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
return filter;
}
+/* called with rcu_read_lock held */
static int nf_tables_getobj(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -4789,6 +5028,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_obj,
.done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_OBJ_TABLE] ||
@@ -4801,24 +5041,27 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
c.data = filter;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
- if (IS_ERR(obj))
+ obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return PTR_ERR(obj);
+ }
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -4837,10 +5080,10 @@ err:
return err;
}
-static void nft_obj_destroy(struct nft_object *obj)
+static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
{
if (obj->ops->destroy)
- obj->ops->destroy(obj);
+ obj->ops->destroy(ctx, obj);
module_put(obj->ops->type->owner);
kfree(obj->name);
@@ -4855,6 +5098,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_object *obj;
struct nft_ctx ctx;
@@ -4864,22 +5108,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
(!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
+ }
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- if (nla[NFTA_OBJ_HANDLE])
- obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
- objtype, genmask);
- else
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
- objtype, genmask);
- if (IS_ERR(obj))
+ if (nla[NFTA_OBJ_HANDLE]) {
+ attr = nla[NFTA_OBJ_HANDLE];
+ obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
+ } else {
+ attr = nla[NFTA_OBJ_NAME];
+ obj = nft_obj_lookup(table, attr, objtype, genmask);
+ }
+
+ if (IS_ERR(obj)) {
+ NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(obj);
- if (obj->use > 0)
+ }
+ if (obj->use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
@@ -4950,24 +5201,23 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
[NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
};
-struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
- const struct nlattr *nla,
- u8 genmask)
+struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
- list_for_each_entry(flowtable, &table->flowtables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
if (!nla_strcmp(nla, flowtable->name) &&
nft_active_genmask(flowtable, genmask))
return flowtable;
}
return ERR_PTR(-ENOENT);
}
-EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
static struct nft_flowtable *
-nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
- const struct nlattr *nla, u8 genmask)
+nft_flowtable_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
@@ -5066,7 +5316,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
flowtable->ops[i].pf = NFPROTO_NETDEV;
flowtable->ops[i].hooknum = hooknum;
flowtable->ops[i].priority = priority;
- flowtable->ops[i].priv = &flowtable->data.rhashtable;
+ flowtable->ops[i].priv = &flowtable->data;
flowtable->ops[i].hook = flowtable->data.type->hook;
flowtable->ops[i].dev = dev_array[i];
flowtable->dev_name[i] = kstrdup(dev_array[i]->name,
@@ -5107,23 +5357,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
return ERR_PTR(-ENOENT);
}
-void nft_flow_table_iterate(struct net *net,
- void (*iter)(struct nf_flowtable *flowtable, void *data),
- void *data)
-{
- struct nft_flowtable *flowtable;
- const struct nft_table *table;
-
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- iter(&flowtable->data, data);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
-
static void nft_unregister_flowtable_net_hooks(struct net *net,
struct nft_flowtable *flowtable)
{
@@ -5157,20 +5390,26 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
!nla[NFTA_FLOWTABLE_HOOK])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ genmask);
if (IS_ERR(flowtable)) {
err = PTR_ERR(flowtable);
- if (err != -ENOENT)
+ if (err != -ENOENT) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return err;
+ }
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return -EEXIST;
+ }
return 0;
}
@@ -5197,14 +5436,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
}
flowtable->data.type = type;
- err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+ err = type->init(&flowtable->data);
if (err < 0)
goto err3;
err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
flowtable);
if (err < 0)
- goto err3;
+ goto err4;
for (i = 0; i < flowtable->ops_len; i++) {
if (!flowtable->ops[i].dev)
@@ -5218,37 +5457,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (flowtable->ops[i].dev == ft->ops[k].dev &&
flowtable->ops[i].pf == ft->ops[k].pf) {
err = -EBUSY;
- goto err4;
+ goto err5;
}
}
}
err = nf_register_net_hook(net, &flowtable->ops[i]);
if (err < 0)
- goto err4;
+ goto err5;
}
err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err5;
-
- INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
- queue_delayed_work(system_power_efficient_wq,
- &flowtable->data.gc_work, HZ);
+ goto err6;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
table->use++;
return 0;
-err5:
+err6:
i = flowtable->ops_len;
-err4:
+err5:
for (k = i - 1; k >= 0; k--) {
kfree(flowtable->dev_name[k]);
nf_unregister_net_hook(net, &flowtable->ops[k]);
}
kfree(flowtable->ops);
+err4:
+ flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
err2:
@@ -5268,6 +5505,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
@@ -5276,23 +5514,29 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
!nla[NFTA_FLOWTABLE_HANDLE]))
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
- if (IS_ERR(table))
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- if (nla[NFTA_FLOWTABLE_HANDLE])
- flowtable = nf_tables_flowtable_lookup_byhandle(table,
- nla[NFTA_FLOWTABLE_HANDLE],
- genmask);
- else
- flowtable = nf_tables_flowtable_lookup(table,
- nla[NFTA_FLOWTABLE_NAME],
- genmask);
- if (IS_ERR(flowtable))
- return PTR_ERR(flowtable);
- if (flowtable->use > 0)
+ if (nla[NFTA_FLOWTABLE_HANDLE]) {
+ attr = nla[NFTA_FLOWTABLE_HANDLE];
+ flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
+ } else {
+ attr = nla[NFTA_FLOWTABLE_NAME];
+ flowtable = nft_flowtable_lookup(table, attr, genmask);
+ }
+
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return PTR_ERR(flowtable);
+ }
+ if (flowtable->use > 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
+ }
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
@@ -5423,13 +5667,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
{
struct nft_flowtable_filter *filter;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
if (!filter)
return ERR_PTR(-ENOMEM);
if (nla[NFTA_FLOWTABLE_TABLE]) {
filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!filter->table) {
kfree(filter);
return ERR_PTR(-ENOMEM);
@@ -5438,6 +5682,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
return filter;
}
+/* called with rcu_read_lock held */
static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -5456,6 +5701,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
struct netlink_dump_control c = {
.dump = nf_tables_dump_flowtable,
.done = nf_tables_dump_flowtable_done,
+ .module = THIS_MODULE,
};
if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5467,23 +5713,23 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
c.data = filter;
}
- return netlink_dump_start(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
if (!nla[NFTA_FLOWTABLE_NAME])
return -EINVAL;
- table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
- family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+ genmask);
if (IS_ERR(table))
return PTR_ERR(table);
- flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
@@ -5532,11 +5778,9 @@ err:
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
- cancel_delayed_work_sync(&flowtable->data.gc_work);
kfree(flowtable->ops);
kfree(flowtable->name);
flowtable->data.type->free(&flowtable->data);
- rhashtable_destroy(&flowtable->data.rhashtable);
module_put(flowtable->data.type->owner);
}
@@ -5649,7 +5893,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
struct sk_buff *skb2;
int err;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
return -ENOMEM;
@@ -5671,7 +5915,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
- .call = nf_tables_gettable,
+ .call_rcu = nf_tables_gettable,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
@@ -5686,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
- .call = nf_tables_getchain,
+ .call_rcu = nf_tables_getchain,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
@@ -5701,7 +5945,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
- .call = nf_tables_getrule,
+ .call_rcu = nf_tables_getrule,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
@@ -5716,7 +5960,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
- .call = nf_tables_getset,
+ .call_rcu = nf_tables_getset,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
@@ -5731,7 +5975,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
- .call = nf_tables_getsetelem,
+ .call_rcu = nf_tables_getsetelem,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
@@ -5741,7 +5985,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
- .call = nf_tables_getgen,
+ .call_rcu = nf_tables_getgen,
},
[NFT_MSG_NEWOBJ] = {
.call_batch = nf_tables_newobj,
@@ -5749,7 +5993,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
- .call = nf_tables_getobj,
+ .call_rcu = nf_tables_getobj,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
@@ -5759,7 +6003,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call = nf_tables_getobj,
+ .call_rcu = nf_tables_getobj,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
@@ -5769,7 +6013,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
- .call = nf_tables_getflowtable,
+ .call_rcu = nf_tables_getflowtable,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
@@ -5780,12 +6024,41 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
},
};
+static int nf_tables_validate(struct net *net)
+{
+ struct nft_table *table;
+
+ switch (net->nft.validate_state) {
+ case NFT_VALIDATE_SKIP:
+ break;
+ case NFT_VALIDATE_NEED:
+ nft_validate_state_update(net, NFT_VALIDATE_DO);
+ /* fall through */
+ case NFT_VALIDATE_DO:
+ list_for_each_entry(table, &net->nft.tables, list) {
+ if (nft_table_validate(net, table) < 0)
+ return -EAGAIN;
+ }
+ break;
+ }
+
+ return 0;
+}
+
static void nft_chain_commit_update(struct nft_trans *trans)
{
struct nft_base_chain *basechain;
- if (nft_trans_chain_name(trans))
+ if (nft_trans_chain_name(trans)) {
+ rhltable_remove(&trans->ctx.table->chains_ht,
+ &trans->ctx.chain->rhlhead,
+ nft_chain_ht_params);
swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
+ rhltable_insert_key(&trans->ctx.table->chains_ht,
+ trans->ctx.chain->name,
+ &trans->ctx.chain->rhlhead,
+ nft_chain_ht_params);
+ }
if (!nft_is_base_chain(trans->ctx.chain))
return;
@@ -5817,11 +6090,12 @@ static void nft_commit_release(struct nft_trans *trans)
nft_set_destroy(nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+ nf_tables_set_elem_destroy(&trans->ctx,
+ nft_trans_elem_set(trans),
nft_trans_elem(trans).priv);
break;
case NFT_MSG_DELOBJ:
- nft_obj_destroy(nft_trans_obj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -5845,21 +6119,175 @@ static void nf_tables_commit_release(struct net *net)
}
}
+static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
+{
+ struct nft_rule *rule;
+ unsigned int alloc = 0;
+ int i;
+
+ /* already handled or inactive chain? */
+ if (chain->rules_next || !nft_is_active_next(net, chain))
+ return 0;
+
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+ i = 0;
+
+ list_for_each_entry_continue(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule))
+ alloc++;
+ }
+
+ chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
+ if (!chain->rules_next)
+ return -ENOMEM;
+
+ list_for_each_entry_continue(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule))
+ chain->rules_next[i++] = rule;
+ }
+
+ chain->rules_next[i] = NULL;
+ return 0;
+}
+
+static void nf_tables_commit_chain_prepare_cancel(struct net *net)
+{
+ struct nft_trans *trans, *next;
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ struct nft_chain *chain = trans->ctx.chain;
+
+ if (trans->msg_type == NFT_MSG_NEWRULE ||
+ trans->msg_type == NFT_MSG_DELRULE) {
+ kvfree(chain->rules_next);
+ chain->rules_next = NULL;
+ }
+ }
+}
+
+static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+{
+ struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+
+ kvfree(o->start);
+}
+
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+{
+ struct nft_rule **r = rules;
+ struct nft_rules_old *old;
+
+ while (*r)
+ r++;
+
+ r++; /* rcu_head is after end marker */
+ old = (void *) r;
+ old->start = rules;
+
+ call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+}
+
+static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+{
+ struct nft_rule **g0, **g1;
+ bool next_genbit;
+
+ next_genbit = nft_gencursor_next(net);
+
+ g0 = rcu_dereference_protected(chain->rules_gen_0,
+ lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ g1 = rcu_dereference_protected(chain->rules_gen_1,
+ lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+ /* No changes to this chain? */
+ if (chain->rules_next == NULL) {
+ /* chain had no change in last or next generation */
+ if (g0 == g1)
+ return;
+ /*
+ * chain had no change in this generation; make sure next
+ * one uses same rules as current generation.
+ */
+ if (next_genbit) {
+ rcu_assign_pointer(chain->rules_gen_1, g0);
+ nf_tables_commit_chain_free_rules_old(g1);
+ } else {
+ rcu_assign_pointer(chain->rules_gen_0, g1);
+ nf_tables_commit_chain_free_rules_old(g0);
+ }
+
+ return;
+ }
+
+ if (next_genbit)
+ rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ else
+ rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+
+ chain->rules_next = NULL;
+
+ if (g0 == g1)
+ return;
+
+ if (next_genbit)
+ nf_tables_commit_chain_free_rules_old(g1);
+ else
+ nf_tables_commit_chain_free_rules_old(g0);
+}
+
+static void nft_chain_del(struct nft_chain *chain)
+{
+ struct nft_table *table = chain->table;
+
+ WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead,
+ nft_chain_ht_params));
+ list_del_rcu(&chain->list);
+}
+
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
+ struct nft_chain *chain;
+ struct nft_table *table;
- /* Bump generation counter, invalidate any dump in progress */
- while (++net->nft.base_seq == 0);
+ /* 0. Validate ruleset, otherwise roll back for error reporting. */
+ if (nf_tables_validate(net) < 0)
+ return -EAGAIN;
- /* A new generation has just started */
- net->nft.gencursor = nft_gencursor_next(net);
+ /* 1. Allocate space for next generation rules_gen_X[] */
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ int ret;
+
+ if (trans->msg_type == NFT_MSG_NEWRULE ||
+ trans->msg_type == NFT_MSG_DELRULE) {
+ chain = trans->ctx.chain;
+
+ ret = nf_tables_commit_chain_prepare(net, chain);
+ if (ret < 0) {
+ nf_tables_commit_chain_prepare_cancel(net);
+ return ret;
+ }
+ }
+ }
+
+ /* 2. Make rules_gen_X visible to packet path */
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_active_next(net, chain))
+ continue;
+ nf_tables_commit_chain_active(net, chain);
+ }
+ }
- /* Make sure all packets have left the previous generation before
- * purging old rules.
+ /*
+ * Bump generation counter, invalidate any dump in progress.
+ * Cannot fail after this point.
*/
- synchronize_rcu();
+ while (++net->nft.base_seq == 0);
+
+ /* 3. Start new generation, rules_gen_X now in use. */
+ net->nft.gencursor = nft_gencursor_next(net);
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
switch (trans->msg_type) {
@@ -5890,7 +6318,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
case NFT_MSG_DELCHAIN:
- list_del_rcu(&trans->ctx.chain->list);
+ nft_chain_del(trans->ctx.chain);
nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
nf_tables_unregister_hook(trans->ctx.net,
trans->ctx.table,
@@ -6001,7 +6429,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nft_trans_elem(trans).priv, true);
break;
case NFT_MSG_NEWOBJ:
- nft_obj_destroy(nft_trans_obj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -6041,7 +6469,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
} else {
trans->ctx.table->use--;
- list_del_rcu(&trans->ctx.chain->list);
+ nft_chain_del(trans->ctx.chain);
nf_tables_unregister_hook(trans->ctx.net,
trans->ctx.table,
trans->ctx.chain);
@@ -6121,6 +6549,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
return 0;
}
+static void nf_tables_cleanup(struct net *net)
+{
+ nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
return net->nft.base_seq == genid;
@@ -6133,6 +6566,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
+ .cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
};
@@ -6216,19 +6650,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
list_for_each_entry(rule, &chain->rules, list) {
nft_rule_for_each_expr(expr, last, rule) {
- const struct nft_data *data = NULL;
+ struct nft_immediate_expr *priv;
+ const struct nft_data *data;
int err;
- if (!expr->ops->validate)
+ if (strcmp(expr->ops->type->name, "immediate"))
continue;
- err = expr->ops->validate(ctx, expr, &data);
- if (err < 0)
- return err;
-
- if (data == NULL)
+ priv = nft_expr_priv(expr);
+ if (priv->dreg != NFT_REG_VERDICT)
continue;
+ data = &priv->data;
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
@@ -6461,8 +6894,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
case NFT_GOTO:
if (!tb[NFTA_VERDICT_CHAIN])
return -EINVAL;
- chain = nf_tables_chain_lookup(ctx->table,
- tb[NFTA_VERDICT_CHAIN], genmask);
+ chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
+ genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
@@ -6638,7 +7071,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
ctx->chain->use--;
nf_tables_rule_release(ctx, rule);
}
- list_del(&ctx->chain->list);
+ nft_chain_del(ctx->chain);
ctx->table->use--;
nf_tables_chain_destroy(ctx);
@@ -6690,11 +7123,11 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry_safe(obj, ne, &table->objects, list) {
list_del(&obj->list);
table->use--;
- nft_obj_destroy(obj);
+ nft_obj_destroy(&ctx, obj);
}
list_for_each_entry_safe(chain, nc, &table->chains, list) {
ctx.chain = chain;
- list_del(&chain->list);
+ nft_chain_del(chain);
table->use--;
nf_tables_chain_destroy(&ctx);
}
@@ -6708,6 +7141,8 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
net->nft.base_seq = 1;
+ net->nft.validate_state = NFT_VALIDATE_SKIP;
+
return 0;
}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 40e744572283..deff10adef9c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -23,25 +23,9 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
-static const char *const comments[__NFT_TRACETYPE_MAX] = {
- [NFT_TRACETYPE_POLICY] = "policy",
- [NFT_TRACETYPE_RETURN] = "return",
- [NFT_TRACETYPE_RULE] = "rule",
-};
-
-static const struct nf_loginfo trace_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_WARNING,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
- int rulenum, enum nft_trace_types type)
+ enum nft_trace_types type)
{
const struct nft_pktinfo *pkt = info->pkt;
@@ -52,22 +36,16 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
info->type = type;
nft_trace_notify(info);
-
- nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
- nft_in(pkt), nft_out(pkt), &trace_loginfo,
- "TRACE: %s:%s:%s:%u ",
- chain->table->name, chain->name, comments[type], rulenum);
}
static inline void nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
const struct nft_rule *rule,
- int rulenum,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
info->rule = rule;
- __nft_trace_packet(info, chain, rulenum, type);
+ __nft_trace_packet(info, chain, type);
}
}
@@ -139,8 +117,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
struct nft_jumpstack {
const struct nft_chain *chain;
- const struct nft_rule *rule;
- int rulenum;
+ struct nft_rule *const *rules;
};
unsigned int
@@ -148,31 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct net *net = nft_net(pkt);
+ struct nft_rule *const *rules;
const struct nft_rule *rule;
const struct nft_expr *expr, *last;
struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
- int rulenum;
- unsigned int gencursor = nft_genmask_cur(net);
+ bool genbit = READ_ONCE(net->nft.gencursor);
struct nft_traceinfo info;
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
- rulenum = 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
+ if (genbit)
+ rules = rcu_dereference(chain->rules_gen_1);
+ else
+ rules = rcu_dereference(chain->rules_gen_0);
+
next_rule:
+ rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
-
- /* This rule is not active, skip. */
- if (unlikely(rule->genmask & gencursor))
- continue;
-
- rulenum++;
-
+ for (; *rules ; rules++) {
+ rule = *rules;
nft_rule_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
@@ -190,7 +165,7 @@ next_rule:
continue;
case NFT_CONTINUE:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
continue;
}
break;
@@ -202,7 +177,7 @@ next_rule:
case NF_QUEUE:
case NF_STOLEN:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
return regs.verdict.code;
}
@@ -210,22 +185,20 @@ next_rule:
case NFT_JUMP:
BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rule = rule;
- jumpstack[stackptr].rulenum = rulenum;
+ jumpstack[stackptr].rules = rules + 1;
stackptr++;
/* fall through */
case NFT_GOTO:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RULE);
+ NFT_TRACETYPE_RULE);
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
- rulenum++;
/* fall through */
case NFT_RETURN:
nft_trace_packet(&info, chain, rule,
- rulenum, NFT_TRACETYPE_RETURN);
+ NFT_TRACETYPE_RETURN);
break;
default:
WARN_ON(1);
@@ -234,13 +207,11 @@ next_rule:
if (stackptr > 0) {
stackptr--;
chain = jumpstack[stackptr].chain;
- rule = jumpstack[stackptr].rule;
- rulenum = jumpstack[stackptr].rulenum;
+ rules = jumpstack[stackptr].rules;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, -1,
- NFT_TRACETYPE_POLICY);
+ nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
@@ -258,6 +229,9 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_payload_type,
&nft_dynset_type,
&nft_range_type,
+ &nft_meta_type,
+ &nft_rt_type,
+ &nft_exthdr_type,
};
int __init nf_tables_core_module_init(void)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..4d0da7042aff 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -25,6 +25,7 @@
#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
+#include <linux/sched/signal.h>
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
@@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
rcu_dereference_protected(table[(id)].subsys, \
lockdep_nfnl_is_held((id)))
+#define NFNL_MAX_ATTR_COUNT 32
+
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
@@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
+ u8 cb_id;
+
+ /* Sanity-check attr_count size to avoid stack buffer overflow. */
+ for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+ if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+ return -EINVAL;
+
nfnl_lock(n->subsys_id);
if (table[n->subsys_id].subsys) {
nfnl_unlock(n->subsys_id);
@@ -185,11 +195,17 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
- struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
+ /* Sanity-check NFNL_MAX_ATTR_COUNT */
+ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
ss->cb[cb_id].policy, extack);
if (err < 0) {
@@ -330,6 +346,13 @@ replay:
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
+ if (fatal_signal_pending(current)) {
+ nfnl_err_reset(&err_list);
+ err = -EINTR;
+ status = NFNL_BATCH_FAILURE;
+ goto done;
+ }
+
memset(&extack, 0, sizeof(extack));
nlh = nlmsg_hdr(skb);
err = 0;
@@ -379,10 +402,16 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
- struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
+ /* Sanity-check NFNL_MAX_ATTR_COUNT */
+ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+ err = -ENOMEM;
+ goto ack;
+ }
+
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
attrlen, ss->cb[cb_id].policy, NULL);
if (err < 0)
@@ -441,10 +470,19 @@ done:
kfree_skb(skb);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
- ss->commit(net, oskb);
+ err = ss->commit(net, oskb);
+ if (err == -EAGAIN) {
+ status |= NFNL_BATCH_REPLAY;
+ goto done;
+ } else if (err) {
+ ss->abort(net, oskb);
+ netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+ }
} else {
ss->abort(net, oskb);
}
+ if (ss->cleanup)
+ ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
nfnl_unlock(subsys_id);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index c14822b9729f..332c69d27b47 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -37,7 +37,6 @@
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
#include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_log.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
@@ -47,6 +46,7 @@
#include "../bridge/br_private.h"
#endif
+#define NFULNL_COPY_DISABLED 0xff
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
@@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* log handler for internal netfilter logging api */
-void
+static void
nfulnl_log_packet(struct net *net,
u_int8_t pf,
unsigned int hooknum,
@@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net,
struct nfulnl_instance *inst;
const struct nf_loginfo *li;
unsigned int qthreshold;
- unsigned int plen;
+ unsigned int plen = 0;
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
struct nf_conn *ct = NULL;
@@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net,
if (!inst)
return;
- plen = 0;
if (prefix)
plen = strlen(prefix) + 1;
@@ -760,7 +759,6 @@ alloc_failure:
/* FIXME: statistics */
goto unlock_and_release;
}
-EXPORT_SYMBOL_GPL(nfulnl_log_packet);
static int
nfulnl_rcv_nl_event(struct notifier_block *this,
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 494a9ab35cb6..4ccd2988f9db 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -227,6 +227,25 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
return entry;
}
+static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ struct nf_ct_hook *ct_hook;
+ int err;
+
+ if (verdict == NF_ACCEPT ||
+ verdict == NF_STOP) {
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook) {
+ err = ct_hook->update(entry->state.net, entry->skb);
+ if (err < 0)
+ verdict = NF_DROP;
+ }
+ rcu_read_unlock();
+ }
+ nf_reinject(entry, verdict);
+}
+
static void
nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
{
@@ -237,7 +256,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
if (!cmpfn || cmpfn(entry, data)) {
list_del(&entry->list);
queue->queue_total--;
- nf_reinject(entry, NF_DROP);
+ nfqnl_reinject(entry, NF_DROP);
}
}
spin_unlock_bh(&queue->lock);
@@ -686,7 +705,7 @@ err_out_free_nskb:
err_out_unlock:
spin_unlock_bh(&queue->lock);
if (failopen)
- nf_reinject(entry, NF_ACCEPT);
+ nfqnl_reinject(entry, NF_ACCEPT);
err_out:
return err;
}
@@ -1085,7 +1104,8 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
list_for_each_entry_safe(entry, tmp, &batch_list, list) {
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
- nf_reinject(entry, verdict);
+
+ nfqnl_reinject(entry, verdict);
}
return 0;
}
@@ -1208,7 +1228,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
- nf_reinject(entry, verdict);
+ nfqnl_reinject(entry, verdict);
return 0;
}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 1d99a1efdafc..8d1ff654e5af 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -611,10 +611,10 @@ nla_put_failure:
return -1;
}
-static int nfnl_compat_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[],
+ struct netlink_ext_ack *extack)
{
int ret = 0, target;
struct nfgenmsg *nfmsg;
@@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
return -EINVAL;
}
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
rev, target, &ret),
fmt, name);
-
if (ret < 0)
- return ret;
+ goto out_put;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL)
- return -ENOMEM;
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
/* include the best revision for this extension in the message */
if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
@@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
nfmsg->nfgen_family,
name, ret, target) <= 0) {
kfree_skb(skb2);
- return -ENOSPC;
+ goto out_put;
}
ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
MSG_DONTWAIT);
if (ret > 0)
ret = 0;
-
+out_put:
+ rcu_read_lock();
+ module_put(THIS_MODULE);
return ret == -EAGAIN ? -ENOBUFS : ret;
}
@@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
};
static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
- [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get,
+ [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
.attr_count = NFTA_COMPAT_MAX,
.policy = nfnl_compat_policy_get },
};
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
new file mode 100644
index 000000000000..50c068d660e5
--- /dev/null
+++ b/net/netfilter/nft_connlimit.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* Private state shared by the connlimit expression and stateful object. */
+struct nft_connlimit {
+	spinlock_t		lock;	/* held around every access to hhead */
+	struct hlist_head	hhead;	/* list passed to the nf_conncount helpers */
+	u32			limit;	/* value of NFTA_CONNLIMIT_COUNT */
+	bool			invert;	/* true when NFT_CONNLIMIT_F_INV was set */
+};
+
+/*
+ * Account the packet's connection in priv->hhead and set the verdict to
+ * NFT_BREAK when "count > limit" (xor-ed with the invert flag) holds.
+ * The verdict becomes NF_DROP when no tuple can be obtained for the packet
+ * or when nf_conncount_add() fails.  @ext is accepted but not used.
+ */
+static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
+					 struct nft_regs *regs,
+					 const struct nft_pktinfo *pkt,
+					 const struct nft_set_ext *ext)
+{
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+	const struct nf_conntrack_tuple *key;
+	struct nf_conntrack_tuple pkt_tuple;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int nconns;
+	bool need_add;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (ct) {
+		/* skb carries a conntrack: use its original tuple and zone */
+		key = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		zone = nf_ct_zone(ct);
+	} else {
+		/* no conntrack attached: derive the tuple from the packet */
+		if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
+				       nft_pf(pkt), nft_net(pkt), &pkt_tuple)) {
+			regs->verdict.code = NF_DROP;
+			return;
+		}
+		key = &pkt_tuple;
+	}
+
+	spin_lock_bh(&priv->lock);
+	nconns = nf_conncount_lookup(nft_net(pkt), &priv->hhead, key, zone,
+				     &need_add);
+	if (need_add) {
+		if (!nf_conncount_add(&priv->hhead, key)) {
+			regs->verdict.code = NF_DROP;
+			spin_unlock_bh(&priv->lock);
+			return;
+		}
+		/* the entry just added counts as well */
+		nconns++;
+	}
+	spin_unlock_bh(&priv->lock);
+
+	if ((nconns > priv->limit) ^ priv->invert)
+		regs->verdict.code = NFT_BREAK;
+}
+
+/*
+ * Parse NFTA_CONNLIMIT_COUNT (mandatory) and NFTA_CONNLIMIT_FLAGS (optional)
+ * into @priv, then take a conntrack netns reference for ctx->family.
+ * Returns -EINVAL without a count, -EOPNOTSUPP for unknown flag bits,
+ * otherwise the result of nf_ct_netns_get().
+ */
+static int nft_connlimit_do_init(const struct nft_ctx *ctx,
+				 const struct nlattr * const tb[],
+				 struct nft_connlimit *priv)
+{
+	const struct nlattr *count_attr = tb[NFTA_CONNLIMIT_COUNT];
+	const struct nlattr *flags_attr = tb[NFTA_CONNLIMIT_FLAGS];
+	bool invert = false;
+	u32 limit;
+
+	if (!count_attr)
+		return -EINVAL;
+	limit = ntohl(nla_get_be32(count_attr));
+
+	if (flags_attr) {
+		u32 flags = ntohl(nla_get_be32(flags_attr));
+
+		/* NFT_CONNLIMIT_F_INV is the only flag understood here */
+		if (flags & ~NFT_CONNLIMIT_F_INV)
+			return -EOPNOTSUPP;
+		invert = !!(flags & NFT_CONNLIMIT_F_INV);
+	}
+
+	priv->limit = limit;
+	priv->invert = invert;
+	INIT_HLIST_HEAD(&priv->hhead);
+	spin_lock_init(&priv->lock);
+
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+/* Drop the conntrack netns reference and free everything left on hhead. */
+static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
+				     struct nft_connlimit *priv)
+{
+	struct hlist_head *list = &priv->hhead;
+
+	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_conncount_cache_free(list);
+}
+
+/*
+ * Emit the configured limit and, only when inversion is enabled, the
+ * NFT_CONNLIMIT_F_INV flag.  Returns 0 on success, -1 if an attribute
+ * could not be added to @skb.
+ */
+static int nft_connlimit_do_dump(struct sk_buff *skb,
+				 struct nft_connlimit *priv)
+{
+	if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit)))
+		return -1;
+
+	if (!priv->invert)
+		return 0;
+
+	if (nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV)))
+		return -1;
+
+	return 0;
+}
+
+/* Object ->eval hook: run the shared evaluation on the object's data. */
+static inline void nft_connlimit_obj_eval(struct nft_object *obj,
+					  struct nft_regs *regs,
+					  const struct nft_pktinfo *pkt)
+{
+	nft_connlimit_do_eval(nft_obj_data(obj), regs, pkt, NULL);
+}
+
+/* Object ->init hook: parse the attributes into the object's data. */
+static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
+				  const struct nlattr * const tb[],
+				  struct nft_object *obj)
+{
+	return nft_connlimit_do_init(ctx, tb, nft_obj_data(obj));
+}
+
+/* Object ->destroy hook: release what nft_connlimit_obj_init() set up. */
+static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
+				      struct nft_object *obj)
+{
+	nft_connlimit_do_destroy(ctx, nft_obj_data(obj));
+}
+
+/* Object ->dump hook; @reset is ignored, there is nothing to reset here. */
+static int nft_connlimit_obj_dump(struct sk_buff *skb,
+				  struct nft_object *obj, bool reset)
+{
+	return nft_connlimit_do_dump(skb, nft_obj_data(obj));
+}
+
+/* Netlink attribute policy shared by the expression and the object type. */
+static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = {
+	[NFTA_CONNLIMIT_COUNT]	= { .type = NLA_U32 },	/* mandatory limit */
+	[NFTA_CONNLIMIT_FLAGS]	= { .type = NLA_U32 },	/* optional flags */
+};
+
+/* Forward declaration: the ops below point back at the object type. */
+static struct nft_object_type nft_connlimit_obj_type;
+
+/* Operations of the connlimit stateful object. */
+static const struct nft_object_ops nft_connlimit_obj_ops = {
+	.type		= &nft_connlimit_obj_type,
+	.size		= sizeof(struct nft_connlimit),
+	.init		= nft_connlimit_obj_init,
+	.destroy	= nft_connlimit_obj_destroy,
+	.eval		= nft_connlimit_obj_eval,
+	.dump		= nft_connlimit_obj_dump,
+};
+
+/* Object type registered under NFT_OBJECT_CONNLIMIT. */
+static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_CONNLIMIT,
+	.ops		= &nft_connlimit_obj_ops,
+	.policy		= nft_connlimit_policy,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.owner		= THIS_MODULE,
+};
+
+/* Expression ->eval hook: run the shared evaluation on the private area. */
+static void nft_connlimit_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	nft_connlimit_do_eval(nft_expr_priv(expr), regs, pkt, NULL);
+}
+
+/* Expression ->dump hook. */
+static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	return nft_connlimit_do_dump(skb, nft_expr_priv(expr));
+}
+
+/* Expression ->init hook: parse the attributes into the private area. */
+static int nft_connlimit_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	return nft_connlimit_do_init(ctx, tb, nft_expr_priv(expr));
+}
+
+/* Expression ->destroy hook: release what nft_connlimit_init() set up. */
+static void nft_connlimit_destroy(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
+{
+	nft_connlimit_do_destroy(ctx, nft_expr_priv(expr));
+}
+
+/*
+ * Expression ->clone hook: copy the configuration (limit, invert) from @src
+ * and start @dst with an empty list and a fresh lock.  Always returns 0.
+ */
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	struct nft_connlimit *from = nft_expr_priv(src);
+	struct nft_connlimit *to = nft_expr_priv(dst);
+
+	to->limit = from->limit;
+	to->invert = from->invert;
+	INIT_HLIST_HEAD(&to->hhead);
+	spin_lock_init(&to->lock);
+
+	return 0;
+}
+
+/*
+ * Expression ->destroy_clone hook: only the list is freed; unlike
+ * nft_connlimit_do_destroy() no conntrack netns reference is dropped.
+ */
+static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
+					const struct nft_expr *expr)
+{
+	struct nft_connlimit *clone = nft_expr_priv(expr);
+
+	nf_conncount_cache_free(&clone->hhead);
+}
+
+/*
+ * Expression ->gc hook: run a lookup with a NULL tuple over the list and
+ * report whether the list is empty afterwards.
+ */
+static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+	bool unused_addit;
+	bool empty;
+
+	spin_lock_bh(&priv->lock);
+	/* the returned count and the "add it" hint are not needed here */
+	nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt,
+			    &unused_addit);
+	empty = hlist_empty(&priv->hhead);
+	spin_unlock_bh(&priv->lock);
+
+	return empty;
+}
+
+/* Forward declaration: the ops below point back at the expression type. */
+static struct nft_expr_type nft_connlimit_type;
+
+/* Operations of the connlimit expression. */
+static const struct nft_expr_ops nft_connlimit_ops = {
+	.type		= &nft_connlimit_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_connlimit)),
+	.init		= nft_connlimit_init,
+	.destroy	= nft_connlimit_destroy,
+	.clone		= nft_connlimit_clone,
+	.destroy_clone	= nft_connlimit_destroy_clone,
+	.eval		= nft_connlimit_eval,
+	.gc		= nft_connlimit_gc,
+	.dump		= nft_connlimit_dump,
+};
+
+/* Expression type "connlimit"; flagged as stateful and gc-capable. */
+static struct nft_expr_type nft_connlimit_type __read_mostly = {
+	.name		= "connlimit",
+	.ops		= &nft_connlimit_ops,
+	.flags		= NFT_EXPR_STATEFUL | NFT_EXPR_GC,
+	.policy		= nft_connlimit_policy,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Register the object type first, then the expression type; if the second
+ * registration fails the first one is undone before the error is returned.
+ */
+static int __init nft_connlimit_module_init(void)
+{
+	int ret;
+
+	ret = nft_register_obj(&nft_connlimit_obj_type);
+	if (ret < 0)
+		return ret;
+
+	ret = nft_register_expr(&nft_connlimit_type);
+	if (ret < 0) {
+		nft_unregister_obj(&nft_connlimit_obj_type);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Unregister in the reverse order of nft_connlimit_module_init(). */
+static void __exit nft_connlimit_module_exit(void)
+{
+	nft_unregister_expr(&nft_connlimit_type);
+	nft_unregister_obj(&nft_connlimit_obj_type);
+}
+
+module_init(nft_connlimit_module_init);
+module_exit(nft_connlimit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso");
+MODULE_ALIAS_NFT_EXPR("connlimit");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index eefe3b409925..a61d7edfc290 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -96,7 +96,8 @@ static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
free_percpu(priv->counter);
}
-static void nft_counter_obj_destroy(struct nft_object *obj)
+static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
{
struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
@@ -257,6 +258,7 @@ static const struct nft_expr_ops nft_counter_ops = {
.eval = nft_counter_eval,
.init = nft_counter_init,
.destroy = nft_counter_destroy,
+ .destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
};
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 5c0de704bad5..1435ffc5f57e 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,7 +826,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
return 0;
}
-static void nft_ct_helper_obj_destroy(struct nft_object *obj)
+static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
{
struct nft_ct_helper_obj *priv = nft_obj_data(obj);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 04863fad05dd..4d49529cff61 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
u64 timeout;
void *elem;
- if (set->size && !atomic_add_unless(&set->nelems, 1, set->size))
+ if (!atomic_add_unless(&set->nelems, 1, set->size))
return NULL;
timeout = priv->timeout ? : set->timeout;
@@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
timeout = priv->timeout ? : set->timeout;
- *nft_set_ext_expiration(ext) = jiffies + timeout;
+ *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
}
if (sexpr != NULL)
@@ -195,6 +195,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
err = -EOPNOTSUPP;
if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
goto err1;
+
+ if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
+ if (set->flags & NFT_SET_TIMEOUT)
+ goto err1;
+ if (!set->ops->gc_init)
+ goto err1;
+ set->ops->gc_init(set);
+ }
+
} else if (set->flags & NFT_SET_EVAL)
return -EINVAL;
@@ -216,6 +225,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (err < 0)
goto err1;
+ if (set->size == 0)
+ set->size = 0xffff;
+
priv->set = set;
return 0;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47ec1046ad11..a940c9fd9045 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -10,11 +10,10 @@
#include <asm/unaligned.h>
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/tcp.h>
@@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static struct nft_expr_type nft_exthdr_type;
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EOPNOTSUPP);
}
-static struct nft_expr_type nft_exthdr_type __read_mostly = {
+struct nft_expr_type nft_exthdr_type __read_mostly = {
.name = "exthdr",
.select_ops = nft_exthdr_select_ops,
.policy = nft_exthdr_policy,
.maxattr = NFTA_EXTHDR_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_exthdr_module_init(void)
-{
- return nft_register_expr(&nft_exthdr_type);
-}
-
-static void __exit nft_exthdr_module_exit(void)
-{
- nft_unregister_expr(&nft_exthdr_type);
-}
-
-module_init(nft_exthdr_module_init);
-module_exit(nft_exthdr_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b65829b2be22..d6bab8c3cbb0 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
if (!tb[NFTA_FLOW_TABLE_NAME])
return -EINVAL;
- flowtable = nf_tables_flowtable_lookup(ctx->table,
- tb[NFTA_FLOW_TABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
+ genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
struct nft_fwd_netdev {
enum nft_registers sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
[NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
+ [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
+ [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
};
static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
return -1;
}
+/* Private data of the "fwd" expression variant that transmits via a neighbour. */
+struct nft_fwd_neigh {
+	enum nft_registers	sreg_dev:8;	/* register holding the output ifindex */
+	enum nft_registers	sreg_addr:8;	/* register holding the neighbour address */
+	u8			nfproto;	/* NFPROTO_IPV4 or NFPROTO_IPV6 */
+};
+
+/*
+ * Decrement TTL / hop limit and hand the packet to neigh_xmit() on the
+ * device whose ifindex is in sreg_dev, using the address in sreg_addr.
+ *
+ * Verdicts: NF_STOLEN once the packet was passed to neigh_xmit(), NFT_BREAK
+ * when skb->protocol does not match the configured family (or the family is
+ * unknown), NF_DROP when the header cannot be made writable.
+ */
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	void *nexthop = &regs->data[priv->sreg_addr];
+	int ifindex = regs->data[priv->sreg_dev];
+	struct sk_buff *skb = pkt->skb;
+	unsigned int verdict = NF_STOLEN;
+	struct net_device *outdev;
+	int tbl;
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4:
+		if (skb->protocol != htons(ETH_P_IP)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(struct iphdr))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		ip_decrease_ttl(ip_hdr(skb));
+		tbl = NEIGH_ARP_TABLE;
+		break;
+	case NFPROTO_IPV6:
+		if (skb->protocol != htons(ETH_P_IPV6)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(struct ipv6hdr))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		ipv6_hdr(skb)->hop_limit--;
+		tbl = NEIGH_ND_TABLE;
+		break;
+	default:
+		verdict = NFT_BREAK;
+		goto out;
+	}
+
+	outdev = dev_get_by_index_rcu(nft_net(pkt), ifindex);
+	if (!outdev)
+		return;	/* no such device: the verdict register is not written */
+
+	skb->dev = outdev;
+	neigh_xmit(tbl, outdev, nexthop, skb);
+out:
+	regs->verdict.code = verdict;
+}
+
+/*
+ * Parse the three mandatory attributes (device register, address register,
+ * nfproto) and validate both register loads.  The address length checked
+ * depends on the family: in_addr for IPv4, in6_addr for IPv6.
+ * Returns -EINVAL when an attribute is missing and -EOPNOTSUPP for any
+ * other family.
+ */
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	unsigned int alen;
+	int ret;
+
+	if (!tb[NFTA_FWD_SREG_DEV])
+		return -EINVAL;
+	if (!tb[NFTA_FWD_SREG_ADDR])
+		return -EINVAL;
+	if (!tb[NFTA_FWD_NFPROTO])
+		return -EINVAL;
+
+	priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+	priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+	priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+	if (priv->nfproto == NFPROTO_IPV4)
+		alen = sizeof(struct in_addr);
+	else if (priv->nfproto == NFPROTO_IPV6)
+		alen = sizeof(struct in6_addr);
+	else
+		return -EOPNOTSUPP;
+
+	ret = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	if (ret < 0)
+		return ret;
+
+	return nft_validate_register_load(priv->sreg_addr, alen);
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+/*
+ * Dump both source registers and the configured nfproto.
+ * Returns 0 on success, -1 as soon as one attribute cannot be added.
+ */
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev))
+		return -1;
+	if (nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr))
+		return -1;
+	if (nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+		return -1;
+
+	return 0;
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
+/* Ops selected by nft_fwd_select_ops() when NFTA_FWD_SREG_ADDR is present. */
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+	.type		= &nft_fwd_netdev_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+	.init		= nft_fwd_neigh_init,
+	.eval		= nft_fwd_neigh_eval,
+	.dump		= nft_fwd_neigh_dump,
+};
+
+
static const struct nft_expr_ops nft_fwd_netdev_ops = {
.type = &nft_fwd_netdev_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.dump = nft_fwd_netdev_dump,
};
+/*
+ * Choose the ops from the attributes supplied: an address register selects
+ * the neighbour variant, a device register alone the plain forward variant.
+ * Returns ERR_PTR(-EOPNOTSUPP) when neither attribute is present.
+ */
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+		   const struct nlattr * const tb[])
+{
+	if (tb[NFTA_FWD_SREG_ADDR])
+		return &nft_fwd_neigh_netdev_ops;
+
+	return tb[NFTA_FWD_SREG_DEV] ? &nft_fwd_netdev_ops
+				     : ERR_PTR(-EOPNOTSUPP);
+}
+
static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
.family = NFPROTO_NETDEV,
.name = "fwd",
- .ops = &nft_fwd_netdev_ops,
+ .select_ops = nft_fwd_select_ops,
.policy = nft_fwd_netdev_policy,
.maxattr = NFTA_FWD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 24f2f7567ddb..c2d237144f74 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,6 +25,7 @@ struct nft_jhash {
u32 modulus;
u32 seed;
u32 offset;
+ struct nft_set *map;
};
static void nft_jhash_eval(const struct nft_expr *expr,
@@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr,
const void *data = &regs->data[priv->sreg];
u32 h;
- h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+ h = reciprocal_scale(jhash(data, priv->len, priv->seed),
+ priv->modulus);
+
regs->data[priv->dreg] = h + priv->offset;
}
+/*
+ * Hash the source register data with jhash, scale it into
+ * [offset, offset + modulus) and use the result as key for a lookup in
+ * priv->map.  On a hit the element data is copied to the destination
+ * register; on a miss the function returns without touching it.
+ */
+static void nft_jhash_map_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	const struct nft_set *map = priv->map;
+	const void *src = &regs->data[priv->sreg];
+	const struct nft_set_ext *ext;
+	u32 key;
+
+	key = reciprocal_scale(jhash(src, priv->len, priv->seed),
+			       priv->modulus);
+	key += priv->offset;
+
+	if (!map->ops->lookup(nft_net(pkt), map, &key, &ext))
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
struct nft_symhash {
enum nft_registers dreg:8;
u32 modulus;
u32 offset;
+ struct nft_set *map;
};
static void nft_symhash_eval(const struct nft_expr *expr,
@@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = h + priv->offset;
}
+/*
+ * Scale the skb's symmetric hash into [offset, offset + modulus) and use
+ * the result as key for a lookup in priv->map.  On a hit the element data
+ * is copied to the destination register; on a miss the function returns
+ * without touching it.
+ */
+static void nft_symhash_map_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_symhash *priv = nft_expr_priv(expr);
+	const struct nft_set *map = priv->map;
+	struct sk_buff *skb = pkt->skb;
+	const struct nft_set_ext *ext;
+	u32 key;
+
+	key = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus);
+	key += priv->offset;
+
+	if (!map->ops->lookup(nft_net(pkt), map, &key, &ext))
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG] = { .type = NLA_U32 },
[NFTA_HASH_DREG] = { .type = NLA_U32 },
@@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SEED] = { .type = NLA_U32 },
[NFTA_HASH_OFFSET] = { .type = NLA_U32 },
[NFTA_HASH_TYPE] = { .type = NLA_U32 },
+ [NFTA_HASH_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_HASH_SET_ID] = { .type = NLA_U32 },
};
static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -97,7 +148,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
- if (priv->modulus <= 1)
+ if (priv->modulus < 1)
return -ERANGE;
if (priv->offset + priv->modulus - 1 < priv->offset)
@@ -115,6 +166,20 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise a jhash expression that maps its result through a set.
+ *
+ * The common jhash attributes are parsed by nft_jhash_init(); its error is
+ * propagated so that an expression with rejected attributes is never set
+ * up.  The map is then resolved from NFTA_HASH_SET_NAME / NFTA_HASH_SET_ID
+ * in the next generation.
+ *
+ * Returns 0 on success or a negative errno from nft_jhash_init() or from
+ * the set lookup.
+ */
+static int nft_jhash_map_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	int err;
+
+	err = nft_jhash_init(ctx, expr, tb);
+	if (err < 0)
+		return err;
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_symhash_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -141,6 +206,20 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise a symhash expression that maps its result through a set.
+ *
+ * The private area belongs to a symhash expression (nft_symhash_map_ops
+ * sizes it with sizeof(struct nft_symhash)), so it must be accessed as
+ * struct nft_symhash; going through struct nft_jhash would store ->map at
+ * that structure's offset instead.
+ *
+ * The common attributes are parsed by nft_symhash_init(); its error is
+ * propagated so that an expression with rejected attributes is never set
+ * up.  The map is then resolved from NFTA_HASH_SET_NAME / NFTA_HASH_SET_ID
+ * in the next generation.
+ *
+ * Returns 0 on success or a negative errno from nft_symhash_init() or from
+ * the set lookup.
+ */
+static int nft_symhash_map_init(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				const struct nlattr * const tb[])
+{
+	struct nft_symhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	int err;
+
+	err = nft_symhash_init(ctx, expr, tb);
+	if (err < 0)
+		return err;
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_jhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -168,6 +247,18 @@ nla_put_failure:
return -1;
}
+/*
+ * Dump the plain jhash attributes followed by the name of the bound map.
+ * Returns 0 on success, -1 if either step fails.
+ */
+static int nft_jhash_map_dump(struct sk_buff *skb,
+			      const struct nft_expr *expr)
+{
+	const struct nft_jhash *priv = nft_expr_priv(expr);
+
+	if (nft_jhash_dump(skb, expr))
+		return -1;
+	if (nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
static int nft_symhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -188,6 +279,18 @@ nla_put_failure:
return -1;
}
+/*
+ * Dump the plain symhash attributes followed by the name of the bound map.
+ * Returns 0 on success, -1 if either step fails.
+ */
+static int nft_symhash_map_dump(struct sk_buff *skb,
+				const struct nft_expr *expr)
+{
+	const struct nft_symhash *priv = nft_expr_priv(expr);
+
+	if (nft_symhash_dump(skb, expr))
+		return -1;
+	if (nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
static struct nft_expr_type nft_hash_type;
static const struct nft_expr_ops nft_jhash_ops = {
.type = &nft_hash_type,
@@ -197,6 +300,14 @@ static const struct nft_expr_ops nft_jhash_ops = {
.dump = nft_jhash_dump,
};
+/* Ops used for NFT_HASH_JENKINS when NFTA_HASH_SET_NAME is supplied. */
+static const struct nft_expr_ops nft_jhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
+	.init		= nft_jhash_map_init,
+	.eval		= nft_jhash_map_eval,
+	.dump		= nft_jhash_map_dump,
+};
+
static const struct nft_expr_ops nft_symhash_ops = {
.type = &nft_hash_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -205,6 +316,14 @@ static const struct nft_expr_ops nft_symhash_ops = {
.dump = nft_symhash_dump,
};
+/* Ops used for NFT_HASH_SYM when NFTA_HASH_SET_NAME is supplied. */
+static const struct nft_expr_ops nft_symhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
+	.init		= nft_symhash_map_init,
+	.eval		= nft_symhash_map_eval,
+	.dump		= nft_symhash_map_dump,
+};
+
static const struct nft_expr_ops *
nft_hash_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -217,8 +336,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
switch (type) {
case NFT_HASH_SYM:
+ if (tb[NFTA_HASH_SET_NAME])
+ return &nft_symhash_map_ops;
return &nft_symhash_ops;
case NFT_HASH_JENKINS:
+ if (tb[NFTA_HASH_SET_NAME])
+ return &nft_jhash_map_ops;
return &nft_jhash_ops;
default:
break;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index aa87ff8beae8..15adf8ca82c3 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -17,12 +17,6 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
-struct nft_immediate_expr {
- struct nft_data data;
- enum nft_registers dreg:8;
- u8 dlen;
-};
-
static void nft_immediate_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ nla_put_failure:
static int nft_immediate_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_data **d)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data;
+ int err;
- if (priv->dreg == NFT_REG_VERDICT)
- *data = &priv->data;
+ if (priv->dreg != NFT_REG_VERDICT)
+ return 0;
+
+ data = &priv->data;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ break;
+ default:
+ break;
+ }
return 0;
}
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
+#include <linux/audit.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
#include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
char *prefix;
};
+/*
+ * Append source/destination address and protocol of the IPv4 header at the
+ * skb's network offset to @ab.  Returns false, logging nothing, when the
+ * header cannot be read.
+ */
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct iphdr hdr_buf;
+
+	iph = skb_header_pointer(skb, skb_network_offset(skb),
+				 sizeof(hdr_buf), &hdr_buf);
+	if (iph == NULL)
+		return false;
+
+	audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+			 &iph->saddr, &iph->daddr, iph->protocol);
+	return true;
+}
+
+/*
+ * Append source/destination address and next-header value of the IPv6
+ * header at the skb's network offset to @ab; nexthdr is first run through
+ * ipv6_skip_exthdr().  Returns false, logging nothing, when the header
+ * cannot be read.
+ */
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	struct ipv6hdr hdr_buf;
+	__be16 frag_off;
+	u8 proto;
+
+	ip6h = skb_header_pointer(skb, skb_network_offset(skb),
+				  sizeof(hdr_buf), &hdr_buf);
+	if (ip6h == NULL)
+		return false;
+
+	proto = ip6h->nexthdr;
+	ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(hdr_buf),
+			 &proto, &frag_off);
+
+	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+			 &ip6h->saddr, &ip6h->daddr, proto);
+	return true;
+}
+
+/*
+ * Emit one AUDIT_NETFILTER_PKT record for the packet: the skb mark followed
+ * by the addresses and protocol.  For the bridge family the header type is
+ * picked from the Ethernet protocol field.  When no IP header was logged a
+ * placeholder with unknown addresses is written instead.
+ * Does nothing when auditing is disabled or no audit buffer is available.
+ */
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+	struct sk_buff *skb = pkt->skb;
+	struct audit_buffer *ab;
+	bool logged = false;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+	if (ab == NULL)
+		return;
+
+	audit_log_format(ab, "mark=%#x", skb->mark);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_BRIDGE: {
+		__be16 ethproto = eth_hdr(skb)->h_proto;
+
+		if (ethproto == htons(ETH_P_IP))
+			logged = audit_ip4(ab, skb);
+		else if (ethproto == htons(ETH_P_IPV6))
+			logged = audit_ip6(ab, skb);
+		break;
+	}
+	case NFPROTO_IPV4:
+		logged = audit_ip4(ab, skb);
+		break;
+	case NFPROTO_IPV6:
+		logged = audit_ip6(ab, skb);
+		break;
+	}
+
+	if (!logged)
+		audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+	audit_log_end(ab);
+}
+
static void nft_log_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_log *priv = nft_expr_priv(expr);
+ if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+ priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+ nft_log_eval_audit(pkt);
+ return;
+ }
+
nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
} else {
li->u.log.level = LOGLEVEL_WARNING;
}
- if (li->u.log.level > LOGLEVEL_DEBUG) {
+ if (li->u.log.level > LOGLEVEL_AUDIT) {
err = -EINVAL;
goto err1;
}
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
break;
}
+ if (li->u.log.level == LOGLEVEL_AUDIT)
+ return 0;
+
err = nf_logger_find_get(ctx->family, li->type);
if (err < 0)
goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
if (priv->prefix != nft_log_null_prefix)
kfree(priv->prefix);
+ if (li->u.log.level == LOGLEVEL_AUDIT)
+ return;
+
nf_logger_put(ctx->family, li->type);
}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f52da5e2199f..42e6fadf1417 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -149,6 +149,52 @@ nla_put_failure:
return -1;
}
+/*
+ * Set walk callback: for an element whose verdict is a jump or goto,
+ * validate the target chain.  Interval-end elements and all other verdict
+ * codes are accepted as they are (return 0).
+ */
+static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
+				       struct nft_set *set,
+				       const struct nft_set_iter *iter,
+				       struct nft_set_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_data *data;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    (*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END))
+		return 0;
+
+	data = nft_set_ext_data(ext);
+	if (data->verdict.code == NFT_JUMP ||
+	    data->verdict.code == NFT_GOTO)
+		return nft_chain_validate(ctx, data->verdict.chain);
+
+	return 0;
+}
+
+/*
+ * Expression ->validate hook: for verdict maps, walk every element of the
+ * next generation and validate the chains they jump to.  Sets that are not
+ * verdict maps need no validation.  Returns the first negative error
+ * recorded by the walk, otherwise 0.
+ */
+static int nft_lookup_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **d)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	struct nft_set_iter iter;
+
+	if (!(priv->set->flags & NFT_SET_MAP))
+		return 0;
+	if (priv->set->dtype != NFT_DATA_VERDICT)
+		return 0;
+
+	iter.fn = nft_lookup_validate_setelem;
+	iter.genmask = nft_genmask_next(ctx->net);
+	iter.skip = 0;
+	iter.count = 0;
+	iter.err = 0;
+
+	priv->set->ops->walk(ctx, priv->set, &iter);
+
+	return iter.err < 0 ? iter.err : 0;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.init = nft_lookup_init,
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
+ .validate = nft_lookup_validate,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 204af9899482..1105a23bda5e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,5 +1,7 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2014 Intel Corporation
+ * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -9,8 +11,6 @@
*/
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -24,21 +24,35 @@
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-#include <net/netfilter/nft_meta.h>
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+/* Private data of the meta expression. */
+struct nft_meta {
+	enum nft_meta_keys	key:8;		/* which meta key is handled */
+	union {
+		/* the two registers share storage */
+		enum nft_registers	dreg:8;	/* destination register */
+		enum nft_registers	sreg:8;	/* source register */
+	};
+};
+
static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-void nft_meta_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+#ifdef CONFIG_NF_TABLES_BRIDGE
+#include "../bridge/br_private.h"
+#endif
+
+static void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ const struct net_bridge_port *p;
+#endif
switch (priv->key) {
case NFT_META_LEN:
@@ -215,6 +229,18 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, !!skb->sp);
break;
#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ case NFT_META_BRI_IIFNAME:
+ if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
+ goto err;
+ strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+ return;
+ case NFT_META_BRI_OIFNAME:
+ if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+ goto err;
+ strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+ return;
+#endif
default:
WARN_ON(1);
goto err;
@@ -224,11 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr,
err:
regs->verdict.code = NFT_BREAK;
}
-EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-void nft_meta_set_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *meta = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
@@ -260,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr,
WARN_ON(1);
}
}
-EXPORT_SYMBOL_GPL(nft_meta_set_eval);
-const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
[NFTA_META_KEY] = { .type = NLA_U32 },
[NFTA_META_SREG] = { .type = NLA_U32 },
};
-EXPORT_SYMBOL_GPL(nft_meta_policy);
-int nft_meta_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -320,6 +343,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ if (ctx->family != NFPROTO_BRIDGE)
+ return -EOPNOTSUPP;
+ len = IFNAMSIZ;
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -328,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
return nft_validate_register_store(ctx, priv->dreg, NULL,
NFT_DATA_VALUE, len);
}
-EXPORT_SYMBOL_GPL(nft_meta_get_init);
static int nft_meta_get_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
@@ -362,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
#endif
}
-int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -390,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain, hooks);
}
-EXPORT_SYMBOL_GPL(nft_meta_set_validate);
-int nft_meta_set_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -426,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return 0;
}
-EXPORT_SYMBOL_GPL(nft_meta_set_init);
-int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -442,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-int nft_meta_set_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -459,19 +485,16 @@ int nft_meta_set_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_meta_set_dump);
-void nft_meta_set_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_meta_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
if (priv->key == NFT_META_NFTRACE)
static_branch_dec(&nft_trace_enabled);
}
-EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
-static struct nft_expr_type nft_meta_type;
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -510,27 +533,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
-static struct nft_expr_type nft_meta_type __read_mostly = {
+struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_meta_module_init(void)
-{
- return nft_register_expr(&nft_meta_type);
-}
-
-static void __exit nft_meta_module_exit(void)
-{
- nft_unregister_expr(&nft_meta_type);
-}
-
-module_init(nft_meta_module_init);
-module_exit(nft_meta_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 1f36954c2ba9..c15807d10b91 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr,
const struct nft_nat *priv = nft_expr_priv(expr);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
if (priv->sreg_addr_min) {
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 5a3a52c71545..1f4d0854cf70 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,13 +24,11 @@ struct nft_ng_inc {
u32 modulus;
atomic_t counter;
u32 offset;
+ struct nft_set *map;
};
-static void nft_ng_inc_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
{
- struct nft_ng_inc *priv = nft_expr_priv(expr);
u32 nval, oval;
do {
@@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
- regs->data[priv->dreg] = nval + priv->offset;
+ return nval + priv->offset;
+}
+
+/*
+ * Plain incremental generator: advance the counter and write the resulting
+ * number straight into the destination register.
+ */
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *ng = nft_expr_priv(expr);
+	u32 *dest = &regs->data[ng->dreg];
+
+	*dest = nft_ng_inc_gen(ng);
+}
+
+/*
+ * Incremental generator combined with a map: the generated number is used as
+ * the lookup key in the attached set, and the data of the matching element is
+ * copied into the destination register. A failed lookup leaves the register
+ * untouched.
+ */
+static void nft_ng_inc_map_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *ng = nft_expr_priv(expr);
+	const struct nft_set *set = ng->map;
+	const struct nft_set_ext *ext;
+	u32 key = nft_ng_inc_gen(ng);
+
+	if (set->ops->lookup(nft_net(pkt), set, &key, &ext))
+		nft_data_copy(&regs->data[ng->dreg],
+			      nft_set_ext_data(ext), set->dlen);
}
static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
@@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
[NFTA_NG_MODULUS] = { .type = NLA_U32 },
[NFTA_NG_TYPE] = { .type = NLA_U32 },
[NFTA_NG_OFFSET] = { .type = NLA_U32 },
+ [NFTA_NG_SET_NAME] = { .type = NLA_STRING,
+ .len = NFT_SET_MAXNAMELEN - 1 },
+ [NFTA_NG_SET_ID] = { .type = NLA_U32 },
};
static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -71,6 +101,22 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise an incremental generator that resolves its result through a map.
+ *
+ * Runs the common incremental initialisation first and then binds the set
+ * named by NFTA_NG_SET_NAME / NFTA_NG_SET_ID.
+ *
+ * Returns 0 on success or a negative errno, either from the common
+ * initialisation or from the set lookup.
+ */
+static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	int err;
+
+	/*
+	 * The base init can fail (it ends in a register-store validation);
+	 * do not go on to bind a set to a half-initialised expression.
+	 */
+	err = nft_ng_inc_init(ctx, expr, tb);
+	if (err < 0)
+		return err;
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_NG_SET_NAME],
+					  tb[NFTA_NG_SET_ID], genmask);
+
+	return PTR_ERR_OR_ZERO(priv->map);
+}
+
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
u32 modulus, enum nft_ng_types type, u32 offset)
{
@@ -97,22 +143,63 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+/*
+ * Dump an incremental map generator: the common numgen attributes followed by
+ * the name of the bound set. Returns 0 on success, -1 if any attribute could
+ * not be added.
+ */
+static int nft_ng_inc_map_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_ng_inc *ng = nft_expr_priv(expr);
+
+	if (nft_ng_dump(skb, ng->dreg, ng->modulus,
+			NFT_NG_INCREMENTAL, ng->offset))
+		return -1;
+
+	if (nla_put_string(skb, NFTA_NG_SET_NAME, ng->map->name))
+		return -1;
+
+	return 0;
+}
+
struct nft_ng_random {
enum nft_registers dreg:8;
u32 modulus;
u32 offset;
+ struct nft_set *map;
};
+/*
+ * Draw one pseudo-random number from this CPU's generator state, scale it
+ * with reciprocal_scale() by the configured modulus and add the offset.
+ */
+static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+{
+	struct rnd_state *rnd = this_cpu_ptr(&nft_numgen_prandom_state);
+	u32 scaled;
+
+	scaled = reciprocal_scale(prandom_u32_state(rnd), priv->modulus);
+
+	return scaled + priv->offset;
+}
+
static void nft_ng_random_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
struct nft_ng_random *priv = nft_expr_priv(expr);
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
- u32 val;
- val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
- regs->data[priv->dreg] = val + priv->offset;
+ regs->data[priv->dreg] = nft_ng_random_gen(priv);
+}
+
+/*
+ * Random generator combined with a map: the generated number is the lookup
+ * key in the attached set, and the data of the matching element is copied
+ * into the destination register. A failed lookup leaves the register
+ * untouched.
+ */
+static void nft_ng_random_map_eval(const struct nft_expr *expr,
+				   struct nft_regs *regs,
+				   const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_random *ng = nft_expr_priv(expr);
+	const struct nft_set *set = ng->map;
+	const struct nft_set_ext *ext;
+	u32 key = nft_ng_random_gen(ng);
+
+	if (set->ops->lookup(nft_net(pkt), set, &key, &ext))
+		nft_data_copy(&regs->data[ng->dreg],
+			      nft_set_ext_data(ext), set->dlen);
}
static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -139,6 +226,23 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
NFT_DATA_VALUE, sizeof(u32));
}
+/*
+ * Initialise a random generator that resolves its result through a map.
+ *
+ * Runs the common random initialisation first and then binds the set named
+ * by NFTA_NG_SET_NAME / NFTA_NG_SET_ID.
+ *
+ * Returns 0 on success or a negative errno, either from the common
+ * initialisation or from the set lookup.
+ */
+static int nft_ng_random_map_init(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr,
+				  const struct nlattr * const tb[])
+{
+	struct nft_ng_random *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	int err;
+
+	/*
+	 * The base init can fail (it ends in a register-store validation);
+	 * do not go on to bind a set to a half-initialised expression.
+	 */
+	err = nft_ng_random_init(ctx, expr, tb);
+	if (err < 0)
+		return err;
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_NG_SET_NAME],
+					  tb[NFTA_NG_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -147,6 +251,22 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+/*
+ * Dump a random map generator: the common numgen attributes followed by the
+ * name of the bound set. Returns 0 on success, -1 if any attribute could not
+ * be added.
+ */
+static int nft_ng_random_map_dump(struct sk_buff *skb,
+				  const struct nft_expr *expr)
+{
+	const struct nft_ng_random *ng = nft_expr_priv(expr);
+
+	if (nft_ng_dump(skb, ng->dreg, ng->modulus,
+			NFT_NG_RANDOM, ng->offset))
+		return -1;
+
+	if (nla_put_string(skb, NFTA_NG_SET_NAME, ng->map->name))
+		return -1;
+
+	return 0;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
@@ -156,6 +276,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
.dump = nft_ng_inc_dump,
};
+static const struct nft_expr_ops nft_ng_inc_map_ops = { /* incremental generator whose result is looked up in a map */
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), /* same private struct as the plain incremental ops */
+ .eval = nft_ng_inc_map_eval,
+ .init = nft_ng_inc_map_init,
+ .dump = nft_ng_inc_map_dump,
+};
+
static const struct nft_expr_ops nft_ng_random_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -164,6 +292,14 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.dump = nft_ng_random_dump,
};
+static const struct nft_expr_ops nft_ng_random_map_ops = { /* random generator whose result is looked up in a map */
+ .type = &nft_ng_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), /* same private struct as the plain random ops */
+ .eval = nft_ng_random_map_eval,
+ .init = nft_ng_random_map_init,
+ .dump = nft_ng_random_map_dump,
+};
+
static const struct nft_expr_ops *
nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
@@ -178,8 +314,12 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
switch (type) {
case NFT_NG_INCREMENTAL:
+ if (tb[NFTA_NG_SET_NAME])
+ return &nft_ng_inc_map_ops;
return &nft_ng_inc_ops;
case NFT_NG_RANDOM:
+ if (tb[NFTA_NG_SET_NAME])
+ return &nft_ng_random_map_ops;
return &nft_ng_random_ops;
}
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 0b02407773ad..cdf348f751ec 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
return -EINVAL;
objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
- obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
- genmask);
+ obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ genmask);
if (IS_ERR(obj))
return -ENOENT;
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 11a2071b6dd4..76dba9f6b6f6 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -7,8 +7,6 @@
*/
#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
return nft_chain_validate_hooks(ctx->chain, hooks);
}
-static struct nft_expr_type nft_rt_type;
static const struct nft_expr_ops nft_rt_get_ops = {
.type = &nft_rt_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_rt)),
@@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.validate = nft_rt_validate,
};
-static struct nft_expr_type nft_rt_type __read_mostly = {
+struct nft_expr_type nft_rt_type __read_mostly = {
.name = "rt",
.ops = &nft_rt_get_ops,
.policy = nft_rt_policy,
.maxattr = NFTA_RT_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_rt_module_init(void)
-{
- return nft_register_expr(&nft_rt_type);
-}
-
-static void __exit nft_rt_module_exit(void)
-{
- nft_unregister_expr(&nft_rt_type);
-}
-
-module_init(nft_rt_module_init);
-module_exit(nft_rt_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
-MODULE_ALIAS_NFT_EXPR("rt");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 45fb2752fb63..d6626e01c7ee 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_bitmap_type;
-static struct nft_set_ops nft_bitmap_ops __read_mostly = {
- .type = &nft_bitmap_type,
- .privsize = nft_bitmap_privsize,
- .elemsize = offsetof(struct nft_bitmap_elem, ext),
- .estimate = nft_bitmap_estimate,
- .init = nft_bitmap_init,
- .destroy = nft_bitmap_destroy,
- .insert = nft_bitmap_insert,
- .remove = nft_bitmap_remove,
- .deactivate = nft_bitmap_deactivate,
- .flush = nft_bitmap_flush,
- .activate = nft_bitmap_activate,
- .lookup = nft_bitmap_lookup,
- .walk = nft_bitmap_walk,
- .get = nft_bitmap_get,
-};
-
static struct nft_set_type nft_bitmap_type __read_mostly = {
- .ops = &nft_bitmap_ops,
.owner = THIS_MODULE,
+ .ops = {
+ .privsize = nft_bitmap_privsize,
+ .elemsize = offsetof(struct nft_bitmap_elem, ext),
+ .estimate = nft_bitmap_estimate,
+ .init = nft_bitmap_init,
+ .destroy = nft_bitmap_destroy,
+ .insert = nft_bitmap_insert,
+ .remove = nft_bitmap_remove,
+ .deactivate = nft_bitmap_deactivate,
+ .flush = nft_bitmap_flush,
+ .activate = nft_bitmap_activate,
+ .lookup = nft_bitmap_lookup,
+ .walk = nft_bitmap_walk,
+ .get = nft_bitmap_get,
+ },
};
static int __init nft_bitmap_module_init(void)
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index fc9c6d5d64cd..6f9a1365a09f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -311,8 +311,16 @@ static void nft_rhash_gc(struct work_struct *work)
continue;
}
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
+ struct nft_expr *expr = nft_set_ext_expr(&he->ext);
+
+ if (expr->ops->gc &&
+ expr->ops->gc(read_pnet(&set->net), expr))
+ goto gc;
+ }
if (!nft_set_elem_expired(&he->ext))
continue;
+gc:
if (nft_set_elem_mark_busy(&he->ext))
continue;
@@ -339,6 +347,14 @@ static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
return sizeof(struct nft_rhash);
}
+/*
+ * Schedule the set's delayed garbage-collection work on the power-efficient
+ * workqueue, using the set's GC interval as the delay.
+ */
+static void nft_rhash_gc_init(const struct nft_set *set)
+{
+	struct nft_rhash *rhash = nft_set_priv(set);
+	struct delayed_work *gc = &rhash->gc_work;
+
+	queue_delayed_work(system_power_efficient_wq, gc,
+			   nft_set_gc_interval(set));
+}
+
static int nft_rhash_init(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const tb[])
@@ -356,8 +372,8 @@ static int nft_rhash_init(const struct nft_set *set,
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+ nft_rhash_gc_init(set);
+
return 0;
}
@@ -605,6 +621,12 @@ static void nft_hash_destroy(const struct nft_set *set)
static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
+ if (!desc->size)
+ return false;
+
+ if (desc->klen == 4)
+ return false;
+
est->size = sizeof(struct nft_hash) +
nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
desc->size * sizeof(struct nft_hash_elem);
@@ -614,91 +636,101 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_hash_type;
-static struct nft_set_ops nft_rhash_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_rhash_privsize,
- .elemsize = offsetof(struct nft_rhash_elem, ext),
- .estimate = nft_rhash_estimate,
- .init = nft_rhash_init,
- .destroy = nft_rhash_destroy,
- .insert = nft_rhash_insert,
- .activate = nft_rhash_activate,
- .deactivate = nft_rhash_deactivate,
- .flush = nft_rhash_flush,
- .remove = nft_rhash_remove,
- .lookup = nft_rhash_lookup,
- .update = nft_rhash_update,
- .walk = nft_rhash_walk,
- .get = nft_rhash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
-};
-
-static struct nft_set_ops nft_hash_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_hash_privsize,
- .elemsize = offsetof(struct nft_hash_elem, ext),
- .estimate = nft_hash_estimate,
- .init = nft_hash_init,
- .destroy = nft_hash_destroy,
- .insert = nft_hash_insert,
- .activate = nft_hash_activate,
- .deactivate = nft_hash_deactivate,
- .flush = nft_hash_flush,
- .remove = nft_hash_remove,
- .lookup = nft_hash_lookup,
- .walk = nft_hash_walk,
- .get = nft_hash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT,
-};
+static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!desc->size)
+ return false;
-static struct nft_set_ops nft_hash_fast_ops __read_mostly = {
- .type = &nft_hash_type,
- .privsize = nft_hash_privsize,
- .elemsize = offsetof(struct nft_hash_elem, ext),
- .estimate = nft_hash_estimate,
- .init = nft_hash_init,
- .destroy = nft_hash_destroy,
- .insert = nft_hash_insert,
- .activate = nft_hash_activate,
- .deactivate = nft_hash_deactivate,
- .flush = nft_hash_flush,
- .remove = nft_hash_remove,
- .lookup = nft_hash_lookup_fast,
- .walk = nft_hash_walk,
- .get = nft_hash_get,
- .features = NFT_SET_MAP | NFT_SET_OBJECT,
-};
+ if (desc->klen != 4)
+ return false;
-static const struct nft_set_ops *
-nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc,
- u32 flags)
-{
- if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) {
- switch (desc->klen) {
- case 4:
- return &nft_hash_fast_ops;
- default:
- return &nft_hash_ops;
- }
- }
+ est->size = sizeof(struct nft_hash) +
+ nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ desc->size * sizeof(struct nft_hash_elem);
+ est->lookup = NFT_SET_CLASS_O_1;
+ est->space = NFT_SET_CLASS_O_N;
- return &nft_rhash_ops;
+ return true;
}
+static struct nft_set_type nft_rhash_type __read_mostly = { /* set type built from the nft_rhash_* callbacks */
+ .owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT | /* set flags advertised by this type */
+ NFT_SET_TIMEOUT | NFT_SET_EVAL,
+ .ops = { /* callbacks are embedded in the type itself */
+ .privsize = nft_rhash_privsize,
+ .elemsize = offsetof(struct nft_rhash_elem, ext), /* element size up to the start of the extension area */
+ .estimate = nft_rhash_estimate,
+ .init = nft_rhash_init,
+ .gc_init = nft_rhash_gc_init, /* queues the delayed GC work for the set */
+ .destroy = nft_rhash_destroy,
+ .insert = nft_rhash_insert,
+ .activate = nft_rhash_activate,
+ .deactivate = nft_rhash_deactivate,
+ .flush = nft_rhash_flush,
+ .remove = nft_rhash_remove,
+ .lookup = nft_rhash_lookup,
+ .update = nft_rhash_update, /* not set by the two nft_hash types in this file */
+ .walk = nft_rhash_walk,
+ .get = nft_rhash_get,
+ },
+};
+
static struct nft_set_type nft_hash_type __read_mostly = {
- .select_ops = nft_hash_select_ops,
.owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT,
+ .ops = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .flush = nft_hash_flush,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup,
+ .walk = nft_hash_walk,
+ .get = nft_hash_get,
+ },
+};
+
+static struct nft_set_type nft_hash_fast_type __read_mostly = { /* nft_hash variant with its own estimate and lookup callbacks */
+ .owner = THIS_MODULE,
+ .features = NFT_SET_MAP | NFT_SET_OBJECT, /* no NFT_SET_TIMEOUT / NFT_SET_EVAL, unlike nft_rhash_type */
+ .ops = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_fast_estimate, /* returns false unless desc->size is non-zero and desc->klen == 4 */
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .flush = nft_hash_flush,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup_fast, /* nft_hash_type uses nft_hash_lookup here */
+ .walk = nft_hash_walk,
+ .get = nft_hash_get,
+ },
};
static int __init nft_hash_module_init(void)
{
- return nft_register_set(&nft_hash_type);
+ if (nft_register_set(&nft_hash_fast_type) ||
+ nft_register_set(&nft_hash_type) ||
+ nft_register_set(&nft_rhash_type))
+ return 1;
+ return 0;
}
static void __exit nft_hash_module_exit(void)
{
+ nft_unregister_set(&nft_rhash_type);
nft_unregister_set(&nft_hash_type);
+ nft_unregister_set(&nft_hash_fast_type);
}
module_init(nft_hash_module_init);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e6f08bc5f359..d260ce2d6671 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -22,6 +22,7 @@ struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
seqcount_t count;
+ struct delayed_work gc_work;
};
struct nft_rbtree_elem {
@@ -265,6 +266,7 @@ static void nft_rbtree_activate(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext);
+ nft_set_elem_clear_busy(&rbe->ext);
}
static bool nft_rbtree_flush(const struct net *net,
@@ -272,8 +274,12 @@ static bool nft_rbtree_flush(const struct net *net,
{
struct nft_rbtree_elem *rbe = priv;
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
+ if (!nft_set_elem_mark_busy(&rbe->ext) ||
+ !nft_is_active(net, &rbe->ext)) {
+ nft_set_elem_change_active(net, set, &rbe->ext);
+ return true;
+ }
+ return false;
}
static void *nft_rbtree_deactivate(const struct net *net,
@@ -347,6 +353,62 @@ cont:
read_unlock_bh(&priv->lock);
}
+/*
+ * Delayed-work handler that collects expired elements from an rbtree set.
+ *
+ * Under the write lock and write seqcount the tree is walked in order.
+ * Interval-end elements are only remembered in @prev; an expired element
+ * that can be marked busy is added to a GC batch together with the
+ * remembered interval-end element, if any. All batched elements are erased
+ * from the tree before the lock is dropped, the batch is completed outside
+ * the lock, and the work re-queues itself with the set's GC interval.
+ */
+static void nft_rbtree_gc(struct work_struct *work)
+{
+	struct nft_set_gc_batch *gcb = NULL;
+	struct rb_node *node, *prev = NULL;
+	struct nft_rbtree_elem *rbe;
+	struct nft_rbtree *priv;
+	struct nft_set *set;
+	int i;
+
+	priv = container_of(work, struct nft_rbtree, gc_work.work);
+	set = nft_set_container_of(priv);
+
+	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
+	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+		if (nft_rbtree_interval_end(rbe)) {
+			prev = node;
+			continue;
+		}
+		if (!nft_set_elem_expired(&rbe->ext))
+			continue;
+		if (nft_set_elem_mark_busy(&rbe->ext))
+			continue;
+
+		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+		if (!gcb)
+			goto out;
+
+		atomic_dec(&set->nelems);
+		nft_set_gc_batch_add(gcb, rbe);
+
+		if (prev) {
+			rbe = rb_entry(prev, struct nft_rbtree_elem, node);
+			atomic_dec(&set->nelems);
+			nft_set_gc_batch_add(gcb, rbe);
+			/*
+			 * Forget the end element once it is batched; otherwise
+			 * the next expired element would batch (and later
+			 * erase) the very same node again.
+			 */
+			prev = NULL;
+		}
+		node = rb_next(node);
+		/*
+		 * The extra step above may run off the end of the tree; the
+		 * loop increment must not call rb_next() on a NULL node.
+		 */
+		if (!node)
+			break;
+	}
+out:
+	if (gcb) {
+		for (i = 0; i < gcb->head.cnt; i++) {
+			rbe = gcb->elems[i];
+			rb_erase(&rbe->node, &priv->root);
+		}
+	}
+	write_seqcount_end(&priv->count);
+	write_unlock_bh(&priv->lock);
+
+	nft_set_gc_batch_complete(gcb);
+
+	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+			   nft_set_gc_interval(set));
+}
+
static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
@@ -362,6 +424,12 @@ static int nft_rbtree_init(const struct nft_set *set,
rwlock_init(&priv->lock);
seqcount_init(&priv->count);
priv->root = RB_ROOT;
+
+ INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
+ if (set->flags & NFT_SET_TIMEOUT)
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+
return 0;
}
@@ -371,6 +439,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
struct nft_rbtree_elem *rbe;
struct rb_node *node;
+ cancel_delayed_work_sync(&priv->gc_work);
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -393,28 +462,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_rbtree_type;
-static struct nft_set_ops nft_rbtree_ops __read_mostly = {
- .type = &nft_rbtree_type,
- .privsize = nft_rbtree_privsize,
- .elemsize = offsetof(struct nft_rbtree_elem, ext),
- .estimate = nft_rbtree_estimate,
- .init = nft_rbtree_init,
- .destroy = nft_rbtree_destroy,
- .insert = nft_rbtree_insert,
- .remove = nft_rbtree_remove,
- .deactivate = nft_rbtree_deactivate,
- .flush = nft_rbtree_flush,
- .activate = nft_rbtree_activate,
- .lookup = nft_rbtree_lookup,
- .walk = nft_rbtree_walk,
- .get = nft_rbtree_get,
- .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
-};
-
static struct nft_set_type nft_rbtree_type __read_mostly = {
- .ops = &nft_rbtree_ops,
.owner = THIS_MODULE,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
+ .ops = {
+ .privsize = nft_rbtree_privsize,
+ .elemsize = offsetof(struct nft_rbtree_elem, ext),
+ .estimate = nft_rbtree_estimate,
+ .init = nft_rbtree_init,
+ .destroy = nft_rbtree_destroy,
+ .insert = nft_rbtree_insert,
+ .remove = nft_rbtree_remove,
+ .deactivate = nft_rbtree_deactivate,
+ .flush = nft_rbtree_flush,
+ .activate = nft_rbtree_activate,
+ .lookup = nft_rbtree_lookup,
+ .walk = nft_rbtree_walk,
+ .get = nft_rbtree_get,
+ },
};
static int __init nft_rbtree_module_init(void)
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644
index 000000000000..f28a0b944087
--- /dev/null
+++ b/net/netfilter/nft_socket.c
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_socket.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+
+struct nft_socket { /* private data of one "socket" expression, obtained via nft_expr_priv() */
+ enum nft_socket_keys key:8; /* which socket attribute the expression reports */
+ union { /* currently holds a single member */
+ enum nft_registers dreg:8; /* destination register receiving the result */
+ };
+};
+
+/*
+ * Evaluate a "socket" expression for one packet.
+ *
+ * Uses the socket already attached to the skb when there is one, otherwise
+ * performs the slow-path lookup for the packet's protocol family. When no
+ * socket is found a zero byte is stored in the destination register. For an
+ * unsupported family or key the verdict is set to NFT_BREAK.
+ *
+ * The looked-up socket is deliberately not cached in skb->sk: attaching it
+ * without a matching destructor would leave the skb with a socket pointer
+ * nobody releases. The lookup helpers are expected to return a referenced
+ * socket (this is how xt_socket treats them), so that reference is dropped
+ * here once the attribute has been read.
+ */
+static void nft_socket_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	struct sock *sk = skb->sk;
+	u32 *dest = &regs->data[priv->dreg];
+
+	if (!sk) {
+		switch (nft_pf(pkt)) {
+		case NFPROTO_IPV4:
+			sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+		case NFPROTO_IPV6:
+			sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#endif
+		default:
+			WARN_ON_ONCE(1);
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+	}
+
+	if (!sk) {
+		nft_reg_store8(dest, 0);
+		return;
+	}
+
+	switch (priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		nft_reg_store8(dest, inet_sk_transparent(sk));
+		break;
+	default:
+		WARN_ON(1);
+		regs->verdict.code = NFT_BREAK;
+	}
+
+	/* Only a socket obtained from the lookup above carries our reference. */
+	if (sk != skb->sk)
+		sock_gen_put(sk);
+}
+
+static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { /* netlink attribute policy for the "socket" expression */
+ [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, /* parsed into nft_socket.key by nft_socket_init() */
+ [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, /* parsed into nft_socket.dreg by nft_socket_init() */
+};
+
+/*
+ * Parse and validate the netlink attributes of a "socket" expression.
+ *
+ * Both NFTA_SOCKET_DREG and NFTA_SOCKET_KEY are mandatory (-EINVAL
+ * otherwise). Only the IPv4, IPv6 (when CONFIG_NF_SOCKET_IPV6 is enabled)
+ * and inet families and the NFT_SOCKET_TRANSPARENT key are accepted;
+ * anything else yields -EOPNOTSUPP. On success the result of the
+ * destination-register validation is returned.
+ */
+static int nft_socket_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_socket *priv = nft_expr_priv(expr);
+	unsigned int len;
+
+	if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
+		return -EINVAL;
+
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+	case NFPROTO_IPV6:
+#endif
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* The attribute is carried in network byte order: read it as __be32. */
+	priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY]));
+	switch (priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		len = sizeof(u8);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+/*
+ * Dump a "socket" expression: the key in network byte order followed by the
+ * destination register. Returns 0 on success, -1 if an attribute could not
+ * be added to the message.
+ */
+static int nft_socket_dump(struct sk_buff *skb,
+			   const struct nft_expr *expr)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+
+	/* htonl() yields a __be32, so use the matching big-endian helper. */
+	if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_socket_type; /* forward declaration: the ops below point back at the type */
+static const struct nft_expr_ops nft_socket_ops = { /* the only ops variant of the "socket" expression */
+ .type = &nft_socket_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)), /* sized for the struct nft_socket private data */
+ .eval = nft_socket_eval,
+ .init = nft_socket_init,
+ .dump = nft_socket_dump,
+};
+
+static struct nft_expr_type nft_socket_type __read_mostly = { /* expression type registered by the module init below */
+ .name = "socket",
+ .ops = &nft_socket_ops, /* fixed ops; no select_ops callback is used */
+ .policy = nft_socket_policy,
+ .maxattr = NFTA_SOCKET_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_socket_module_init(void) /* module entry point: registers the "socket" expression type */
+{
+ return nft_register_expr(&nft_socket_type); /* registration result is the module's load status */
+}
+
+static void __exit nft_socket_module_exit(void) /* module exit point: undoes the registration done at init */
+{
+ nft_unregister_expr(&nft_socket_type);
+}
+
+module_init(nft_socket_module_init);
+module_exit(nft_socket_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables socket match module");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 58aa9dd3c5b7..1d437875e15a 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -21,8 +21,8 @@
static unsigned int
netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
- struct nf_nat_range newrange;
+ const struct nf_nat_range2 *range = par->targinfo;
+ struct nf_nat_range2 newrange;
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
union nf_inet_addr new_addr, netmask;
@@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
@@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
enum ip_conntrack_info ctinfo;
__be32 new_ip, netmask;
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range newrange;
+ struct nf_nat_range2 newrange;
WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING &&
xt_hooknum(par) != NF_INET_POST_ROUTING &&
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index c7f8958cea4a..1ed0cac585c4 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -13,7 +13,6 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_NFLOG.h>
#include <net/netfilter/nf_log.h>
-#include <net/netfilter/nfnetlink_log.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
@@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (info->flags & XT_NFLOG_F_COPY_LEN)
li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
- nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
- xt_in(par), xt_out(par), &li, info->prefix);
+ nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+ xt_out(par), &li, "%s", info->prefix);
+
return XT_CONTINUE;
}
@@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->prefix[sizeof(info->prefix) - 1] != '\0')
return -EINVAL;
- return 0;
+
+ return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+}
+
+static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
static struct xt_target nflog_tg_reg __read_mostly = {
@@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = {
.revision = 0,
.family = NFPROTO_UNSPEC,
.checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
.target = nflog_tg,
.targetsize = sizeof(struct xt_nflog_info),
.me = THIS_MODULE,
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 98a4c6d4f1cb..5ce9461e979c 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range2 *range = par->targinfo;
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 8c89323c06af..58fce4e749a9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,264 +33,9 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
+#include <net/netfilter/nf_tproxy.h>
#include <linux/netfilter/xt_TPROXY.h>
-enum nf_tproxy_lookup_t {
- NFT_LOOKUP_LISTENER,
- NFT_LOOKUP_ESTABLISHED,
-};
-
-static bool tproxy_sk_is_transparent(struct sock *sk)
-{
- switch (sk->sk_state) {
- case TCP_TIME_WAIT:
- if (inet_twsk(sk)->tw_transparent)
- return true;
- break;
- case TCP_NEW_SYN_RECV:
- if (inet_rsk(inet_reqsk(sk))->no_srccheck)
- return true;
- break;
- default:
- if (inet_sk(sk)->transparent)
- return true;
- }
-
- sock_gen_put(sk);
- return false;
-}
-
-static inline __be32
-tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
-{
- struct in_device *indev;
- __be32 laddr;
-
- if (user_laddr)
- return user_laddr;
-
- laddr = 0;
- indev = __in_dev_get_rcu(skb->dev);
- for_primary_ifa(indev) {
- laddr = ifa->ifa_local;
- break;
- } endfor_ifa(indev);
-
- return laddr ? laddr : daddr;
-}
-
-/*
- * This is used when the user wants to intercept a connection matching
- * an explicit iptables rule. In this case the sockets are assumed
- * matching in preference order:
- *
- * - match: if there's a fully established connection matching the
- * _packet_ tuple, it is returned, assuming the redirection
- * already took place and we process a packet belonging to an
- * established connection
- *
- * - match: if there's a listening socket matching the redirection
- * (e.g. on-port & on-ip of the connection), it is returned,
- * regardless if it was bound to 0.0.0.0 or an explicit
- * address. The reasoning is that if there's an explicit rule, it
- * does not really matter if the listener is bound to an interface
- * or to 0. The user already stated that he wants redirection
- * (since he added the rule).
- *
- * Please note that there's an overlap between what a TPROXY target
- * and a socket match will match. Normally if you have both rules the
- * "socket" match will be the first one, effectively all packets
- * belonging to established connections going through that one.
- */
-static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
- const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in,
- const enum nf_tproxy_lookup_t lookup_type)
-{
- struct sock *sk;
- struct tcphdr *tcph;
-
- switch (protocol) {
- case IPPROTO_TCP:
- switch (lookup_type) {
- case NFT_LOOKUP_LISTENER:
- tcph = hp;
- sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
- ip_hdrlen(skb) +
- __tcp_hdrlen(tcph),
- saddr, sport,
- daddr, dport,
- in->ifindex, 0);
-
- if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
- sk = NULL;
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- break;
- case NFT_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
- break;
- default:
- BUG();
- }
- break;
- case IPPROTO_UDP:
- sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- if (sk) {
- int connected = (sk->sk_state == TCP_ESTABLISHED);
- int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
-
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
- (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
- sock_put(sk);
- sk = NULL;
- }
- }
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
- protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
-
- return sk;
-}
-
-#ifdef XT_TPROXY_HAVE_IPV6
-static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
- const u8 protocol,
- const struct in6_addr *saddr, const struct in6_addr *daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in,
- const enum nf_tproxy_lookup_t lookup_type)
-{
- struct sock *sk;
- struct tcphdr *tcph;
-
- switch (protocol) {
- case IPPROTO_TCP:
- switch (lookup_type) {
- case NFT_LOOKUP_LISTENER:
- tcph = hp;
- sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
- thoff + __tcp_hdrlen(tcph),
- saddr, sport,
- daddr, ntohs(dport),
- in->ifindex, 0);
-
- if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
- sk = NULL;
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- break;
- case NFT_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, ntohs(dport),
- in->ifindex, 0);
- break;
- default:
- BUG();
- }
- break;
- case IPPROTO_UDP:
- sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- if (sk) {
- int connected = (sk->sk_state == TCP_ESTABLISHED);
- int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
-
- /* NOTE: we return listeners even if bound to
- * 0.0.0.0, those are filtered out in
- * xt_socket, since xt_TPROXY needs 0 bound
- * listeners too
- */
- if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
- (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
- sock_put(sk);
- sk = NULL;
- }
- }
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
- protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
-
- return sk;
-}
-#endif
-
-/**
- * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
- * @skb: The skb being processed.
- * @laddr: IPv4 address to redirect to or zero.
- * @lport: TCP port to redirect to or zero.
- * @sk: The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait4() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
- __be32 laddr, __be16 lport, struct sock *sk)
-{
- const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr _hdr, *hp;
-
- hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- inet_twsk_put(inet_twsk(sk));
- return NULL;
- }
-
- if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
- /* SYN to a TIME_WAIT socket, we'd rather redirect it
- * to a listener socket if there's one */
- struct sock *sk2;
-
- sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
- iph->saddr, laddr ? laddr : iph->daddr,
- hp->source, lport ? lport : hp->dest,
- skb->dev, NFT_LOOKUP_LISTENER);
- if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
- sk = sk2;
- }
- }
-
- return sk;
-}
-
/* assign a socket to the skb -- consumes sk */
static void
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
@@ -319,26 +64,26 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
- skb->dev, NFT_LOOKUP_ESTABLISHED);
+ skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
- laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
if (!lport)
lport = hp->dest;
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
+ sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
- skb->dev, NFT_LOOKUP_LISTENER);
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && tproxy_sk_is_transparent(sk)) {
+ if (sk && nf_tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -377,87 +122,6 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef XT_TPROXY_HAVE_IPV6
-static inline const struct in6_addr *
-tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
- const struct in6_addr *daddr)
-{
- struct inet6_dev *indev;
- struct inet6_ifaddr *ifa;
- struct in6_addr *laddr;
-
- if (!ipv6_addr_any(user_laddr))
- return user_laddr;
- laddr = NULL;
-
- indev = __in6_dev_get(skb->dev);
- if (indev) {
- read_lock_bh(&indev->lock);
- list_for_each_entry(ifa, &indev->addr_list, if_list) {
- if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
- continue;
-
- laddr = &ifa->addr;
- break;
- }
- read_unlock_bh(&indev->lock);
- }
-
- return laddr ? laddr : daddr;
-}
-
-/**
- * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb: The skb being processed.
- * @tproto: Transport protocol.
- * @thoff: Transport protocol header offset.
- * @par: Iptables target parameters.
- * @sk: The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
- const struct xt_action_param *par,
- struct sock *sk)
-{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct tcphdr _hdr, *hp;
- const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
- hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- inet_twsk_put(inet_twsk(sk));
- return NULL;
- }
-
- if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
- /* SYN to a TIME_WAIT socket, we'd rather redirect it
- * to a listener socket if there's one */
- struct sock *sk2;
-
- sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
- &iph->saddr,
- tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
- hp->source,
- tgi->lport ? tgi->lport : hp->dest,
- skb->dev, NFT_LOOKUP_LISTENER);
- if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
- sk = sk2;
- }
- }
-
- return sk;
-}
-
static unsigned int
tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -489,25 +153,31 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
- xt_in(par), NFT_LOOKUP_ESTABLISHED);
+ xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
- laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
lport = tgi->lport ? tgi->lport : hp->dest;
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
- if (sk && sk->sk_state == TCP_TIME_WAIT)
+ if (sk && sk->sk_state == TCP_TIME_WAIT) {
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
/* reopening a TIME_WAIT connection needs special handling */
- sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff,
+ xt_net(par),
+ &tgi->laddr.in6,
+ tgi->lport,
+ sk);
+ }
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
tproto, &iph->saddr, laddr,
hp->source, lport,
- xt_in(par), NFT_LOOKUP_LISTENER);
+ xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && tproxy_sk_is_transparent(sk)) {
+ if (sk && nf_tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bdb689cdc829..8af9707f8789 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static void xt_nat_convert_range(struct nf_nat_range *dst,
+static void xt_nat_convert_range(struct nf_nat_range2 *dst,
const struct nf_nat_ipv4_range *src)
{
memset(&dst->min_addr, 0, sizeof(dst->min_addr));
memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+ memset(&dst->base_proto, 0, sizeof(dst->base_proto));
dst->flags = src->flags;
dst->min_addr.ip = src->min_ip;
@@ -54,7 +55,7 @@ static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -71,7 +72,7 @@ static unsigned int
xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range range;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
static unsigned int
xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range *range_v1 = par->targinfo;
+ struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
(ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
- return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+ memcpy(&range, range_v1, sizeof(*range_v1));
+ memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
static unsigned int
xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct nf_nat_range *range = par->targinfo;
+ const struct nf_nat_range *range_v1 = par->targinfo;
+ struct nf_nat_range2 range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ WARN_ON(!(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
+
+ memcpy(&range, range_v1, sizeof(*range_v1));
+ memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ WARN_ON(!(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY)));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -163,6 +201,32 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT),
.me = THIS_MODULE,
},
+ {
+ .name = "SNAT",
+ .revision = 2,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
+ .target = xt_snat_target_v2,
+ .targetsize = sizeof(struct nf_nat_range2),
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNAT",
+ .revision = 2,
+ /* Same callbacks as SNAT rev 2; xt_nat_destroy() does the
+ * nf_ct_netns_put() for this rule. */
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
+ .target = xt_dnat_target_v2,
+ .targetsize = sizeof(struct nf_nat_range2),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
};
static int __init xt_nat_init(void)
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a34f314a8c23..9cfef73b4107 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,21 +37,6 @@
#include <net/netfilter/nf_log.h>
#include <linux/netfilter/xt_osf.h>
-struct xt_osf_finger {
- struct rcu_head rcu_head;
- struct list_head finger_entry;
- struct xt_osf_user_finger finger;
-};
-
-enum osf_fmatch_states {
- /* Packet does not match the fingerprint */
- FMATCH_WRONG = 0,
- /* Packet matches the fingerprint */
- FMATCH_OK,
- /* Options do not match the fingerprint, but header does */
- FMATCH_OPT_WRONG,
-};
-
/*
* Indexed by dont-fragment bit.
* It is the only constant value in the fingerprint.
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
.cb = xt_osf_nfnetlink_callbacks,
};
-static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
- unsigned char f_ttl)
-{
- const struct iphdr *ip = ip_hdr(skb);
-
- if (info->flags & XT_OSF_TTL) {
- if (info->ttl == XT_OSF_TTL_TRUE)
- return ip->ttl == f_ttl;
- if (info->ttl == XT_OSF_TTL_NOCHECK)
- return 1;
- else if (ip->ttl <= f_ttl)
- return 1;
- else {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
- int ret = 0;
-
- for_ifa(in_dev) {
- if (inet_ifa_match(ip->saddr, ifa)) {
- ret = (ip->ttl == f_ttl);
- break;
- }
- }
- endfor_ifa(in_dev);
-
- return ret;
- }
- }
-
- return ip->ttl == f_ttl;
-}
-
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
const struct xt_osf_info *info = p->matchinfo;
- const struct iphdr *ip = ip_hdr(skb);
- const struct tcphdr *tcp;
- struct tcphdr _tcph;
- int fmatch = FMATCH_WRONG, fcount = 0;
- unsigned int optsize = 0, check_WSS = 0;
- u16 window, totlen, mss = 0;
- bool df;
- const unsigned char *optp = NULL, *_optp = NULL;
- unsigned char opts[MAX_IPOPTLEN];
- const struct xt_osf_finger *kf;
- const struct xt_osf_user_finger *f;
struct net *net = xt_net(p);
if (!info)
return false;
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
- if (!tcp)
- return false;
-
- if (!tcp->syn)
- return false;
-
- totlen = ntohs(ip->tot_len);
- df = ntohs(ip->frag_off) & IP_DF;
- window = ntohs(tcp->window);
-
- if (tcp->doff * 4 > sizeof(struct tcphdr)) {
- optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
- _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
- sizeof(struct tcphdr), optsize, opts);
- }
-
- list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
- int foptsize, optnum;
-
- f = &kf->finger;
-
- if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
- continue;
-
- optp = _optp;
- fmatch = FMATCH_WRONG;
-
- if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
- continue;
-
- /*
- * Should not happen if userspace parser was written correctly.
- */
- if (f->wss.wc >= OSF_WSS_MAX)
- continue;
-
- /* Check options */
-
- foptsize = 0;
- for (optnum = 0; optnum < f->opt_num; ++optnum)
- foptsize += f->opt[optnum].length;
-
- if (foptsize > MAX_IPOPTLEN ||
- optsize > MAX_IPOPTLEN ||
- optsize != foptsize)
- continue;
-
- check_WSS = f->wss.wc;
-
- for (optnum = 0; optnum < f->opt_num; ++optnum) {
- if (f->opt[optnum].kind == (*optp)) {
- __u32 len = f->opt[optnum].length;
- const __u8 *optend = optp + len;
-
- fmatch = FMATCH_OK;
-
- switch (*optp) {
- case OSFOPT_MSS:
- mss = optp[3];
- mss <<= 8;
- mss |= optp[2];
-
- mss = ntohs((__force __be16)mss);
- break;
- case OSFOPT_TS:
- break;
- }
-
- optp = optend;
- } else
- fmatch = FMATCH_OPT_WRONG;
-
- if (fmatch != FMATCH_OK)
- break;
- }
-
- if (fmatch != FMATCH_OPT_WRONG) {
- fmatch = FMATCH_WRONG;
-
- switch (check_WSS) {
- case OSF_WSS_PLAIN:
- if (f->wss.val == 0 || window == f->wss.val)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MSS:
- /*
- * Some smart modems decrease mangle MSS to
- * SMART_MSS_2, so we check standard, decreased
- * and the one provided in the fingerprint MSS
- * values.
- */
-#define SMART_MSS_1 1460
-#define SMART_MSS_2 1448
- if (window == f->wss.val * mss ||
- window == f->wss.val * SMART_MSS_1 ||
- window == f->wss.val * SMART_MSS_2)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MTU:
- if (window == f->wss.val * (mss + 40) ||
- window == f->wss.val * (SMART_MSS_1 + 40) ||
- window == f->wss.val * (SMART_MSS_2 + 40))
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MODULO:
- if ((window % f->wss.val) == 0)
- fmatch = FMATCH_OK;
- break;
- }
- }
-
- if (fmatch != FMATCH_OK)
- continue;
-
- fcount++;
-
- if (info->flags & XT_OSF_LOG)
- nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
- xt_in(p), xt_out(p), NULL,
- "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
- f->genre, f->version, f->subtype,
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest),
- f->ttl - ip->ttl);
-
- if ((info->flags & XT_OSF_LOG) &&
- info->loglevel == XT_OSF_LOGLEVEL_FIRST)
- break;
- }
-
- if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
- xt_out(p), NULL,
- "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest));
-
- if (fcount)
- fmatch = FMATCH_OK;
-
- return fmatch == FMATCH_OK;
+ return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
+ xt_out(p), info, net, xt_osf_fingers);
}
static struct xt_match xt_osf_match = {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ac7f674d19b..5c0779c4fa3c 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -73,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = nf_sk_is_transparent(sk);
+ transparent = inet_sk_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
@@ -130,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
* if XT_SOCKET_TRANSPARENT is used
*/
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = nf_sk_is_transparent(sk);
+ transparent = inet_sk_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f018eafc2a0d..376181cc1def 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -206,7 +206,6 @@ int nfc_genl_targets_found(struct nfc_dev *dev)
return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -237,7 +236,6 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -269,7 +267,6 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -299,7 +296,6 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -340,7 +336,6 @@ int nfc_genl_device_added(struct nfc_dev *dev)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -370,7 +365,6 @@ int nfc_genl_device_removed(struct nfc_dev *dev)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -434,8 +428,6 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
nla_put_failure:
- genlmsg_cancel(msg, hdr);
-
free_msg:
nlmsg_free(msg);
@@ -470,7 +462,6 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -501,7 +492,6 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -546,7 +536,6 @@ int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
/* evt_transaction is no more used */
devm_kfree(&dev->dev, evt_transaction);
@@ -585,7 +574,6 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -703,7 +691,6 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -735,7 +722,6 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -1030,7 +1016,6 @@ static int nfc_genl_send_params(struct sk_buff *msg,
return 0;
nla_put_failure:
-
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
@@ -1290,7 +1275,6 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
return 0;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
return -EMSGSIZE;
@@ -1507,7 +1491,6 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
return;
nla_put_failure:
- genlmsg_cancel(msg, hdr);
free_msg:
nlmsg_free(msg);
kfree(ctx);
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
(NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
(!NF_NAT || NF_NAT) && \
(!NF_NAT_IPV4 || NF_NAT_IPV4) && \
- (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+ (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+ (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
select LIBCRC32C
select MPLS
select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..284aca2a252d 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
+#include <linux/static_key.h>
#include <net/ip.h>
+#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -72,10 +75,35 @@ struct ovs_conntrack_info {
struct md_mark mark;
struct md_labels labels;
#ifdef CONFIG_NF_NAT_NEEDED
- struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
+ struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */
#endif
};
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED 0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+ /* Elements in ovs_ct_limit_info->limits hash table */
+ struct hlist_node hlist_node;
+ struct rcu_head rcu;
+ u16 zone;
+ u32 limit;
+};
+
+struct ovs_ct_limit_info {
+ u32 default_limit;
+ struct hlist_head *limits;
+ struct nf_conncount_data *data;
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+ [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
+};
+#endif
+
static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -710,7 +738,7 @@ static bool skb_nfct_cached(struct net *net,
*/
static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct nf_nat_range *range,
+ const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
int hooknum, nh_off, err = NF_ACCEPT;
@@ -1036,6 +1064,89 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
return false;
}
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+ const struct ovs_ct_limit_info *info, u16 zone)
+{
+ return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+ struct ovs_ct_limit *new_ct_limit)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+
+ head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+ if (ct_limit->zone == new_ct_limit->zone) {
+ hlist_replace_rcu(&ct_limit->hlist_node,
+ &new_ct_limit->hlist_node);
+ kfree_rcu(ct_limit, rcu);
+ return;
+ }
+ }
+
+ hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+ struct hlist_node *n;
+
+ head = ct_limit_hash_bucket(info, zone);
+ hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
+ if (ct_limit->zone == zone) {
+ hlist_del_rcu(&ct_limit->hlist_node);
+ kfree_rcu(ct_limit, rcu);
+ return;
+ }
+ }
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+ struct ovs_ct_limit *ct_limit;
+ struct hlist_head *head;
+
+ head = ct_limit_hash_bucket(info, zone);
+ hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+ if (ct_limit->zone == zone)
+ return ct_limit->limit;
+ }
+
+ return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+ const struct ovs_conntrack_info *info,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+ u32 per_zone_limit, connections;
+ u32 conncount_key;
+
+ conncount_key = info->zone.id;
+
+ per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+ if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
+ return 0;
+
+ connections = nf_conncount_count(net, ct_limit_info->data,
+ &conncount_key, tuple, &info->zone);
+ if (connections > per_zone_limit)
+ return -ENOMEM;
+
+ return 0;
+}
+#endif
+
/* Lookup connection and confirm if unconfirmed. */
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -1054,6 +1165,21 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
if (!ct)
return 0;
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+ if (!nf_ct_is_confirmed(ct)) {
+ err = ovs_ct_check_limit(net, info,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ if (err) {
+ net_warn_ratelimited("openvswitch: zone: %u "
+ "execeeds conntrack limit\n",
+ info->zone.id);
+ return err;
+ }
+ }
+ }
+#endif
+
/* Set the conntrack event mask if given. NEW and DELETE events have
* their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
* typically would receive many kinds of updates. Setting the event
@@ -1655,7 +1781,420 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
nf_ct_tmpl_free(ct_info->ct);
}
-void ovs_ct_init(struct net *net)
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+/* Allocate and initialise the per-netns conntrack limit state: the info
+ * structure, an empty hash table of per-zone limits and the nf_conncount
+ * instance. Returns 0 on success or a negative errno; on failure everything
+ * allocated here has been freed again.
+ */
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+	struct ovs_ct_limit_info *info;
+	int bucket, err;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	ovs_net->ct_limit_info = info;
+	if (!info)
+		return -ENOMEM;
+
+	info->default_limit = OVS_CT_LIMIT_DEFAULT;
+	info->limits = kmalloc_array(CT_LIMIT_HASH_BUCKETS,
+				     sizeof(struct hlist_head), GFP_KERNEL);
+	if (!info->limits) {
+		kfree(info);
+		return -ENOMEM;
+	}
+
+	for (bucket = 0; bucket < CT_LIMIT_HASH_BUCKETS; bucket++)
+		INIT_HLIST_HEAD(&info->limits[bucket]);
+
+	info->data = nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+	if (!IS_ERR(info->data))
+		return 0;
+
+	err = PTR_ERR(info->data);
+	kfree(info->limits);
+	kfree(info);
+	pr_err("openvswitch: failed to init nf_conncount %d\n", err);
+	return err;
+}
+
+/* Tear down the per-netns conntrack limit state: destroy the nf_conncount
+ * instance, release every per-zone limit entry, then free the bucket array
+ * and the info structure itself.
+ */
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+	int i;
+
+	nf_conncount_destroy(net, NFPROTO_INET, info->data);
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		struct hlist_head *head = &info->limits[i];
+		struct ovs_ct_limit *ct_limit;
+		struct hlist_node *next;
+
+		/* An entry handed to kfree_rcu() must not be dereferenced
+		 * again, so the successor is fetched before the entry is
+		 * released (the _safe iterator) instead of afterwards.
+		 */
+		hlist_for_each_entry_safe(ct_limit, next, head, hlist_node)
+			kfree_rcu(ct_limit, rcu);
+	}
+	kfree(ovs_net->ct_limit_info->limits);
+	kfree(ovs_net->ct_limit_info);
+}
+
+/* Allocate a reply skb for conntrack-limit command @cmd and write its
+ * generic netlink and OVS headers. The OVS header is returned through
+ * @ovs_reply_header with dp_ifindex copied from the request.
+ * Returns the skb, or ERR_PTR(-ENOMEM) / ERR_PTR(-EMSGSIZE) on failure.
+ */
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+			     struct ovs_header **ovs_reply_header)
+{
+	struct ovs_header *request = info->userhdr;
+	struct ovs_header *hdr;
+	struct sk_buff *reply;
+
+	reply = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return ERR_PTR(-ENOMEM);
+
+	hdr = genlmsg_put(reply, info->snd_portid, info->snd_seq,
+			  &dp_ct_limit_genl_family, 0, cmd);
+	*ovs_reply_header = hdr;
+	if (!hdr) {
+		nlmsg_free(reply);
+		return ERR_PTR(-EMSGSIZE);
+	}
+
+	hdr->dp_ifindex = request->dp_ifindex;
+	return reply;
+}
+
+/* Validate that @zone_id fits in 16 bits. On success the narrowed value is
+ * stored in @pzone and true is returned; otherwise @pzone is left untouched.
+ */
+static bool check_zone_id(int zone_id, u16 *pzone)
+{
+	if (zone_id < 0 || zone_id > 65535)
+		return false;
+
+	*pzone = (u16)zone_id;
+	return true;
+}
+
+/* Apply every struct ovs_zone_limit record found in the payload of
+ * @nla_zone_limit to @info.
+ *
+ * A record for OVS_ZONE_LIMIT_DEFAULT_ZONE updates the default limit; a
+ * record whose zone id fails check_zone_id() is logged and skipped; any
+ * other record allocates a new entry and installs it with ct_limit_set().
+ * Each update is made under ovs_lock().
+ *
+ * Returns 0, or -ENOMEM if an entry cannot be allocated. In that case the
+ * records processed before the failure remain applied.
+ */
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+ struct ovs_ct_limit_info *info)
+{
+ struct ovs_zone_limit *zone_limit;
+ int rem;
+ u16 zone;
+
+ rem = NLA_ALIGN(nla_len(nla_zone_limit));
+ zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+ /* Records are packed back to back, each NLA_ALIGN(sizeof()) bytes. */
+ while (rem >= sizeof(*zone_limit)) {
+ if (unlikely(zone_limit->zone_id ==
+ OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+ ovs_lock();
+ info->default_limit = zone_limit->limit;
+ ovs_unlock();
+ } else if (unlikely(!check_zone_id(
+ zone_limit->zone_id, &zone))) {
+ OVS_NLERR(true, "zone id is out of range");
+ } else {
+ struct ovs_ct_limit *ct_limit;
+
+ ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+ if (!ct_limit)
+ return -ENOMEM;
+
+ ct_limit->zone = zone;
+ ct_limit->limit = zone_limit->limit;
+
+ ovs_lock();
+ ct_limit_set(info, ct_limit);
+ ovs_unlock();
+ }
+ rem -= NLA_ALIGN(sizeof(*zone_limit));
+ zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+ NLA_ALIGN(sizeof(*zone_limit)));
+ }
+
+ /* Trailing bytes too short to form a record are only reported. */
+ if (rem)
+ OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);
+
+ return 0;
+}
+
+/* Remove the limits named by the struct ovs_zone_limit records carried in
+ * @nla_zone_limit. A record for the default zone resets the default limit
+ * to OVS_CT_LIMIT_DEFAULT; records with an out-of-range zone id are logged
+ * and skipped. Always returns 0.
+ */
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct ovs_zone_limit *entry;
+	int rem;
+	u16 zone;
+
+	entry = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+
+	while (rem >= sizeof(*entry)) {
+		if (unlikely(entry->zone_id == OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+			ovs_lock();
+			info->default_limit = OVS_CT_LIMIT_DEFAULT;
+			ovs_unlock();
+		} else if (likely(check_zone_id(entry->zone_id, &zone))) {
+			ovs_lock();
+			ct_limit_del(info, zone);
+			ovs_unlock();
+		} else {
+			OVS_NLERR(true, "zone id is out of range");
+		}
+
+		/* Step to the next packed record. */
+		rem -= NLA_ALIGN(sizeof(*entry));
+		entry = (struct ovs_zone_limit *)((u8 *)entry +
+				NLA_ALIGN(sizeof(*entry)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+/* Append a struct ovs_zone_limit record describing the default limit of
+ * @info to @reply. Returns 0 or the error from nla_put_nohdr().
+ */
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+					  struct sk_buff *reply)
+{
+	/* The whole structure is copied into the reply, so every field must
+	 * be initialised: the designated initialiser zeroes .count, which
+	 * would otherwise expose uninitialised kernel stack to userspace.
+	 */
+	struct ovs_zone_limit zone_limit = {
+		.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE,
+		.limit   = info->default_limit,
+	};
+
+	return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+}
+
+/* Append one struct ovs_zone_limit record for @zone_id to @reply, carrying
+ * the supplied @limit and the count returned by nf_conncount_count() for
+ * that zone. Returns the result of nla_put_nohdr().
+ */
+static int __ovs_ct_limit_get_zone_limit(struct net *net,
+					 struct nf_conncount_data *data,
+					 u16 zone_id, u32 limit,
+					 struct sk_buff *reply)
+{
+	struct ovs_zone_limit record = {
+		.zone_id = zone_id,
+		.limit = limit,
+	};
+	struct nf_conntrack_zone ct_zone;
+	u32 key = zone_id;
+
+	nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
+	record.count = nf_conncount_count(net, data, &key, NULL, &ct_zone);
+
+	return nla_put_nohdr(reply, sizeof(record), &record);
+}
+
+/* Answer a GET request that names specific zones.
+ *
+ * For each struct ovs_zone_limit record in @nla_zone_limit, append a record
+ * to @reply: the default limit for OVS_ZONE_LIMIT_DEFAULT_ZONE, or the
+ * zone's limit and count otherwise. Records whose zone id fails
+ * check_zone_id() are logged and produce no output.
+ *
+ * Returns 0, or the first error from appending to @reply.
+ */
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+ struct nlattr *nla_zone_limit,
+ struct ovs_ct_limit_info *info,
+ struct sk_buff *reply)
+{
+ struct ovs_zone_limit *zone_limit;
+ int rem, err;
+ u32 limit;
+ u16 zone;
+
+ rem = NLA_ALIGN(nla_len(nla_zone_limit));
+ zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+ while (rem >= sizeof(*zone_limit)) {
+ if (unlikely(zone_limit->zone_id ==
+ OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+ err = ovs_ct_limit_get_default_limit(info, reply);
+ if (err)
+ return err;
+ } else if (unlikely(!check_zone_id(zone_limit->zone_id,
+ &zone))) {
+ OVS_NLERR(true, "zone id is out of range");
+ } else {
+ /* Only the table lookup runs under the RCU read lock. */
+ rcu_read_lock();
+ limit = ct_limit_get(info, zone);
+ rcu_read_unlock();
+
+ err = __ovs_ct_limit_get_zone_limit(
+ net, info->data, zone, limit, reply);
+ if (err)
+ return err;
+ }
+ rem -= NLA_ALIGN(sizeof(*zone_limit));
+ zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+ NLA_ALIGN(sizeof(*zone_limit)));
+ }
+
+ /* Trailing bytes too short to form a record are only reported. */
+ if (rem)
+ OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
+
+ return 0;
+}
+
+/* Answer a GET request that names no zone: append the default limit and
+ * then one record per configured zone to @reply. Returns 0 or the first
+ * error hit while appending.
+ */
+static int ovs_ct_limit_get_all_zone_limit(struct net *net,
+					   struct ovs_ct_limit_info *info,
+					   struct sk_buff *reply)
+{
+	struct ovs_ct_limit *entry;
+	int bucket, err;
+
+	err = ovs_ct_limit_get_default_limit(info, reply);
+	if (err)
+		return err;
+
+	rcu_read_lock();
+	for (bucket = 0; bucket < CT_LIMIT_HASH_BUCKETS && !err; ++bucket) {
+		hlist_for_each_entry_rcu(entry, &info->limits[bucket],
+					 hlist_node) {
+			err = __ovs_ct_limit_get_zone_limit(net, info->data,
+							    entry->zone,
+							    entry->limit,
+							    reply);
+			if (err)
+				break;
+		}
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_SET: apply the zone limits carried in the
+ * request, enable the ovs_ct_limit_enabled static branch and send back an
+ * empty reply. Returns -EINVAL when the zone limit attribute is missing.
+ */
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	struct nlattr *zone_attr = info->attrs[OVS_CT_LIMIT_ATTR_ZONE_LIMIT];
+	struct ovs_header *reply_hdr;
+	struct sk_buff *reply;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+					     &reply_hdr);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (zone_attr)
+		err = ovs_ct_limit_set_zone_limit(zone_attr, ct_limit_info);
+	else
+		err = -EINVAL;
+
+	if (err) {
+		nlmsg_free(reply);
+		return err;
+	}
+
+	static_branch_enable(&ovs_ct_limit_enabled);
+
+	genlmsg_end(reply, reply_hdr);
+	return genlmsg_reply(reply, info);
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_DEL: remove the zone limits named in the
+ * request and send back an empty reply. Returns -EINVAL when the zone limit
+ * attribute is missing.
+ */
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	struct nlattr *zone_attr = info->attrs[OVS_CT_LIMIT_ATTR_ZONE_LIMIT];
+	struct ovs_header *reply_hdr;
+	struct sk_buff *reply;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+					     &reply_hdr);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (zone_attr)
+		err = ovs_ct_limit_del_zone_limit(zone_attr, ct_limit_info);
+	else
+		err = -EINVAL;
+
+	if (err) {
+		nlmsg_free(reply);
+		return err;
+	}
+
+	genlmsg_end(reply, reply_hdr);
+	return genlmsg_reply(reply, info);
+}
+
+/* Handler for OVS_CT_LIMIT_CMD_GET: build a reply whose nested
+ * OVS_CT_LIMIT_ATTR_ZONE_LIMIT attribute holds one record per requested
+ * zone, or the default plus every configured zone when the request names
+ * none. Returns the result of genlmsg_reply() or a negative errno.
+ */
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct nlattr *nla_reply;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct net *net = sock_net(skb->sk);
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	/* nla_nest_start() returns NULL when the skb has no room for the
+	 * attribute header; nla_nest_end() must not be given that NULL.
+	 */
+	nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+	if (!nla_reply) {
+		err = -EMSGSIZE;
+		goto exit_err;
+	}
+
+	if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = ovs_ct_limit_get_zone_limit(
+			net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
+			reply);
+		if (err)
+			goto exit_err;
+	} else {
+		err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
+						      reply);
+		if (err)
+			goto exit_err;
+	}
+
+	nla_nest_end(reply, nla_reply);
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+/* Generic netlink operations of the conntrack limit family. SET and DEL are
+ * flagged GENL_ADMIN_PERM; GET carries no flags. All three share
+ * ct_limit_policy.
+ */
+static struct genl_ops ct_limit_genl_ops[] = {
+ { .cmd = OVS_CT_LIMIT_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_set,
+ },
+ { .cmd = OVS_CT_LIMIT_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_del,
+ },
+ { .cmd = OVS_CT_LIMIT_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = ct_limit_policy,
+ .doit = ovs_ct_limit_cmd_get,
+ },
+};
+
+/* The single multicast group registered with the conntrack limit family. */
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+ .name = OVS_CT_LIMIT_MCGROUP,
+};
+
+/* Generic netlink family for conntrack zone limits. Messages carry a
+ * struct ovs_header as the family-specific header, and the family is usable
+ * from any network namespace (.netnsok).
+ * NOTE(review): .parallel_ops is set, and the handlers in this file take
+ * ovs_lock() themselves around each table update.
+ */
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_CT_LIMIT_FAMILY,
+ .version = OVS_CT_LIMIT_VERSION,
+ .maxattr = OVS_CT_LIMIT_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ct_limit_genl_ops,
+ .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .mcgrps = &ovs_ct_limit_multicast_group,
+ .n_mcgrps = 1,
+ .module = THIS_MODULE,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
{
unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2205,22 @@ void ovs_ct_init(struct net *net)
} else {
ovs_net->xt_label = true;
}
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ return ovs_ct_limit_init(net, ovs_net);
+#else
+ return 0;
+#endif
}
void ovs_ct_exit(struct net *net)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ ovs_ct_limit_exit(net, ovs_net);
+#endif
+
if (ovs_net->xt_label)
nf_connlabels_put(net);
}
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
#include "flow.h"
struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
enum ovs_key_attr;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
void ovs_ct_exit(struct net *);
bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
#else
#include <linux/errno.h>
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
static inline void ovs_ct_exit(struct net *net) { }
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
#define CT_SUPPORTED_MASK 0
#endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
#endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
&dp_flow_genl_family,
&dp_packet_genl_family,
&dp_meter_genl_family,
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ &dp_ct_limit_genl_family,
+#endif
};
static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
- ovs_ct_init(net);
- return 0;
+ return ovs_ct_init(net);
}
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..c9eb267c6f7e 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,9 @@ struct dp_upcall_info {
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ struct ovs_ct_limit_info *ct_limit_info;
+#endif
/* Module reference for configuring conntrack. */
bool xt_label;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 674390b1f084..54ce66f68482 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *,
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
-static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
+static u16 packet_pick_tx_queue(struct sk_buff *skb);
struct packet_skb_cb {
union {
@@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
static int packet_direct_xmit(struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
- struct sk_buff *orig_skb = skb;
- struct netdev_queue *txq;
- int ret = NETDEV_TX_BUSY;
- bool again = false;
-
- if (unlikely(!netif_running(dev) ||
- !netif_carrier_ok(dev)))
- goto drop;
-
- skb = validate_xmit_skb_list(skb, dev, &again);
- if (skb != orig_skb)
- goto drop;
-
- packet_pick_tx_queue(dev, skb);
- txq = skb_get_tx_queue(dev, skb);
-
- local_bh_disable();
-
- HARD_TX_LOCK(dev, txq, smp_processor_id());
- if (!netif_xmit_frozen_or_drv_stopped(txq))
- ret = netdev_start_xmit(skb, dev, txq, false);
- HARD_TX_UNLOCK(dev, txq);
-
- local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
- return ret;
-drop:
- atomic_long_inc(&dev->tx_dropped);
- kfree_skb_list(skb);
- return NET_XMIT_DROP;
+ return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
@@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}
-static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
const struct net_device_ops *ops = dev->netdev_ops;
u16 queue_index;
@@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
queue_index = __packet_pick_tx_queue(dev, skb);
}
- skb_set_queue_mapping(skb, queue_index);
+ return queue_index;
}
/* __register_prot_hook must be invoked through register_prot_hook
@@ -4281,7 +4249,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
goto out;
if (po->tp_version >= TPACKET_V3 &&
req->tp_block_size <=
- BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
+ BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr))
goto out;
if (unlikely(req->tp_frame_size < po->tp_hdrlen +
po->tp_reserve))
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
Say Y here to support SMD based ipcrouter channels. SMD is the
most common transport for IPC Router.
+config QRTR_TUN
+ tristate "TUN device for Qualcomm IPC Router"
+ ---help---
+ Say Y here to expose a character device that allows user space to
+ implement endpoints of QRTR, for purpose of tunneling data to other
+ hosts or testing purposes.
+
endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
qrtr-smd-y := smd.o
+obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
+qrtr-tun-y := tun.o
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..ccff1e544c21
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Linaro Ltd */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/uaccess.h>
+
+#include "qrtr.h"
+
+/* Per-open state of the qrtr-tun character device. */
+struct qrtr_tun {
+ /* Endpoint registered with QRTR; its xmit hook is qrtr_tun_send(). */
+ struct qrtr_endpoint ep;
+
+ /* Packets queued by qrtr_tun_send() until user space reads them. */
+ struct sk_buff_head queue;
+ /* Readers and pollers wait here for @queue to become non-empty. */
+ wait_queue_head_t readq;
+};
+
+/* Endpoint xmit hook: queue @skb on the owning tun device and wake anyone
+ * waiting for data. Always returns 0.
+ */
+static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+	struct qrtr_tun *tun;
+
+	tun = container_of(ep, struct qrtr_tun, ep);
+	skb_queue_tail(&tun->queue, skb);
+
+	/* A reader may be blocked waiting for the queue to fill. */
+	wake_up_interruptible(&tun->readq);
+
+	return 0;
+}
+
+/* open() handler: allocate the per-file tun state, hook it up as a QRTR
+ * endpoint and store it in filp->private_data.
+ * Returns 0, -ENOMEM, or the error from qrtr_endpoint_register().
+ */
+static int qrtr_tun_open(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun;
+	int ret;
+
+	tun = kzalloc(sizeof(*tun), GFP_KERNEL);
+	if (!tun)
+		return -ENOMEM;
+
+	skb_queue_head_init(&tun->queue);
+	init_waitqueue_head(&tun->readq);
+
+	tun->ep.xmit = qrtr_tun_send;
+
+	filp->private_data = tun;
+
+	ret = qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
+	if (ret) {
+		/* ->release() is not invoked when ->open() fails, so the
+		 * allocation has to be undone here or it is leaked.
+		 */
+		filp->private_data = NULL;
+		kfree(tun);
+	}
+
+	return ret;
+}
+
+/* read_iter() handler: hand one queued packet to user space.
+ *
+ * Blocks until a packet is available unless the file is O_NONBLOCK
+ * (-EAGAIN) or the wait is interrupted (-ERESTARTSYS). At most
+ * iov_iter_count(@to) bytes of the packet are copied and the packet is
+ * freed either way. Returns the number of bytes copied or -EFAULT.
+ */
+static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+	int count;
+
+	for (;;) {
+		skb = skb_dequeue(&tun->queue);
+		if (skb)
+			break;
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		/* Wait until we get data or the endpoint goes away */
+		if (wait_event_interruptible(tun->readq,
+					     !skb_queue_empty(&tun->queue)))
+			return -ERESTARTSYS;
+	}
+
+	count = min_t(size_t, iov_iter_count(to), skb->len);
+	if (copy_to_iter(skb->data, count, to) != count)
+		count = -EFAULT;
+
+	kfree_skb(skb);
+
+	return count;
+}
+
+/* write_iter() handler: copy the user buffer into a temporary kernel buffer
+ * and post it to QRTR as data received on this endpoint.
+ * Returns the number of bytes consumed, -ENOMEM, -EFAULT, or the error from
+ * qrtr_endpoint_post().
+ */
+static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	size_t len = iov_iter_count(from);
+	ssize_t ret;
+	void *kbuf;
+
+	kbuf = kzalloc(len, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (!copy_from_iter_full(kbuf, len, from)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+	if (ret >= 0)
+		ret = len;
+
+out_free:
+	/* kbuf is only a staging buffer and must be released on every path.
+	 * NOTE(review): relies on qrtr_endpoint_post() copying the payload
+	 * rather than keeping the pointer (net/qrtr/qrtr.c) - confirm.
+	 */
+	kfree(kbuf);
+	return ret;
+}
+
+/* poll() handler: report the device readable whenever a packet is queued. */
+static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait)
+{
+	struct qrtr_tun *tun = filp->private_data;
+
+	poll_wait(filp, &tun->readq, wait);
+
+	if (skb_queue_empty(&tun->queue))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/* release() handler: unregister the endpoint, drop every packet still
+ * queued for the reader and free the per-file state. Always returns 0.
+ */
+static int qrtr_tun_release(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+
+	qrtr_endpoint_unregister(&tun->ep);
+
+	/* Discard all SKBs */
+	while ((skb = skb_dequeue(&tun->queue)) != NULL)
+		kfree_skb(skb);
+
+	kfree(tun);
+
+	return 0;
+}
+
+/* File operations of the qrtr-tun misc device: each open file is its own
+ * endpoint, with read/write moving whole packets and poll signalling
+ * readability.
+ */
+static const struct file_operations qrtr_tun_ops = {
+ .owner = THIS_MODULE,
+ .open = qrtr_tun_open,
+ .poll = qrtr_tun_poll,
+ .read_iter = qrtr_tun_read_iter,
+ .write_iter = qrtr_tun_write_iter,
+ .release = qrtr_tun_release,
+};
+
+/* Misc device "qrtr-tun" with a dynamically assigned minor number. */
+static struct miscdevice qrtr_tun_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "qrtr-tun",
+	.fops = &qrtr_tun_ops,
+};
+
+/* Module init: register the qrtr-tun misc device and log on failure.
+ * Returns the result of misc_register().
+ */
+static int __init qrtr_tun_init(void)
+{
+	int ret = misc_register(&qrtr_tun_miscdev);
+
+	if (ret)
+		pr_err("failed to register Qualcomm IPC Router tun device\n");
+
+	return ret;
+}
+
+/* Module exit: remove the qrtr-tun misc device registered at init. */
+static void __exit qrtr_tun_exit(void)
+{
+ misc_deregister(&qrtr_tun_miscdev);
+}
+
+module_init(qrtr_tun_init);
+module_exit(qrtr_tun_exit);
+
+MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
+MODULE_LICENSE("GPL v2");
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 59d0eb960275..a7a4e6ff9be2 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -178,9 +178,10 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
}
static struct led_trigger rfkill_any_led_trigger;
-static struct work_struct rfkill_any_work;
+static struct led_trigger rfkill_none_led_trigger;
+static struct work_struct rfkill_global_led_trigger_work;
-static void rfkill_any_led_trigger_worker(struct work_struct *work)
+static void rfkill_global_led_trigger_worker(struct work_struct *work)
{
enum led_brightness brightness = LED_OFF;
struct rfkill *rfkill;
@@ -195,30 +196,43 @@ static void rfkill_any_led_trigger_worker(struct work_struct *work)
mutex_unlock(&rfkill_global_mutex);
led_trigger_event(&rfkill_any_led_trigger, brightness);
+ led_trigger_event(&rfkill_none_led_trigger,
+ brightness == LED_OFF ? LED_FULL : LED_OFF);
}
-static void rfkill_any_led_trigger_event(void)
+static void rfkill_global_led_trigger_event(void)
{
- schedule_work(&rfkill_any_work);
+ schedule_work(&rfkill_global_led_trigger_work);
}
-static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+static int rfkill_global_led_trigger_register(void)
{
- rfkill_any_led_trigger_event();
-}
+ int ret;
+
+ INIT_WORK(&rfkill_global_led_trigger_work,
+ rfkill_global_led_trigger_worker);
-static int rfkill_any_led_trigger_register(void)
-{
- INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
rfkill_any_led_trigger.name = "rfkill-any";
- rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
- return led_trigger_register(&rfkill_any_led_trigger);
+ ret = led_trigger_register(&rfkill_any_led_trigger);
+ if (ret)
+ return ret;
+
+ rfkill_none_led_trigger.name = "rfkill-none";
+ ret = led_trigger_register(&rfkill_none_led_trigger);
+ if (ret)
+ led_trigger_unregister(&rfkill_any_led_trigger);
+ else
+ /* Delay activation until all global triggers are registered */
+ rfkill_global_led_trigger_event();
+
+ return ret;
}
-static void rfkill_any_led_trigger_unregister(void)
+static void rfkill_global_led_trigger_unregister(void)
{
+ led_trigger_unregister(&rfkill_none_led_trigger);
led_trigger_unregister(&rfkill_any_led_trigger);
- cancel_work_sync(&rfkill_any_work);
+ cancel_work_sync(&rfkill_global_led_trigger_work);
}
#else
static void rfkill_led_trigger_event(struct rfkill *rfkill)
@@ -234,16 +248,16 @@ static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
{
}
-static void rfkill_any_led_trigger_event(void)
+static void rfkill_global_led_trigger_event(void)
{
}
-static int rfkill_any_led_trigger_register(void)
+static int rfkill_global_led_trigger_register(void)
{
return 0;
}
-static void rfkill_any_led_trigger_unregister(void)
+static void rfkill_global_led_trigger_unregister(void)
{
}
#endif /* CONFIG_RFKILL_LEDS */
@@ -354,7 +368,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
spin_unlock_irqrestore(&rfkill->lock, flags);
rfkill_led_trigger_event(rfkill);
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
if (prev != curr)
rfkill_event(rfkill);
@@ -535,7 +549,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
spin_unlock_irqrestore(&rfkill->lock, flags);
rfkill_led_trigger_event(rfkill);
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
if (rfkill->registered && prev != blocked)
schedule_work(&rfkill->uevent_work);
@@ -579,7 +593,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
schedule_work(&rfkill->uevent_work);
rfkill_led_trigger_event(rfkill);
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
return blocked;
}
@@ -629,7 +643,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
schedule_work(&rfkill->uevent_work);
rfkill_led_trigger_event(rfkill);
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
}
}
EXPORT_SYMBOL(rfkill_set_states);
@@ -1046,7 +1060,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
#endif
}
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
rfkill_send_events(rfkill, RFKILL_OP_ADD);
mutex_unlock(&rfkill_global_mutex);
@@ -1079,7 +1093,7 @@ void rfkill_unregister(struct rfkill *rfkill)
mutex_lock(&rfkill_global_mutex);
rfkill_send_events(rfkill, RFKILL_OP_DEL);
list_del_init(&rfkill->node);
- rfkill_any_led_trigger_event();
+ rfkill_global_led_trigger_event();
mutex_unlock(&rfkill_global_mutex);
rfkill_led_trigger_unregister(rfkill);
@@ -1332,7 +1346,7 @@ static int __init rfkill_init(void)
if (error)
goto error_misc;
- error = rfkill_any_led_trigger_register();
+ error = rfkill_global_led_trigger_register();
if (error)
goto error_led_trigger;
@@ -1346,7 +1360,7 @@ static int __init rfkill_init(void)
#ifdef CONFIG_RFKILL_INPUT
error_input:
- rfkill_any_led_trigger_unregister();
+ rfkill_global_led_trigger_unregister();
#endif
error_led_trigger:
misc_deregister(&rfkill_miscdev);
@@ -1362,7 +1376,7 @@ static void __exit rfkill_exit(void)
#ifdef CONFIG_RFKILL_INPUT
rfkill_handler_exit();
#endif
- rfkill_any_led_trigger_unregister();
+ rfkill_global_led_trigger_unregister();
misc_deregister(&rfkill_miscdev);
class_unregister(&rfkill_class);
}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 29923ec2189c..5fb7d3254d9e 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -477,6 +477,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_PINGING, /* Ping in process */
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
+ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
};
/*
@@ -624,6 +625,7 @@ struct rxrpc_call {
*/
rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */
rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */
+ rxrpc_serial_t rx_serial; /* Highest serial received for this call */
u8 rx_winsize; /* Size of Rx window */
u8 tx_winsize; /* Maximum size of Tx window */
bool tx_phase; /* T if transmission phase, F if receive phase */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6e0d788b4dc4..20210418904b 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -392,7 +392,13 @@ recheck_state:
/* Process events */
if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
- rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
+ if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
+ (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
+ trace_rxrpc_call_reset(call);
+ rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ECONNRESET);
+ } else {
+ rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
+ }
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
goto recheck_state;
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 1350f1be8037..8229a52c2acd 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -70,7 +70,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
iov[2].iov_len = sizeof(ack_info);
pkt.whdr.epoch = htonl(conn->proto.epoch);
- pkt.whdr.cid = htonl(conn->proto.cid);
+ pkt.whdr.cid = htonl(conn->proto.cid | channel);
pkt.whdr.callNumber = htonl(call_id);
pkt.whdr.seq = 0;
pkt.whdr.type = chan->last_type;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index b5fd6381313d..608d078a4981 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1278,8 +1278,14 @@ void rxrpc_data_ready(struct sock *udp_sk)
call = NULL;
}
- if (call && sp->hdr.serviceId != call->service_id)
- call->service_id = sp->hdr.serviceId;
+ if (call) {
+ if (sp->hdr.serviceId != call->service_id)
+ call->service_id = sp->hdr.serviceId;
+ if ((int)sp->hdr.serial - (int)call->rx_serial > 0)
+ call->rx_serial = sp->hdr.serial;
+ if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
+ set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
+ }
} else {
skew = 0;
call = NULL;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 72251241665a..3f4cf930f809 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -77,9 +77,9 @@ static void free_tcf(struct tc_action *p)
static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
{
- spin_lock_bh(&idrinfo->lock);
+ spin_lock(&idrinfo->lock);
idr_remove(&idrinfo->action_idr, p->tcfa_index);
- spin_unlock_bh(&idrinfo->lock);
+ spin_unlock(&idrinfo->lock);
gen_kill_estimator(&p->tcfa_rate_est);
free_tcf(p);
}
@@ -156,7 +156,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct tc_action *p;
unsigned long id = 1;
- spin_lock_bh(&idrinfo->lock);
+ spin_lock(&idrinfo->lock);
s_i = cb->args[0];
@@ -191,7 +191,7 @@ done:
if (index >= 0)
cb->args[0] = index + 1;
- spin_unlock_bh(&idrinfo->lock);
+ spin_unlock(&idrinfo->lock);
if (n_i) {
if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
cb->args[1] = n_i;
@@ -261,9 +261,9 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
{
struct tc_action *p = NULL;
- spin_lock_bh(&idrinfo->lock);
+ spin_lock(&idrinfo->lock);
p = idr_find(&idrinfo->action_idr, index);
- spin_unlock_bh(&idrinfo->lock);
+ spin_unlock(&idrinfo->lock);
return p;
}
@@ -323,7 +323,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
}
spin_lock_init(&p->tcfa_lock);
idr_preload(GFP_KERNEL);
- spin_lock_bh(&idrinfo->lock);
+ spin_lock(&idrinfo->lock);
/* user doesn't specify an index */
if (!index) {
index = 1;
@@ -331,7 +331,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
} else {
err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
}
- spin_unlock_bh(&idrinfo->lock);
+ spin_unlock(&idrinfo->lock);
idr_preload_end();
if (err)
goto err3;
@@ -369,9 +369,9 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- spin_lock_bh(&idrinfo->lock);
+ spin_lock(&idrinfo->lock);
idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
- spin_unlock_bh(&idrinfo->lock);
+ spin_unlock(&idrinfo->lock);
}
EXPORT_SYMBOL(tcf_idr_insert);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7e28b2ce1437..526a8e491626 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -648,6 +648,11 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static size_t tcf_csum_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_csum));
+}
+
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
@@ -658,6 +663,7 @@ static struct tc_action_ops act_csum_ops = {
.cleanup = tcf_csum_cleanup,
.walk = tcf_csum_walker,
.lookup = tcf_csum_search,
+ .get_fill_size = tcf_csum_get_fill_size,
.size = sizeof(struct tcf_csum),
};
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index a57e112d9b3e..cdc3c87c53e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -103,9 +103,10 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
-bool tcf_queue_work(struct work_struct *work)
+bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
{
- return queue_work(tc_filter_wq, work);
+ INIT_RCU_WORK(rwork, func);
+ return queue_rcu_work(tc_filter_wq, rwork);
}
EXPORT_SYMBOL(tcf_queue_work);
@@ -436,6 +437,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
return idr_find(&tn->idr, block_index);
}
+/* Find tcf block.
+ * Set q, parent, cl when appropriate.
+ */
+
+static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
+ u32 *parent, unsigned long *cl,
+ int ifindex, u32 block_index,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_block *block;
+
+ if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_lookup(net, block_index);
+ if (!block) {
+ NL_SET_ERR_MSG(extack, "Block of given index was not found");
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+
+ /* Find link */
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ /* Find qdisc */
+ if (!*parent) {
+ *q = dev->qdisc;
+ *parent = (*q)->handle;
+ } else {
+ *q = qdisc_lookup(dev, TC_H_MAJ(*parent));
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ /* Is it classful? */
+ cops = (*q)->ops->cl_ops;
+ if (!cops) {
+ NL_SET_ERR_MSG(extack, "Qdisc not classful");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!cops->tcf_block) {
+ NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Do we search for filter, attached to class? */
+ if (TC_H_MIN(*parent)) {
+ *cl = cops->find(*q, *parent);
+ if (*cl == 0) {
+ NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
+ return ERR_PTR(-ENOENT);
+ }
+ }
+
+ /* And the last stroke */
+ block = cops->tcf_block(*q, *cl, extack);
+ if (!block)
+ return ERR_PTR(-EINVAL);
+ if (tcf_block_shared(block)) {
+ NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ }
+
+ return block;
+}
+
static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
{
return list_first_entry(&block->chain_list, struct tcf_chain, list);
@@ -983,9 +1056,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
q, parent, 0, event, false);
}
-/* Add/change/delete/get a filter node */
-
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
@@ -1006,8 +1077,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
int err;
int tp_created;
- if ((n->nlmsg_type != RTM_GETTFILTER) &&
- !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
replay:
@@ -1025,24 +1095,13 @@ replay:
cl = 0;
if (prio == 0) {
- switch (n->nlmsg_type) {
- case RTM_DELTFILTER:
- if (protocol || t->tcm_handle || tca[TCA_KIND]) {
- NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
- return -ENOENT;
- }
- break;
- case RTM_NEWTFILTER:
- /* If no priority is provided by the user,
- * we allocate one.
- */
- if (n->nlmsg_flags & NLM_F_CREATE) {
- prio = TC_H_MAKE(0x80000000U, 0U);
- prio_allocate = true;
- break;
- }
- /* fall-through */
- default:
+ /* If no priority is provided by the user,
+ * we allocate one.
+ */
+ if (n->nlmsg_flags & NLM_F_CREATE) {
+ prio = TC_H_MAKE(0x80000000U, 0U);
+ prio_allocate = true;
+ } else {
NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
return -ENOENT;
}
@@ -1050,66 +1109,11 @@ replay:
/* Find head of filter chain. */
- if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, t->tcm_block_index);
- if (!block) {
- NL_SET_ERR_MSG(extack, "Block of given index was not found");
- err = -EINVAL;
- goto errout;
- }
- } else {
- const struct Qdisc_class_ops *cops;
- struct net_device *dev;
-
- /* Find link */
- dev = __dev_get_by_index(net, t->tcm_ifindex);
- if (!dev)
- return -ENODEV;
-
- /* Find qdisc */
- if (!parent) {
- q = dev->qdisc;
- parent = q->handle;
- } else {
- q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
- if (!q) {
- NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
- return -EINVAL;
- }
- }
-
- /* Is it classful? */
- cops = q->ops->cl_ops;
- if (!cops) {
- NL_SET_ERR_MSG(extack, "Qdisc not classful");
- return -EINVAL;
- }
-
- if (!cops->tcf_block) {
- NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
- return -EOPNOTSUPP;
- }
-
- /* Do we search for filter, attached to class? */
- if (TC_H_MIN(parent)) {
- cl = cops->find(q, parent);
- if (cl == 0) {
- NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
- return -ENOENT;
- }
- }
-
- /* And the last stroke */
- block = cops->tcf_block(q, cl, extack);
- if (!block) {
- err = -EINVAL;
- goto errout;
- }
- if (tcf_block_shared(block)) {
- NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
- err = -EOPNOTSUPP;
- goto errout;
- }
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
}
chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
@@ -1118,19 +1122,10 @@ replay:
err = -EINVAL;
goto errout;
}
- chain = tcf_chain_get(block, chain_index,
- n->nlmsg_type == RTM_NEWTFILTER);
+ chain = tcf_chain_get(block, chain_index, true);
if (!chain) {
NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
- err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL;
- goto errout;
- }
-
- if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
- tfilter_notify_chain(net, skb, block, q, parent, n,
- chain, RTM_DELTFILTER);
- tcf_chain_flush(chain);
- err = 0;
+ err = -ENOMEM;
goto errout;
}
@@ -1151,8 +1146,7 @@ replay:
goto errout;
}
- if (n->nlmsg_type != RTM_NEWTFILTER ||
- !(n->nlmsg_flags & NLM_F_CREATE)) {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
goto errout;
@@ -1177,56 +1171,15 @@ replay:
fh = tp->ops->get(tp, t->tcm_handle);
if (!fh) {
- if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tfilter_notify(net, skb, n, tp, block, q, parent, fh,
- RTM_DELTFILTER, false);
- tcf_proto_destroy(tp, extack);
- err = 0;
- goto errout;
- }
-
- if (n->nlmsg_type != RTM_NEWTFILTER ||
- !(n->nlmsg_flags & NLM_F_CREATE)) {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
goto errout;
}
- } else {
- bool last;
-
- switch (n->nlmsg_type) {
- case RTM_NEWTFILTER:
- if (n->nlmsg_flags & NLM_F_EXCL) {
- if (tp_created)
- tcf_proto_destroy(tp, NULL);
- NL_SET_ERR_MSG(extack, "Filter already exists");
- err = -EEXIST;
- goto errout;
- }
- break;
- case RTM_DELTFILTER:
- err = tfilter_del_notify(net, skb, n, tp, block,
- q, parent, fh, false, &last,
- extack);
- if (err)
- goto errout;
- if (last) {
- tcf_chain_tp_remove(chain, &chain_info, tp);
- tcf_proto_destroy(tp, extack);
- }
- goto errout;
- case RTM_GETTFILTER:
- err = tfilter_notify(net, skb, n, tp, block, q, parent,
- fh, RTM_NEWTFILTER, true);
- if (err < 0)
- NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
- goto errout;
- default:
- NL_SET_ERR_MSG(extack, "Invalid netlink message type");
- err = -EINVAL;
- goto errout;
- }
+ } else if (n->nlmsg_flags & NLM_F_EXCL) {
+ NL_SET_ERR_MSG(extack, "Filter already exists");
+ err = -EEXIST;
+ goto errout;
}
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
@@ -1251,6 +1204,202 @@ errout:
return err;
}
+static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain_info chain_info;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ struct tcf_proto *tp = NULL;
+ unsigned long cl = 0;
+ void *fh = NULL;
+ int err;
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ parent = t->tcm_parent;
+
+ if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) {
+ NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
+ return -ENOENT;
+ }
+
+ /* Find head of filter chain. */
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
+ }
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout;
+ }
+ chain = tcf_chain_get(block, chain_index, false);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (prio == 0) {
+ tfilter_notify_chain(net, skb, block, q, parent, n,
+ chain, RTM_DELTFILTER);
+ tcf_chain_flush(chain);
+ err = 0;
+ goto errout;
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+ prio, false);
+ if (!tp || IS_ERR(tp)) {
+ NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+ err = tp ? PTR_ERR(tp) : -ENOENT;
+ goto errout;
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (!fh) {
+ if (t->tcm_handle == 0) {
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, false);
+ tcf_proto_destroy(tp, extack);
+ err = 0;
+ } else {
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
+ }
+ } else {
+ bool last;
+
+ err = tfilter_del_notify(net, skb, n, tp, block,
+ q, parent, fh, false, &last,
+ extack);
+ if (err)
+ goto errout;
+ if (last) {
+ tcf_chain_tp_remove(chain, &chain_info, tp);
+ tcf_proto_destroy(tp, extack);
+ }
+ }
+
+errout:
+ if (chain)
+ tcf_chain_put(chain);
+ return err;
+}
+
+static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain_info chain_info;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ struct tcf_proto *tp = NULL;
+ unsigned long cl = 0;
+ void *fh = NULL;
+ int err;
+
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ parent = t->tcm_parent;
+
+ if (prio == 0) {
+ NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
+ return -ENOENT;
+ }
+
+ /* Find head of filter chain. */
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block)) {
+ err = PTR_ERR(block);
+ goto errout;
+ }
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout;
+ }
+ chain = tcf_chain_get(block, chain_index, false);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+ prio, false);
+ if (!tp || IS_ERR(tp)) {
+ NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+ err = tp ? PTR_ERR(tp) : -ENOENT;
+ goto errout;
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+ NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (!fh) {
+ NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+ err = -ENOENT;
+ } else {
+ err = tfilter_notify(net, skb, n, tp, block, q, parent,
+ fh, RTM_NEWTFILTER, true);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
+ }
+
+errout:
+ if (chain)
+ tcf_chain_put(chain);
+ return err;
+}
+
struct tcf_dump_args {
struct tcf_walker w;
struct sk_buff *skb;
@@ -1633,9 +1782,9 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
tc_dump_tfilter, 0);
return 0;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6b7ab3512f5b..95367f37098d 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -35,10 +35,7 @@ struct basic_filter {
struct tcf_result res;
struct tcf_proto *tp;
struct list_head link;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -97,21 +94,14 @@ static void __basic_delete_filter(struct basic_filter *f)
static void basic_delete_filter_work(struct work_struct *work)
{
- struct basic_filter *f = container_of(work, struct basic_filter, work);
-
+ struct basic_filter *f = container_of(to_rcu_work(work),
+ struct basic_filter,
+ rwork);
rtnl_lock();
__basic_delete_filter(f);
rtnl_unlock();
}
-static void basic_delete_filter(struct rcu_head *head)
-{
- struct basic_filter *f = container_of(head, struct basic_filter, rcu);
-
- INIT_WORK(&f->work, basic_delete_filter_work);
- tcf_queue_work(&f->work);
-}
-
static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct basic_head *head = rtnl_dereference(tp->root);
@@ -122,7 +112,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
tcf_unbind_filter(tp, &f->res);
idr_remove(&head->handle_idr, f->handle);
if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, basic_delete_filter);
+ tcf_queue_work(&f->rwork, basic_delete_filter_work);
else
__basic_delete_filter(f);
}
@@ -140,7 +130,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
tcf_unbind_filter(tp, &f->res);
idr_remove(&head->handle_idr, f->handle);
tcf_exts_get_net(&f->exts);
- call_rcu(&f->rcu, basic_delete_filter);
+ tcf_queue_work(&f->rwork, basic_delete_filter_work);
*last = list_empty(&head->flist);
return 0;
}
@@ -234,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
list_replace_rcu(&fold->link, &fnew->link);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
- call_rcu(&fold->rcu, basic_delete_filter);
+ tcf_queue_work(&fold->rwork, basic_delete_filter_work);
} else {
list_add_rcu(&fnew->link, &head->flist);
}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b07c1fa8bc0d..1aa7f6511065 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -49,10 +49,7 @@ struct cls_bpf_prog {
struct sock_filter *bpf_ops;
const char *bpf_name;
struct tcf_proto *tp;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
@@ -275,21 +272,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
static void cls_bpf_delete_prog_work(struct work_struct *work)
{
- struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work);
-
+ struct cls_bpf_prog *prog = container_of(to_rcu_work(work),
+ struct cls_bpf_prog,
+ rwork);
rtnl_lock();
__cls_bpf_delete_prog(prog);
rtnl_unlock();
}
-static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
-{
- struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
- INIT_WORK(&prog->work, cls_bpf_delete_prog_work);
- tcf_queue_work(&prog->work);
-}
-
static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -300,7 +290,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
if (tcf_exts_get_net(&prog->exts))
- call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+ tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work);
else
__cls_bpf_delete_prog(prog);
}
@@ -526,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
tcf_exts_get_net(&oldprog->exts);
- call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
+ tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work);
} else {
list_add_rcu(&prog->link, &head->plist);
}
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 762da5c0cf5e..3bc01bdde165 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,10 +23,7 @@ struct cls_cgroup_head {
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
struct tcf_proto *tp;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -70,24 +67,14 @@ static void __cls_cgroup_destroy(struct cls_cgroup_head *head)
static void cls_cgroup_destroy_work(struct work_struct *work)
{
- struct cls_cgroup_head *head = container_of(work,
+ struct cls_cgroup_head *head = container_of(to_rcu_work(work),
struct cls_cgroup_head,
- work);
+ rwork);
rtnl_lock();
__cls_cgroup_destroy(head);
rtnl_unlock();
}
-static void cls_cgroup_destroy_rcu(struct rcu_head *root)
-{
- struct cls_cgroup_head *head = container_of(root,
- struct cls_cgroup_head,
- rcu);
-
- INIT_WORK(&head->work, cls_cgroup_destroy_work);
- tcf_queue_work(&head->work);
-}
-
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
@@ -134,7 +121,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
rcu_assign_pointer(tp->root, new);
if (head) {
tcf_exts_get_net(&head->exts);
- call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+ tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
}
return 0;
errout:
@@ -151,7 +138,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp,
/* Head can still be NULL due to cls_cgroup_init(). */
if (head) {
if (tcf_exts_get_net(&head->exts))
- call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+ tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
else
__cls_cgroup_destroy(head);
}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index cd5fe383afdd..2bb043cd436b 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -57,10 +57,7 @@ struct flow_filter {
u32 divisor;
u32 baseclass;
u32 hashrnd;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static inline u32 addr_fold(void *addr)
@@ -383,21 +380,14 @@ static void __flow_destroy_filter(struct flow_filter *f)
static void flow_destroy_filter_work(struct work_struct *work)
{
- struct flow_filter *f = container_of(work, struct flow_filter, work);
-
+ struct flow_filter *f = container_of(to_rcu_work(work),
+ struct flow_filter,
+ rwork);
rtnl_lock();
__flow_destroy_filter(f);
rtnl_unlock();
}
-static void flow_destroy_filter(struct rcu_head *head)
-{
- struct flow_filter *f = container_of(head, struct flow_filter, rcu);
-
- INIT_WORK(&f->work, flow_destroy_filter_work);
- tcf_queue_work(&f->work);
-}
-
static int flow_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
@@ -563,7 +553,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (fold) {
tcf_exts_get_net(&fold->exts);
- call_rcu(&fold->rcu, flow_destroy_filter);
+ tcf_queue_work(&fold->rwork, flow_destroy_filter_work);
}
return 0;
@@ -583,7 +573,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
list_del_rcu(&f->list);
tcf_exts_get_net(&f->exts);
- call_rcu(&f->rcu, flow_destroy_filter);
+ tcf_queue_work(&f->rwork, flow_destroy_filter_work);
*last = list_empty(&head->filters);
return 0;
}
@@ -608,7 +598,7 @@ static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
list_for_each_entry_safe(f, next, &head->filters, list) {
list_del_rcu(&f->list);
if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, flow_destroy_filter);
+ tcf_queue_work(&f->rwork, flow_destroy_filter_work);
else
__flow_destroy_filter(f);
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c79f6e71512e..2b5be42a9f1c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -61,24 +61,24 @@ struct fl_flow_mask_range {
struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
- struct rcu_head rcu;
+ struct rhash_head ht_node;
+ struct rhashtable ht;
+ struct rhashtable_params filter_ht_params;
+ struct flow_dissector dissector;
+ struct list_head filters;
+ struct rcu_head rcu;
+ struct list_head list;
};
struct cls_fl_head {
struct rhashtable ht;
- struct fl_flow_mask mask;
- struct flow_dissector dissector;
- bool mask_assigned;
- struct list_head filters;
- struct rhashtable_params ht_params;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct list_head masks;
+ struct rcu_work rwork;
struct idr handle_idr;
};
struct cls_fl_filter {
+ struct fl_flow_mask *mask;
struct rhash_head ht_node;
struct fl_flow_key mkey;
struct tcf_exts exts;
@@ -87,13 +87,17 @@ struct cls_fl_filter {
struct list_head list;
u32 handle;
u32 flags;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
struct net_device *hw_dev;
};
+static const struct rhashtable_params mask_ht_params = {
+ .key_offset = offsetof(struct fl_flow_mask, key),
+ .key_len = sizeof(struct fl_flow_key),
+ .head_offset = offsetof(struct fl_flow_mask, ht_node),
+ .automatic_shrinking = true,
+};
+
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
{
return mask->range.end - mask->range.start;
@@ -103,13 +107,19 @@ static void fl_mask_update_range(struct fl_flow_mask *mask)
{
const u8 *bytes = (const u8 *) &mask->key;
size_t size = sizeof(mask->key);
- size_t i, first = 0, last = size - 1;
+ size_t i, first = 0, last;
- for (i = 0; i < sizeof(mask->key); i++) {
+ for (i = 0; i < size; i++) {
+ if (bytes[i]) {
+ first = i;
+ break;
+ }
+ }
+ last = first;
+ for (i = size - 1; i != first; i--) {
if (bytes[i]) {
- if (!first && i)
- first = i;
last = i;
+ break;
}
}
mask->range.start = rounddown(first, sizeof(long));
@@ -140,12 +150,11 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
}
-static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
struct fl_flow_key *mkey)
{
- return rhashtable_lookup_fast(&head->ht,
- fl_key_get_start(mkey, &head->mask),
- head->ht_params);
+ return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
+ mask->filter_ht_params);
}
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -153,28 +162,28 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
struct cls_fl_filter *f;
+ struct fl_flow_mask *mask;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
- if (!atomic_read(&head->ht.nelems))
- return -1;
-
- fl_clear_masked_range(&skb_key, &head->mask);
+ list_for_each_entry_rcu(mask, &head->masks, list) {
+ fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
- /* skb_flow_dissect() does not set n_proto in case an unknown protocol,
- * so do it rather here.
- */
- skb_key.basic.n_proto = skb->protocol;
- skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
- skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
+ skb_key.indev_ifindex = skb->skb_iif;
+ /* skb_flow_dissect() does not set n_proto in case an unknown
+ * protocol, so do it rather here.
+ */
+ skb_key.basic.n_proto = skb->protocol;
+ skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
- fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
+ fl_set_masked_key(&skb_mkey, &skb_key, mask);
- f = fl_lookup(head, &skb_mkey);
- if (f && !tc_skip_sw(f->flags)) {
- *res = f->res;
- return tcf_exts_exec(skb, &f->exts, res);
+ f = fl_lookup(mask, &skb_mkey);
+ if (f && !tc_skip_sw(f->flags)) {
+ *res = f->res;
+ return tcf_exts_exec(skb, &f->exts, res);
+ }
}
return -1;
}
@@ -187,11 +196,28 @@ static int fl_init(struct tcf_proto *tp)
if (!head)
return -ENOBUFS;
- INIT_LIST_HEAD_RCU(&head->filters);
+ INIT_LIST_HEAD_RCU(&head->masks);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
- return 0;
+ return rhashtable_init(&head->ht, &mask_ht_params);
+}
+
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
+ bool async)
+{
+ if (!list_empty(&mask->filters))
+ return false;
+
+ rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+ rhashtable_destroy(&mask->ht);
+ list_del_rcu(&mask->list);
+ if (async)
+ kfree_rcu(mask, rcu);
+ else
+ kfree(mask);
+
+ return true;
}
static void __fl_destroy_filter(struct cls_fl_filter *f)
@@ -203,21 +229,14 @@ static void __fl_destroy_filter(struct cls_fl_filter *f)
static void fl_destroy_filter_work(struct work_struct *work)
{
- struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work);
+ struct cls_fl_filter *f = container_of(to_rcu_work(work),
+ struct cls_fl_filter, rwork);
rtnl_lock();
__fl_destroy_filter(f);
rtnl_unlock();
}
-static void fl_destroy_filter(struct rcu_head *head)
-{
- struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
-
- INIT_WORK(&f->work, fl_destroy_filter_work);
- tcf_queue_work(&f->work);
-}
-
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
@@ -234,8 +253,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
- struct flow_dissector *dissector,
- struct fl_flow_key *mask,
struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
@@ -247,8 +264,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = TC_CLSFLOWER_REPLACE;
cls_flower.cookie = (unsigned long) f;
- cls_flower.dissector = dissector;
- cls_flower.mask = mask;
+ cls_flower.dissector = &f->mask->dissector;
+ cls_flower.mask = &f->mask->key;
cls_flower.key = &f->mkey;
cls_flower.exts = &f->exts;
cls_flower.classid = f->res.classid;
@@ -283,51 +300,54 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
&cls_flower, false);
}
-static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
+ bool async = tcf_exts_get_net(&f->exts);
+ bool last;
idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
+ last = fl_mask_put(head, f->mask, async);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f, extack);
tcf_unbind_filter(tp, &f->res);
- if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, fl_destroy_filter);
+ if (async)
+ tcf_queue_work(&f->rwork, fl_destroy_filter_work);
else
__fl_destroy_filter(f);
+
+ return last;
}
static void fl_destroy_sleepable(struct work_struct *work)
{
- struct cls_fl_head *head = container_of(work, struct cls_fl_head,
- work);
- if (head->mask_assigned)
- rhashtable_destroy(&head->ht);
+ struct cls_fl_head *head = container_of(to_rcu_work(work),
+ struct cls_fl_head,
+ rwork);
+
+ rhashtable_destroy(&head->ht);
kfree(head);
module_put(THIS_MODULE);
}
-static void fl_destroy_rcu(struct rcu_head *rcu)
-{
- struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu);
-
- INIT_WORK(&head->work, fl_destroy_sleepable);
- schedule_work(&head->work);
-}
-
static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
- list_for_each_entry_safe(f, next, &head->filters, list)
- __fl_delete(tp, f, extack);
+ list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
+ list_for_each_entry_safe(f, next, &mask->filters, list) {
+ if (__fl_delete(tp, f, extack))
+ break;
+ }
+ }
idr_destroy(&head->handle_idr);
__module_get(THIS_MODULE);
- call_rcu(&head->rcu, fl_destroy_rcu);
+ tcf_queue_work(&head->rwork, fl_destroy_sleepable);
}
static void *fl_get(struct tcf_proto *tp, u32 handle)
@@ -715,14 +735,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
-static bool fl_mask_eq(struct fl_flow_mask *mask1,
- struct fl_flow_mask *mask2)
+static void fl_mask_copy(struct fl_flow_mask *dst,
+ struct fl_flow_mask *src)
{
- const long *lmask1 = fl_key_get_start(&mask1->key, mask1);
- const long *lmask2 = fl_key_get_start(&mask2->key, mask2);
+ const void *psrc = fl_key_get_start(&src->key, src);
+ void *pdst = fl_key_get_start(&dst->key, src);
- return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) &&
- !memcmp(lmask1, lmask2, fl_mask_range(mask1));
+ memcpy(pdst, psrc, fl_mask_range(src));
+ dst->range = src->range;
}
static const struct rhashtable_params fl_ht_params = {
@@ -731,14 +751,13 @@ static const struct rhashtable_params fl_ht_params = {
.automatic_shrinking = true,
};
-static int fl_init_hashtable(struct cls_fl_head *head,
- struct fl_flow_mask *mask)
+static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
{
- head->ht_params = fl_ht_params;
- head->ht_params.key_len = fl_mask_range(mask);
- head->ht_params.key_offset += mask->range.start;
+ mask->filter_ht_params = fl_ht_params;
+ mask->filter_ht_params.key_len = fl_mask_range(mask);
+ mask->filter_ht_params.key_offset += mask->range.start;
- return rhashtable_init(&head->ht, &head->ht_params);
+ return rhashtable_init(&mask->ht, &mask->filter_ht_params);
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
@@ -761,8 +780,7 @@ static int fl_init_hashtable(struct cls_fl_head *head,
FL_KEY_SET(keys, cnt, id, member); \
} while(0);
-static void fl_init_dissector(struct cls_fl_head *head,
- struct fl_flow_mask *mask)
+static void fl_init_dissector(struct fl_flow_mask *mask)
{
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
@@ -802,31 +820,66 @@ static void fl_init_dissector(struct cls_fl_head *head,
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
- skb_flow_dissector_init(&head->dissector, keys, cnt);
+ skb_flow_dissector_init(&mask->dissector, keys, cnt);
+}
+
+static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
+ struct fl_flow_mask *mask)
+{
+ struct fl_flow_mask *newmask;
+ int err;
+
+ newmask = kzalloc(sizeof(*newmask), GFP_KERNEL);
+ if (!newmask)
+ return ERR_PTR(-ENOMEM);
+
+ fl_mask_copy(newmask, mask);
+
+ err = fl_init_mask_hashtable(newmask);
+ if (err)
+ goto errout_free;
+
+ fl_init_dissector(newmask);
+
+ INIT_LIST_HEAD_RCU(&newmask->filters);
+
+ err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
+ mask_ht_params);
+ if (err)
+ goto errout_destroy;
+
+ list_add_tail_rcu(&newmask->list, &head->masks);
+
+ return newmask;
+
+errout_destroy:
+ rhashtable_destroy(&newmask->ht);
+errout_free:
+ kfree(newmask);
+
+ return ERR_PTR(err);
}
static int fl_check_assign_mask(struct cls_fl_head *head,
+ struct cls_fl_filter *fnew,
+ struct cls_fl_filter *fold,
struct fl_flow_mask *mask)
{
- int err;
+ struct fl_flow_mask *newmask;
- if (head->mask_assigned) {
- if (!fl_mask_eq(&head->mask, mask))
+ fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+ if (!fnew->mask) {
+ if (fold)
return -EINVAL;
- else
- return 0;
- }
- /* Mask is not assigned yet. So assign it and init hashtable
- * according to that.
- */
- err = fl_init_hashtable(head, mask);
- if (err)
- return err;
- memcpy(&head->mask, mask, sizeof(head->mask));
- head->mask_assigned = true;
+ newmask = fl_create_new_mask(head, mask);
+ if (IS_ERR(newmask))
+ return PTR_ERR(newmask);
- fl_init_dissector(head, mask);
+ fnew->mask = newmask;
+ } else if (fold && fold->mask != fnew->mask) {
+ return -EINVAL;
+ }
return 0;
}
@@ -924,30 +977,26 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err)
goto errout_idr;
- err = fl_check_assign_mask(head, &mask);
+ err = fl_check_assign_mask(head, fnew, fold, &mask);
if (err)
goto errout_idr;
if (!tc_skip_sw(fnew->flags)) {
- if (!fold && fl_lookup(head, &fnew->mkey)) {
+ if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
err = -EEXIST;
- goto errout_idr;
+ goto errout_mask;
}
- err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
- head->ht_params);
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
if (err)
- goto errout_idr;
+ goto errout_mask;
}
if (!tc_skip_hw(fnew->flags)) {
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- fnew,
- extack);
+ err = fl_hw_replace_filter(tp, fnew, extack);
if (err)
- goto errout_idr;
+ goto errout_mask;
}
if (!tc_in_hw(fnew->flags))
@@ -955,8 +1004,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (fold) {
if (!tc_skip_sw(fold->flags))
- rhashtable_remove_fast(&head->ht, &fold->ht_node,
- head->ht_params);
+ rhashtable_remove_fast(&fold->mask->ht,
+ &fold->ht_node,
+ fold->mask->filter_ht_params);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
}
@@ -968,14 +1018,17 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
list_replace_rcu(&fold->list, &fnew->list);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
- call_rcu(&fold->rcu, fl_destroy_filter);
+ tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
} else {
- list_add_tail_rcu(&fnew->list, &head->filters);
+ list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
}
kfree(tb);
return 0;
+errout_mask:
+ fl_mask_put(head, fnew->mask, false);
+
errout_idr:
if (!fold)
idr_remove(&head->handle_idr, fnew->handle);
@@ -994,10 +1047,10 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
struct cls_fl_filter *f = arg;
if (!tc_skip_sw(f->flags))
- rhashtable_remove_fast(&head->ht, &f->ht_node,
- head->ht_params);
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
__fl_delete(tp, f, extack);
- *last = list_empty(&head->filters);
+ *last = list_empty(&head->masks);
return 0;
}
@@ -1005,16 +1058,19 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f;
-
- list_for_each_entry_rcu(f, &head->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- break;
- }
+ struct fl_flow_mask *mask;
+
+ list_for_each_entry_rcu(mask, &head->masks, list) {
+ list_for_each_entry_rcu(f, &mask->filters, list) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ return; /* stop the whole walk; 'break' would only leave the inner per-mask loop */
+ }
skip:
- arg->count++;
+ arg->count++;
+ }
}
}
@@ -1150,7 +1206,6 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f = fh;
struct nlattr *nest;
struct fl_flow_key *key, *mask;
@@ -1169,7 +1224,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
goto nla_put_failure;
key = &f->key;
- mask = &head->mask.key;
+ mask = &f->mask->key;
if (mask->indev_ifindex) {
struct net_device *dev;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 8b207723fbc2..29eeeaf3ea44 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -47,10 +47,7 @@ struct fw_filter {
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static u32 fw_hash(u32 handle)
@@ -134,21 +131,14 @@ static void __fw_delete_filter(struct fw_filter *f)
static void fw_delete_filter_work(struct work_struct *work)
{
- struct fw_filter *f = container_of(work, struct fw_filter, work);
-
+ struct fw_filter *f = container_of(to_rcu_work(work),
+ struct fw_filter,
+ rwork);
rtnl_lock();
__fw_delete_filter(f);
rtnl_unlock();
}
-static void fw_delete_filter(struct rcu_head *head)
-{
- struct fw_filter *f = container_of(head, struct fw_filter, rcu);
-
- INIT_WORK(&f->work, fw_delete_filter_work);
- tcf_queue_work(&f->work);
-}
-
static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct fw_head *head = rtnl_dereference(tp->root);
@@ -164,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
rtnl_dereference(f->next));
tcf_unbind_filter(tp, &f->res);
if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, fw_delete_filter);
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
else
__fw_delete_filter(f);
}
@@ -193,7 +183,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
tcf_unbind_filter(tp, &f->res);
tcf_exts_get_net(&f->exts);
- call_rcu(&f->rcu, fw_delete_filter);
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
ret = 0;
break;
}
@@ -316,7 +306,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
rcu_assign_pointer(*fp, fnew);
tcf_unbind_filter(tp, &f->res);
tcf_exts_get_net(&f->exts);
- call_rcu(&f->rcu, fw_delete_filter);
+ tcf_queue_work(&f->rwork, fw_delete_filter_work);
*arg = fnew;
return err;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2ba721a590a7..47b207ef7762 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,10 +21,7 @@ struct cls_mall_head {
struct tcf_result res;
u32 handle;
u32 flags;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -53,22 +50,14 @@ static void __mall_destroy(struct cls_mall_head *head)
static void mall_destroy_work(struct work_struct *work)
{
- struct cls_mall_head *head = container_of(work, struct cls_mall_head,
- work);
+ struct cls_mall_head *head = container_of(to_rcu_work(work),
+ struct cls_mall_head,
+ rwork);
rtnl_lock();
__mall_destroy(head);
rtnl_unlock();
}
-static void mall_destroy_rcu(struct rcu_head *rcu)
-{
- struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
- rcu);
-
- INIT_WORK(&head->work, mall_destroy_work);
- tcf_queue_work(&head->work);
-}
-
static void mall_destroy_hw_filter(struct tcf_proto *tp,
struct cls_mall_head *head,
unsigned long cookie,
@@ -126,7 +115,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
if (tcf_exts_get_net(&head->exts))
- call_rcu(&head->rcu, mall_destroy_rcu);
+ tcf_queue_work(&head->rwork, mall_destroy_work);
else
__mall_destroy(head);
}
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 21a03a8ee029..0404aa5fa7cb 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -57,10 +57,7 @@ struct route4_filter {
u32 handle;
struct route4_bucket *bkt;
struct tcf_proto *tp;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -266,19 +263,17 @@ static void __route4_delete_filter(struct route4_filter *f)
static void route4_delete_filter_work(struct work_struct *work)
{
- struct route4_filter *f = container_of(work, struct route4_filter, work);
-
+ struct route4_filter *f = container_of(to_rcu_work(work),
+ struct route4_filter,
+ rwork);
rtnl_lock();
__route4_delete_filter(f);
rtnl_unlock();
}
-static void route4_delete_filter(struct rcu_head *head)
+static void route4_queue_work(struct route4_filter *f)
{
- struct route4_filter *f = container_of(head, struct route4_filter, rcu);
-
- INIT_WORK(&f->work, route4_delete_filter_work);
- tcf_queue_work(&f->work);
+ tcf_queue_work(&f->rwork, route4_delete_filter_work);
}
static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
@@ -304,7 +299,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
RCU_INIT_POINTER(b->ht[h2], next);
tcf_unbind_filter(tp, &f->res);
if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, route4_delete_filter);
+ route4_queue_work(f);
else
__route4_delete_filter(f);
}
@@ -349,7 +344,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
/* Delete it */
tcf_unbind_filter(tp, &f->res);
tcf_exts_get_net(&f->exts);
- call_rcu(&f->rcu, route4_delete_filter);
+ route4_queue_work(f);
/* Strip RTNL protected tree */
for (i = 0; i <= 32; i++) {
@@ -554,7 +549,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
if (fold) {
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
- call_rcu(&fold->rcu, route4_delete_filter);
+ route4_queue_work(fold);
}
return 0;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f1297657c27..e9ccf7daea7d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -97,10 +97,7 @@ struct rsvp_filter {
u32 handle;
struct rsvp_session *sess;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
@@ -294,21 +291,14 @@ static void __rsvp_delete_filter(struct rsvp_filter *f)
static void rsvp_delete_filter_work(struct work_struct *work)
{
- struct rsvp_filter *f = container_of(work, struct rsvp_filter, work);
-
+ struct rsvp_filter *f = container_of(to_rcu_work(work),
+ struct rsvp_filter,
+ rwork);
rtnl_lock();
__rsvp_delete_filter(f);
rtnl_unlock();
}
-static void rsvp_delete_filter_rcu(struct rcu_head *head)
-{
- struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
-
- INIT_WORK(&f->work, rsvp_delete_filter_work);
- tcf_queue_work(&f->work);
-}
-
static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -317,7 +307,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
* in cleanup() callback
*/
if (tcf_exts_get_net(&f->exts))
- call_rcu(&f->rcu, rsvp_delete_filter_rcu);
+ tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
else
__rsvp_delete_filter(f);
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index b49cc990a000..32f4bbd82f35 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -28,20 +28,14 @@
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
struct tcindex_filter {
u16 key;
struct tcindex_filter_result result;
struct tcindex_filter __rcu *next;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
};
@@ -152,21 +146,14 @@ static void tcindex_destroy_rexts_work(struct work_struct *work)
{
struct tcindex_filter_result *r;
- r = container_of(work, struct tcindex_filter_result, work);
+ r = container_of(to_rcu_work(work),
+ struct tcindex_filter_result,
+ rwork);
rtnl_lock();
__tcindex_destroy_rexts(r);
rtnl_unlock();
}
-static void tcindex_destroy_rexts(struct rcu_head *head)
-{
- struct tcindex_filter_result *r;
-
- r = container_of(head, struct tcindex_filter_result, rcu);
- INIT_WORK(&r->work, tcindex_destroy_rexts_work);
- tcf_queue_work(&r->work);
-}
-
static void __tcindex_destroy_fexts(struct tcindex_filter *f)
{
tcf_exts_destroy(&f->result.exts);
@@ -176,23 +163,15 @@ static void __tcindex_destroy_fexts(struct tcindex_filter *f)
static void tcindex_destroy_fexts_work(struct work_struct *work)
{
- struct tcindex_filter *f = container_of(work, struct tcindex_filter,
- work);
+ struct tcindex_filter *f = container_of(to_rcu_work(work),
+ struct tcindex_filter,
+ rwork);
rtnl_lock();
__tcindex_destroy_fexts(f);
rtnl_unlock();
}
-static void tcindex_destroy_fexts(struct rcu_head *head)
-{
- struct tcindex_filter *f = container_of(head, struct tcindex_filter,
- rcu);
-
- INIT_WORK(&f->work, tcindex_destroy_fexts_work);
- tcf_queue_work(&f->work);
-}
-
static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
struct netlink_ext_ack *extack)
{
@@ -228,12 +207,12 @@ found:
*/
if (f) {
if (tcf_exts_get_net(&f->result.exts))
- call_rcu(&f->rcu, tcindex_destroy_fexts);
+ tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
else
__tcindex_destroy_fexts(f);
} else {
if (tcf_exts_get_net(&r->exts))
- call_rcu(&r->rcu, tcindex_destroy_rexts);
+ tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
else
__tcindex_destroy_rexts(r);
}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index bac47b5d18fd..fb861f90fde6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,10 +68,7 @@ struct tc_u_knode {
u32 __percpu *pcpu_success;
#endif
struct tcf_proto *tp;
- union {
- struct work_struct work;
- struct rcu_head rcu;
- };
+ struct rcu_work rwork;
/* The 'sel' field MUST be the last field in structure to allow for
* tc_u32_keys allocated at end of structure.
*/
@@ -436,21 +433,14 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
*/
static void u32_delete_key_work(struct work_struct *work)
{
- struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
-
+ struct tc_u_knode *key = container_of(to_rcu_work(work),
+ struct tc_u_knode,
+ rwork);
rtnl_lock();
u32_destroy_key(key->tp, key, false);
rtnl_unlock();
}
-static void u32_delete_key_rcu(struct rcu_head *rcu)
-{
- struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
-
- INIT_WORK(&key->work, u32_delete_key_work);
- tcf_queue_work(&key->work);
-}
-
/* u32_delete_key_freepf_rcu is the rcu callback variant
* that frees the entire structure including the statistics
* percpu variables. Only use this if the key is not a copy
@@ -460,21 +450,14 @@ static void u32_delete_key_rcu(struct rcu_head *rcu)
*/
static void u32_delete_key_freepf_work(struct work_struct *work)
{
- struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
-
+ struct tc_u_knode *key = container_of(to_rcu_work(work),
+ struct tc_u_knode,
+ rwork);
rtnl_lock();
u32_destroy_key(key->tp, key, true);
rtnl_unlock();
}
-static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
-{
- struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
-
- INIT_WORK(&key->work, u32_delete_key_freepf_work);
- tcf_queue_work(&key->work);
-}
-
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
struct tc_u_knode __rcu **kp;
@@ -491,7 +474,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
tcf_unbind_filter(tp, &key->res);
idr_remove(&ht->handle_idr, key->handle);
tcf_exts_get_net(&key->exts);
- call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
+ tcf_queue_work(&key->rwork, u32_delete_key_freepf_work);
return 0;
}
}
@@ -611,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
u32_remove_hw_knode(tp, n, extack);
idr_remove(&ht->handle_idr, n->handle);
if (tcf_exts_get_net(&n->exts))
- call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
+ tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
else
u32_destroy_key(n->tp, n, true);
}
@@ -995,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
u32_replace_knode(tp, tp_c, new);
tcf_unbind_filter(tp, &n->res);
tcf_exts_get_net(&n->exts);
- call_rcu(&n->rcu, u32_delete_key_rcu);
+ tcf_queue_work(&n->rwork, u32_delete_key_work);
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39c144b6ff98..69078c82963e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return false;
}
- if (ret && netif_xmit_frozen_or_stopped(txq))
- return false;
-
return true;
}
@@ -373,33 +370,24 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
*/
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
- bool more, validate, nolock = q->flags & TCQ_F_NOLOCK;
spinlock_t *root_lock = NULL;
struct netdev_queue *txq;
struct net_device *dev;
struct sk_buff *skb;
+ bool validate;
/* Dequeue packet */
- if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
- return false;
-
skb = dequeue_skb(q, &validate, packets);
- if (unlikely(!skb)) {
- if (nolock)
- clear_bit(__QDISC_STATE_RUNNING, &q->state);
+ if (unlikely(!skb))
return false;
- }
- if (!nolock)
+ if (!(q->flags & TCQ_F_NOLOCK))
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);
- more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
- if (nolock)
- clear_bit(__QDISC_STATE_RUNNING, &q->state);
- return more;
+ return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}
void __qdisc_run(struct Qdisc *q)
@@ -665,7 +653,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
if (__skb_array_empty(q))
continue;
- skb = skb_array_consume_bh(q);
+ skb = __skb_array_consume(q);
}
if (likely(skb)) {
qdisc_qstats_cpu_backlog_dec(qdisc, skb);
@@ -706,7 +694,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
if (!q->ring.queue)
continue;
- while ((skb = skb_array_consume_bh(q)) != NULL)
+ while ((skb = __skb_array_consume(q)) != NULL)
kfree_skb(skb);
}
@@ -867,6 +855,11 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
lockdep_set_class(&sch->busylock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+ /* seqlock has the same scope as busylock, for NOLOCK qdisc */
+ spin_lock_init(&sch->seqlock);
+ lockdep_set_class(&sch->seqlock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
seqcount_init(&sch->running);
lockdep_set_class(&sch->running,
dev->qdisc_running_key ?: &qdisc_running_key);
@@ -1106,6 +1099,10 @@ static void dev_deactivate_queue(struct net_device *dev,
qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
+ bool nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
spin_lock_bh(qdisc_lock(qdisc));
if (!(qdisc->flags & TCQ_F_BUILTIN))
@@ -1115,6 +1112,8 @@ static void dev_deactivate_queue(struct net_device *dev,
qdisc_reset(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock)
+ spin_unlock_bh(&qdisc->seqlock);
}
}
@@ -1131,17 +1130,13 @@ static bool some_qdisc_is_busy(struct net_device *dev)
dev_queue = netdev_get_tx_queue(dev, i);
q = dev_queue->qdisc_sleeping;
- if (q->flags & TCQ_F_NOLOCK) {
- val = test_bit(__QDISC_STATE_SCHED, &q->state);
- } else {
- root_lock = qdisc_lock(q);
- spin_lock_bh(root_lock);
+ root_lock = qdisc_lock(q);
+ spin_lock_bh(root_lock);
- val = (qdisc_is_running(q) ||
- test_bit(__QDISC_STATE_SCHED, &q->state));
+ val = (qdisc_is_running(q) ||
+ test_bit(__QDISC_STATE_SCHED, &q->state));
- spin_unlock_bh(root_lock);
- }
+ spin_unlock_bh(root_lock);
if (val)
return true;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f062a18e9162..d6b8ae4ed7a3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -16,6 +16,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
@@ -23,12 +24,44 @@ struct mq_sched {
struct Qdisc **qdiscs;
};
+static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mq_qopt_offload opt = {
+ .command = cmd,
+ .handle = sch->handle,
+ };
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
+static void mq_offload_stats(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mq_qopt_offload opt = {
+ .command = TC_MQ_STATS,
+ .handle = sch->handle,
+ .stats = {
+ .bstats = &sch->bstats,
+ .qstats = &sch->qstats,
+ },
+ };
+
+ if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
static void mq_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
+ mq_offload(sch, TC_MQ_DESTROY);
+
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
@@ -70,6 +103,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
}
sch->flags |= TCQ_F_MQROOT;
+
+ mq_offload(sch, TC_MQ_CREATE);
return 0;
}
@@ -127,6 +162,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.qlen += qdisc->qstats.qlen;
sch->qstats.backlog += qdisc->qstats.backlog;
sch->qstats.drops += qdisc->qstats.drops;
sch->qstats.requeues += qdisc->qstats.requeues;
@@ -135,6 +171,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
+ mq_offload_stats(sch);
return 0;
}
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a47179da24e6..5d5a16204d50 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -652,33 +652,20 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
*/
peer->param_flags = asoc->param_flags;
- sctp_transport_route(peer, NULL, sp);
-
/* Initialize the pmtu of the transport. */
- if (peer->param_flags & SPP_PMTUD_DISABLE) {
- if (asoc->pathmtu)
- peer->pathmtu = asoc->pathmtu;
- else
- peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
- }
+ sctp_transport_route(peer, NULL, sp);
/* If this is the first transport addr on this association,
* initialize the association PMTU to the peer's PMTU.
* If not and the current association PMTU is higher than the new
* peer's PMTU, reset the association PMTU to the new peer's PMTU.
*/
- if (asoc->pathmtu)
- asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
- else
- asoc->pathmtu = peer->pathmtu;
-
- pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc,
- asoc->pathmtu);
+ sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
+ min_t(int, peer->pathmtu, asoc->pathmtu) :
+ peer->pathmtu);
peer->pmtu_pending = 0;
- asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
-
/* The asoc->peer.port might not be meaningful yet, but
* initialize the packet structure anyway.
*/
@@ -988,31 +975,6 @@ out:
return match;
}
-/* Is this the association we are looking for? */
-struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
- struct net *net,
- const union sctp_addr *laddr,
- const union sctp_addr *paddr)
-{
- struct sctp_transport *transport;
-
- if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
- (htons(asoc->peer.port) == paddr->v4.sin_port) &&
- net_eq(sock_net(asoc->base.sk), net)) {
- transport = sctp_assoc_lookup_paddr(asoc, paddr);
- if (!transport)
- goto out;
-
- if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
- sctp_sk(asoc->base.sk)))
- goto out;
- }
- transport = NULL;
-
-out:
- return transport;
-}
-
/* Do delayed input processing. This is scheduled by sctp_rcv(). */
static void sctp_assoc_bh_rcv(struct work_struct *work)
{
@@ -1434,6 +1396,31 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
}
}
+void sctp_assoc_update_frag_point(struct sctp_association *asoc)
+{
+ int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
+ sctp_datachk_len(&asoc->stream));
+
+ if (asoc->user_frag)
+ frag = min_t(int, frag, asoc->user_frag);
+
+ frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN -
+ sctp_datachk_len(&asoc->stream));
+
+ asoc->frag_point = SCTP_TRUNC4(frag);
+}
+
+void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
+{
+ if (asoc->pathmtu != pmtu) {
+ asoc->pathmtu = pmtu;
+ sctp_assoc_update_frag_point(asoc);
+ }
+
+ pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
+ asoc->pathmtu, asoc->frag_point);
+}
+
/* Update the association's pmtu and frag_point by going through all the
* transports. This routine is called when a transport's PMTU has changed.
*/
@@ -1446,24 +1433,16 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
return;
/* Get the lowest pmtu of all the transports. */
- list_for_each_entry(t, &asoc->peer.transport_addr_list,
- transports) {
+ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
if (t->pmtu_pending && t->dst) {
- sctp_transport_update_pmtu(
- t, SCTP_TRUNC4(dst_mtu(t->dst)));
+ sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
pmtu = t->pathmtu;
}
- if (pmtu) {
- asoc->pathmtu = pmtu;
- asoc->frag_point = sctp_frag_point(asoc, pmtu);
- }
-
- pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
- asoc->pathmtu, asoc->frag_point);
+ sctp_assoc_set_pmtu(asoc, pmtu);
}
/* Should we send a SACK to update our peer? */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index be296d633e95..79daa98208c3 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -172,8 +172,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
- struct sctp_sock *sp;
- struct sctp_af *af;
int err;
msg = sctp_datamsg_new(GFP_KERNEL);
@@ -192,12 +190,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* This is the biggest possible DATA chunk that can fit into
* the packet
*/
- sp = sctp_sk(asoc->base.sk);
- af = sp->pf->af;
- max_data = asoc->pathmtu - af->net_header_len -
- sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
- af->ip_options_len(asoc->base.sk);
- max_data = SCTP_TRUNC4(max_data);
+ max_data = asoc->frag_point;
/* If the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
@@ -222,9 +215,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
}
}
- /* Check what's our max considering the above */
- max_data = min_t(size_t, max_data, asoc->frag_point);
-
/* Set first_len and then account for possible bundles on first frag */
first_len = max_data;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 690d8557bb7b..e672dee302c7 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -90,8 +90,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
{
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
+ struct sctp_sock *sp = NULL;
struct sock *sk;
- size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr);
pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
packet->vtag = vtag;
@@ -102,28 +102,20 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
/* set packet max_size with pathmtu, then calculate overhead */
packet->max_size = tp->pathmtu;
+
if (asoc) {
- struct sctp_sock *sp = sctp_sk(asoc->base.sk);
- struct sctp_af *af = sp->pf->af;
-
- overhead = af->net_header_len +
- af->ip_options_len(asoc->base.sk);
- overhead += sizeof(struct sctphdr);
- packet->overhead = overhead;
- packet->size = overhead;
- } else {
- packet->overhead = overhead;
- packet->size = overhead;
- return;
+ sk = asoc->base.sk;
+ sp = sctp_sk(sk);
}
+ packet->overhead = sctp_mtu_payload(sp, 0, 0);
+ packet->size = packet->overhead;
+
+ if (!asoc)
+ return;
/* update dst or transport pathmtu if in need */
- sk = asoc->base.sk;
if (!sctp_transport_dst_check(tp)) {
- sctp_transport_route(tp, NULL, sctp_sk(sk));
- if (asoc->param_flags & SPP_PMTUD_ENABLE)
- sctp_assoc_sync_pmtu(asoc);
- } else if (!sctp_transport_pmtu_check(tp)) {
+ sctp_transport_route(tp, NULL, sp);
if (asoc->param_flags & SPP_PMTUD_ENABLE)
sctp_assoc_sync_pmtu(asoc);
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f211b3db6a35..d68aa33485a9 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -601,14 +601,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
/*
* Transmit DATA chunks on the retransmit queue. Upon return from
- * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
+ * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
* need to be transmitted by the caller.
* We assume that pkt->transport has already been set.
*
* The return value is a normal kernel error return value.
*/
-static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
- int rtx_timeout, int *start_timer)
+static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
+ int rtx_timeout, int *start_timer, gfp_t gfp)
{
struct sctp_transport *transport = pkt->transport;
struct sctp_chunk *chunk, *chunk1;
@@ -684,12 +684,12 @@ redo:
* control chunks are already freed so there
* is nothing we can do.
*/
- sctp_packet_transmit(pkt, GFP_ATOMIC);
+ sctp_packet_transmit(pkt, gfp);
goto redo;
}
/* Send this packet. */
- error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+ error = sctp_packet_transmit(pkt, gfp);
/* If we are retransmitting, we should only
* send a single packet.
@@ -705,7 +705,7 @@ redo:
case SCTP_XMIT_RWND_FULL:
/* Send this packet. */
- error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+ error = sctp_packet_transmit(pkt, gfp);
/* Stop sending DATA as there is no more room
* at the receiver.
@@ -715,7 +715,7 @@ redo:
case SCTP_XMIT_DELAY:
/* Send this packet. */
- error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+ error = sctp_packet_transmit(pkt, gfp);
/* Stop sending DATA because of nagle delay. */
done = 1;
@@ -776,68 +776,43 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
sctp_outq_flush(q, 0, gfp);
}
-
-/*
- * Try to flush an outqueue.
- *
- * Description: Send everything in q which we legally can, subject to
- * congestion limitations.
- * * Note: This function can be called from multiple contexts so appropriate
- * locking concerns must be made. Today we use the sock lock to protect
- * this function.
- */
-static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+static int sctp_packet_singleton(struct sctp_transport *transport,
+ struct sctp_chunk *chunk, gfp_t gfp)
{
- struct sctp_packet *packet;
+ const struct sctp_association *asoc = transport->asoc;
+ const __u16 sport = asoc->base.bind_addr.port;
+ const __u16 dport = asoc->peer.port;
+ const __u32 vtag = asoc->peer.i.init_tag;
struct sctp_packet singleton;
- struct sctp_association *asoc = q->asoc;
- __u16 sport = asoc->base.bind_addr.port;
- __u16 dport = asoc->peer.port;
- __u32 vtag = asoc->peer.i.init_tag;
- struct sctp_transport *transport = NULL;
- struct sctp_transport *new_transport;
- struct sctp_chunk *chunk, *tmp;
- enum sctp_xmit status;
- int error = 0;
- int start_timer = 0;
- int one_packet = 0;
+ sctp_packet_init(&singleton, transport, sport, dport);
+ sctp_packet_config(&singleton, vtag, 0);
+ sctp_packet_append_chunk(&singleton, chunk);
+ return sctp_packet_transmit(&singleton, gfp);
+}
+
+/* Struct to hold the context during sctp outq flush */
+struct sctp_flush_ctx {
+ struct sctp_outq *q;
+ /* Current transport being used. It's NOT the same as curr active one */
+ struct sctp_transport *transport;
/* These transports have chunks to send. */
struct list_head transport_list;
- struct list_head *ltransport;
-
- INIT_LIST_HEAD(&transport_list);
- packet = NULL;
-
- /*
- * 6.10 Bundling
- * ...
- * When bundling control chunks with DATA chunks, an
- * endpoint MUST place control chunks first in the outbound
- * SCTP packet. The transmitter MUST transmit DATA chunks
- * within a SCTP packet in increasing order of TSN.
- * ...
- */
-
- list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
- /* RFC 5061, 5.3
- * F1) This means that until such time as the ASCONF
- * containing the add is acknowledged, the sender MUST
- * NOT use the new IP address as a source for ANY SCTP
- * packet except on carrying an ASCONF Chunk.
- */
- if (asoc->src_out_of_asoc_ok &&
- chunk->chunk_hdr->type != SCTP_CID_ASCONF)
- continue;
-
- list_del_init(&chunk->list);
+ struct sctp_association *asoc;
+ /* Packet on the current transport above */
+ struct sctp_packet *packet;
+ gfp_t gfp;
+};
- /* Pick the right transport to use. */
- new_transport = chunk->transport;
+/* Pick chunk's transport; updates ctx->transport/packet if it changes */
+static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_transport *new_transport = chunk->transport;
- if (!new_transport) {
- /*
- * If we have a prior transport pointer, see if
+ if (!new_transport) {
+ if (!sctp_chunk_is_data(chunk)) {
+ /* If we have a prior transport pointer, see if
* the destination address of the chunk
* matches the destination address of the
* current transport. If not a match, then
@@ -846,22 +821,26 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
* after processing ASCONFs, we may have new
* transports created.
*/
- if (transport &&
- sctp_cmp_addr_exact(&chunk->dest,
- &transport->ipaddr))
- new_transport = transport;
+ if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest,
+ &ctx->transport->ipaddr))
+ new_transport = ctx->transport;
else
- new_transport = sctp_assoc_lookup_paddr(asoc,
- &chunk->dest);
+ new_transport = sctp_assoc_lookup_paddr(ctx->asoc,
+ &chunk->dest);
+ }
- /* if we still don't have a new transport, then
- * use the current active path.
- */
- if (!new_transport)
- new_transport = asoc->peer.active_path;
- } else if ((new_transport->state == SCTP_INACTIVE) ||
- (new_transport->state == SCTP_UNCONFIRMED) ||
- (new_transport->state == SCTP_PF)) {
+ /* if we still don't have a new transport, then
+ * use the current active path.
+ */
+ if (!new_transport)
+ new_transport = ctx->asoc->peer.active_path;
+ } else {
+ __u8 type;
+
+ switch (new_transport->state) {
+ case SCTP_INACTIVE:
+ case SCTP_UNCONFIRMED:
+ case SCTP_PF:
/* If the chunk is Heartbeat or Heartbeat Ack,
* send it to chunk->transport, even if it's
* inactive.
@@ -875,29 +854,64 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
*
* ASCONF_ACKs also must be sent to the source.
*/
- if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT &&
- chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK &&
- chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK)
- new_transport = asoc->peer.active_path;
+ type = chunk->chunk_hdr->type;
+ if (type != SCTP_CID_HEARTBEAT &&
+ type != SCTP_CID_HEARTBEAT_ACK &&
+ type != SCTP_CID_ASCONF_ACK)
+ new_transport = ctx->asoc->peer.active_path;
+ break;
+ default:
+ break;
}
+ }
+
+ /* Are we switching transports? Take care of transport locks. */
+ if (new_transport != ctx->transport) {
+ ctx->transport = new_transport;
+ ctx->packet = &ctx->transport->packet;
- /* Are we switching transports?
- * Take care of transport locks.
+ if (list_empty(&ctx->transport->send_ready))
+ list_add_tail(&ctx->transport->send_ready,
+ &ctx->transport_list);
+
+ sctp_packet_config(ctx->packet,
+ ctx->asoc->peer.i.init_tag,
+ ctx->asoc->peer.ecn_capable);
+ /* We've switched transports, so apply the
+ * Burst limit to the new transport.
*/
- if (new_transport != transport) {
- transport = new_transport;
- if (list_empty(&transport->send_ready)) {
- list_add_tail(&transport->send_ready,
- &transport_list);
- }
- packet = &transport->packet;
- sctp_packet_config(packet, vtag,
- asoc->peer.ecn_capable);
- }
+ sctp_transport_burst_limited(ctx->transport);
+ }
+}
+
+static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
+{
+ struct sctp_chunk *chunk, *tmp;
+ enum sctp_xmit status;
+ int one_packet, error;
+
+ list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) {
+ one_packet = 0;
+
+ /* RFC 5061, 5.3
+ * F1) This means that until such time as the ASCONF
+ * containing the add is acknowledged, the sender MUST
+ * NOT use the new IP address as a source for ANY SCTP
+ * packet except on carrying an ASCONF Chunk.
+ */
+ if (ctx->asoc->src_out_of_asoc_ok &&
+ chunk->chunk_hdr->type != SCTP_CID_ASCONF)
+ continue;
+
+ list_del_init(&chunk->list);
+
+ /* Pick the right transport to use. Should always be true for
+ * the first chunk as we don't have a transport by then.
+ */
+ sctp_outq_select_transport(ctx, chunk);
switch (chunk->chunk_hdr->type) {
- /*
- * 6.10 Bundling
+ /* 6.10 Bundling
* ...
* An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
* COMPLETE with any other chunks. [Send them immediately.]
@@ -905,20 +919,19 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
case SCTP_CID_INIT:
case SCTP_CID_INIT_ACK:
case SCTP_CID_SHUTDOWN_COMPLETE:
- sctp_packet_init(&singleton, transport, sport, dport);
- sctp_packet_config(&singleton, vtag, 0);
- sctp_packet_append_chunk(&singleton, chunk);
- error = sctp_packet_transmit(&singleton, gfp);
+ error = sctp_packet_singleton(ctx->transport, chunk,
+ ctx->gfp);
if (error < 0) {
- asoc->base.sk->sk_err = -error;
+ ctx->asoc->base.sk->sk_err = -error;
return;
}
break;
case SCTP_CID_ABORT:
if (sctp_test_T_bit(chunk))
- packet->vtag = asoc->c.my_vtag;
+ ctx->packet->vtag = ctx->asoc->c.my_vtag;
/* fallthru */
+
/* The following chunks are "response" chunks, i.e.
* they are generated in response to something we
* received. If we are sending these, then we can
@@ -942,27 +955,27 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
case SCTP_CID_FWD_TSN:
case SCTP_CID_I_FWD_TSN:
case SCTP_CID_RECONF:
- status = sctp_packet_transmit_chunk(packet, chunk,
- one_packet, gfp);
- if (status != SCTP_XMIT_OK) {
+ status = sctp_packet_transmit_chunk(ctx->packet, chunk,
+ one_packet, ctx->gfp);
+ if (status != SCTP_XMIT_OK) {
/* put the chunk back */
- list_add(&chunk->list, &q->control_chunk_list);
+ list_add(&chunk->list, &ctx->q->control_chunk_list);
break;
}
- asoc->stats.octrlchunks++;
+ ctx->asoc->stats.octrlchunks++;
/* PR-SCTP C5) If a FORWARD TSN is sent, the
* sender MUST assure that at least one T3-rtx
* timer is running.
*/
if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
- sctp_transport_reset_t3_rtx(transport);
- transport->last_time_sent = jiffies;
+ sctp_transport_reset_t3_rtx(ctx->transport);
+ ctx->transport->last_time_sent = jiffies;
}
- if (chunk == asoc->strreset_chunk)
- sctp_transport_reset_reconf_timer(transport);
+ if (chunk == ctx->asoc->strreset_chunk)
+ sctp_transport_reset_reconf_timer(ctx->transport);
break;
@@ -971,232 +984,186 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
BUG();
}
}
+}
- if (q->asoc->src_out_of_asoc_ok)
- goto sctp_flush_out;
+/* Returns false if new data shouldn't be sent */
+static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
+ int rtx_timeout)
+{
+ int error, start_timer = 0;
+
+ if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
+ return false;
+
+ if (ctx->transport != ctx->asoc->peer.retran_path) {
+ /* Switch transports & prepare the packet. */
+ ctx->transport = ctx->asoc->peer.retran_path;
+ ctx->packet = &ctx->transport->packet;
+
+ if (list_empty(&ctx->transport->send_ready))
+ list_add_tail(&ctx->transport->send_ready,
+ &ctx->transport_list);
+
+ sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag,
+ ctx->asoc->peer.ecn_capable);
+ }
+
+ error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout,
+ &start_timer, ctx->gfp);
+ if (error < 0)
+ ctx->asoc->base.sk->sk_err = -error;
+
+ if (start_timer) {
+ sctp_transport_reset_t3_rtx(ctx->transport);
+ ctx->transport->last_time_sent = jiffies;
+ }
+
+ /* This can happen on COOKIE-ECHO resend. Only
+ * one chunk can get bundled with a COOKIE-ECHO.
+ */
+ if (ctx->packet->has_cookie_echo)
+ return false;
+
+ /* Don't send new data if there is still data
+ * waiting to retransmit.
+ */
+ if (!list_empty(&ctx->q->retransmit))
+ return false;
+
+ return true;
+}
+
+static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
+ int rtx_timeout)
+{
+ struct sctp_chunk *chunk;
+ enum sctp_xmit status;
/* Is it OK to send data chunks? */
- switch (asoc->state) {
+ switch (ctx->asoc->state) {
case SCTP_STATE_COOKIE_ECHOED:
/* Only allow bundling when this packet has a COOKIE-ECHO
* chunk.
*/
- if (!packet || !packet->has_cookie_echo)
- break;
+ if (!ctx->packet || !ctx->packet->has_cookie_echo)
+ return;
/* fallthru */
case SCTP_STATE_ESTABLISHED:
case SCTP_STATE_SHUTDOWN_PENDING:
case SCTP_STATE_SHUTDOWN_RECEIVED:
- /*
- * RFC 2960 6.1 Transmission of DATA Chunks
- *
- * C) When the time comes for the sender to transmit,
- * before sending new DATA chunks, the sender MUST
- * first transmit any outstanding DATA chunks which
- * are marked for retransmission (limited by the
- * current cwnd).
- */
- if (!list_empty(&q->retransmit)) {
- if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
- goto sctp_flush_out;
- if (transport == asoc->peer.retran_path)
- goto retran;
-
- /* Switch transports & prepare the packet. */
-
- transport = asoc->peer.retran_path;
+ break;
- if (list_empty(&transport->send_ready)) {
- list_add_tail(&transport->send_ready,
- &transport_list);
- }
+ default:
+ /* Do nothing. */
+ return;
+ }
- packet = &transport->packet;
- sctp_packet_config(packet, vtag,
- asoc->peer.ecn_capable);
- retran:
- error = sctp_outq_flush_rtx(q, packet,
- rtx_timeout, &start_timer);
- if (error < 0)
- asoc->base.sk->sk_err = -error;
+ /* RFC 2960 6.1 Transmission of DATA Chunks
+ *
+ * C) When the time comes for the sender to transmit,
+ * before sending new DATA chunks, the sender MUST
+ * first transmit any outstanding DATA chunks which
+ * are marked for retransmission (limited by the
+ * current cwnd).
+ */
+ if (!list_empty(&ctx->q->retransmit) &&
+ !sctp_outq_flush_rtx(ctx, rtx_timeout))
+ return;
- if (start_timer) {
- sctp_transport_reset_t3_rtx(transport);
- transport->last_time_sent = jiffies;
- }
+ /* Apply Max.Burst limitation to the current transport in
+ * case it will be used for new data. We are going to
+ * reset it before we return, but we want to apply the limit
+ * to the currently queued data.
+ */
+ if (ctx->transport)
+ sctp_transport_burst_limited(ctx->transport);
- /* This can happen on COOKIE-ECHO resend. Only
- * one chunk can get bundled with a COOKIE-ECHO.
- */
- if (packet->has_cookie_echo)
- goto sctp_flush_out;
+ /* Finally, transmit new packets. */
+ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
+ __u32 sid = ntohs(chunk->subh.data_hdr->stream);
- /* Don't send new data if there is still data
- * waiting to retransmit.
- */
- if (!list_empty(&q->retransmit))
- goto sctp_flush_out;
+ /* Has this chunk expired? */
+ if (sctp_chunk_abandoned(chunk)) {
+ sctp_sched_dequeue_done(ctx->q, chunk);
+ sctp_chunk_fail(chunk, 0);
+ sctp_chunk_free(chunk);
+ continue;
}
- /* Apply Max.Burst limitation to the current transport in
- * case it will be used for new data. We are going to
- * rest it before we return, but we want to apply the limit
- * to the currently queued data.
- */
- if (transport)
- sctp_transport_burst_limited(transport);
-
- /* Finally, transmit new packets. */
- while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
- __u32 sid = ntohs(chunk->subh.data_hdr->stream);
-
- /* Has this chunk expired? */
- if (sctp_chunk_abandoned(chunk)) {
- sctp_sched_dequeue_done(q, chunk);
- sctp_chunk_fail(chunk, 0);
- sctp_chunk_free(chunk);
- continue;
- }
+ if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
+ sctp_outq_head_data(ctx->q, chunk);
+ break;
+ }
- if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
- sctp_outq_head_data(q, chunk);
- goto sctp_flush_out;
- }
+ sctp_outq_select_transport(ctx, chunk);
- /* If there is a specified transport, use it.
- * Otherwise, we want to use the active path.
+ pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n",
+ __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
+ chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
+ refcount_read(&chunk->skb->users) : -1);
+
+ /* Add the chunk to the packet. */
+ status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0,
+ ctx->gfp);
+ if (status != SCTP_XMIT_OK) {
+ /* We could not append this chunk, so put
+ * the chunk back on the output queue.
*/
- new_transport = chunk->transport;
- if (!new_transport ||
- ((new_transport->state == SCTP_INACTIVE) ||
- (new_transport->state == SCTP_UNCONFIRMED) ||
- (new_transport->state == SCTP_PF)))
- new_transport = asoc->peer.active_path;
- if (new_transport->state == SCTP_UNCONFIRMED) {
- WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
- sctp_sched_dequeue_done(q, chunk);
- sctp_chunk_fail(chunk, 0);
- sctp_chunk_free(chunk);
- continue;
- }
-
- /* Change packets if necessary. */
- if (new_transport != transport) {
- transport = new_transport;
+ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
+ __func__, ntohl(chunk->subh.data_hdr->tsn),
+ status);
- /* Schedule to have this transport's
- * packet flushed.
- */
- if (list_empty(&transport->send_ready)) {
- list_add_tail(&transport->send_ready,
- &transport_list);
- }
-
- packet = &transport->packet;
- sctp_packet_config(packet, vtag,
- asoc->peer.ecn_capable);
- /* We've switched transports, so apply the
- * Burst limit to the new transport.
- */
- sctp_transport_burst_limited(transport);
- }
-
- pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
- "skb->users:%d\n",
- __func__, q, chunk, chunk && chunk->chunk_hdr ?
- sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
- "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
- chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
- refcount_read(&chunk->skb->users) : -1);
-
- /* Add the chunk to the packet. */
- status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
-
- switch (status) {
- case SCTP_XMIT_PMTU_FULL:
- case SCTP_XMIT_RWND_FULL:
- case SCTP_XMIT_DELAY:
- /* We could not append this chunk, so put
- * the chunk back on the output queue.
- */
- pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
- __func__, ntohl(chunk->subh.data_hdr->tsn),
- status);
-
- sctp_outq_head_data(q, chunk);
- goto sctp_flush_out;
-
- case SCTP_XMIT_OK:
- /* The sender is in the SHUTDOWN-PENDING state,
- * The sender MAY set the I-bit in the DATA
- * chunk header.
- */
- if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
- chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
- if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
- asoc->stats.ouodchunks++;
- else
- asoc->stats.oodchunks++;
-
- /* Only now it's safe to consider this
- * chunk as sent, sched-wise.
- */
- sctp_sched_dequeue_done(q, chunk);
-
- break;
+ sctp_outq_head_data(ctx->q, chunk);
+ break;
+ }
- default:
- BUG();
- }
+ /* The sender is in the SHUTDOWN-PENDING state,
+ * The sender MAY set the I-bit in the DATA
+ * chunk header.
+ */
+ if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+ chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ ctx->asoc->stats.ouodchunks++;
+ else
+ ctx->asoc->stats.oodchunks++;
- /* BUG: We assume that the sctp_packet_transmit()
- * call below will succeed all the time and add the
- * chunk to the transmitted list and restart the
- * timers.
- * It is possible that the call can fail under OOM
- * conditions.
- *
- * Is this really a problem? Won't this behave
- * like a lost TSN?
- */
- list_add_tail(&chunk->transmitted_list,
- &transport->transmitted);
+ /* Only now it's safe to consider this
+ * chunk as sent, sched-wise.
+ */
+ sctp_sched_dequeue_done(ctx->q, chunk);
- sctp_transport_reset_t3_rtx(transport);
- transport->last_time_sent = jiffies;
+ list_add_tail(&chunk->transmitted_list,
+ &ctx->transport->transmitted);
- /* Only let one DATA chunk get bundled with a
- * COOKIE-ECHO chunk.
- */
- if (packet->has_cookie_echo)
- goto sctp_flush_out;
- }
- break;
+ sctp_transport_reset_t3_rtx(ctx->transport);
+ ctx->transport->last_time_sent = jiffies;
- default:
- /* Do nothing. */
- break;
+ /* Only let one DATA chunk get bundled with a
+ * COOKIE-ECHO chunk.
+ */
+ if (ctx->packet->has_cookie_echo)
+ break;
}
+}
-sctp_flush_out:
+static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
+{
+ struct list_head *ltransport;
+ struct sctp_packet *packet;
+ struct sctp_transport *t;
+ int error = 0;
- /* Before returning, examine all the transports touched in
- * this call. Right now, we bluntly force clear all the
- * transports. Things might change after we implement Nagle.
- * But such an examination is still required.
- *
- * --xguo
- */
- while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) {
- struct sctp_transport *t = list_entry(ltransport,
- struct sctp_transport,
- send_ready);
+ while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) {
+ t = list_entry(ltransport, struct sctp_transport, send_ready);
packet = &t->packet;
if (!sctp_packet_empty(packet)) {
- error = sctp_packet_transmit(packet, gfp);
+ error = sctp_packet_transmit(packet, ctx->gfp);
if (error < 0)
- asoc->base.sk->sk_err = -error;
+ ctx->q->asoc->base.sk->sk_err = -error;
}
/* Clear the burst limited state, if any */
@@ -1204,6 +1171,47 @@ sctp_flush_out:
}
}
+/* Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made. Today we use the sock lock to protect
+ * this function.
+ */
+
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+{
+ struct sctp_flush_ctx ctx = {
+ .q = q,
+ .transport = NULL,
+ .transport_list = LIST_HEAD_INIT(ctx.transport_list),
+ .asoc = q->asoc,
+ .packet = NULL,
+ .gfp = gfp,
+ };
+
+ /* 6.10 Bundling
+ * ...
+ * When bundling control chunks with DATA chunks, an
+ * endpoint MUST place control chunks first in the outbound
+ * SCTP packet. The transmitter MUST transmit DATA chunks
+ * within a SCTP packet in increasing order of TSN.
+ * ...
+ */
+
+ sctp_outq_flush_ctrl(&ctx);
+
+ if (q->asoc->src_out_of_asoc_ok)
+ goto sctp_flush_out;
+
+ sctp_outq_flush_data(&ctx, rtx_timeout);
+
+sctp_flush_out:
+
+ sctp_outq_flush_transports(&ctx);
+}
+
/* Update unack_data based on the incoming SACK chunk */
static void sctp_sack_update_unack_data(struct sctp_association *assoc,
struct sctp_sackhdr *sack)
@@ -1457,7 +1465,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* the outstanding bytes for this chunk, so only
* count bytes associated with a transport.
*/
- if (transport) {
+ if (transport && !tchunk->tsn_gap_acked) {
/* If this chunk is being used for RTT
* measurement, calculate the RTT and update
* the RTO using this value.
@@ -1469,14 +1477,34 @@ static void sctp_check_transmitted(struct sctp_outq *q,
* first instance of the packet or a later
* instance).
*/
- if (!tchunk->tsn_gap_acked &&
- !sctp_chunk_retransmitted(tchunk) &&
+ if (!sctp_chunk_retransmitted(tchunk) &&
tchunk->rtt_in_progress) {
tchunk->rtt_in_progress = 0;
rtt = jiffies - tchunk->sent_at;
sctp_transport_update_rto(transport,
rtt);
}
+
+ if (TSN_lte(tsn, sack_ctsn)) {
+ /*
+ * SFR-CACC algorithm:
+ * 2) If the SACK contains gap acks
+ * and the flag CHANGEOVER_ACTIVE is
+ * set the receiver of the SACK MUST
+ * take the following action:
+ *
+ * B) For each TSN t being acked that
+ * has not been acked in any SACK so
+ * far, set cacc_saw_newack to 1 for
+ * the destination that the TSN was
+ * sent to.
+ */
+ if (sack->num_gap_ack_blocks &&
+ q->asoc->peer.primary_path->cacc.
+ changeover_active)
+ transport->cacc.cacc_saw_newack
+ = 1;
+ }
}
/* If the chunk hasn't been marked as ACKED,
@@ -1508,28 +1536,6 @@ static void sctp_check_transmitted(struct sctp_outq *q,
restart_timer = 1;
forward_progress = true;
- if (!tchunk->tsn_gap_acked) {
- /*
- * SFR-CACC algorithm:
- * 2) If the SACK contains gap acks
- * and the flag CHANGEOVER_ACTIVE is
- * set the receiver of the SACK MUST
- * take the following action:
- *
- * B) For each TSN t being acked that
- * has not been acked in any SACK so
- * far, set cacc_saw_newack to 1 for
- * the destination that the TSN was
- * sent to.
- */
- if (transport &&
- sack->num_gap_ack_blocks &&
- q->asoc->peer.primary_path->cacc.
- changeover_active)
- transport->cacc.cacc_saw_newack
- = 1;
- }
-
list_add_tail(&tchunk->transmitted_list,
&q->sacked);
} else {
@@ -1756,7 +1762,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
if (TSN_lte(tsn, ctsn))
goto pass;
- /* 3.3.4 Selective Acknowledgement (SACK) (3):
+ /* 3.3.4 Selective Acknowledgment (SACK) (3):
*
* Gap Ack Blocks:
* These fields contain the Gap Ack Blocks. They are repeated
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index e62addb60434..4a4fd1971255 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc,
gfp_t gfp);
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
const void *data);
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
- const void *data);
/* Control chunk destructor */
static void sctp_control_release_owner(struct sk_buff *skb)
@@ -154,12 +152,11 @@ static const struct sctp_paramhdr prsctp_param = {
cpu_to_be16(sizeof(struct sctp_paramhdr)),
};
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk.
+/* A helper to initialize an op error inside a provided chunk, as most
+ * cause codes will be embedded inside an abort chunk.
*/
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+ size_t paylen)
{
struct sctp_errhdr err;
__u16 len;
@@ -167,33 +164,16 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
/* Cause code constants are now defined in network order. */
err.cause = cause_code;
len = sizeof(err) + paylen;
- err.length = htons(len);
- chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
-}
-
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk. Differs from sctp_init_cause in that it won't oops
- * if there isn't enough space in the op error chunk
- */
-static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
-{
- struct sctp_errhdr err;
- __u16 len;
-
- /* Cause code constants are now defined in network order. */
- err.cause = cause_code;
- len = sizeof(err) + paylen;
- err.length = htons(len);
+ err.length = htons(len);
if (skb_tailroom(chunk->skb) < len)
return -ENOSPC;
- chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err);
+ chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
return 0;
}
+
/* 3.3.2 Initiation (INIT) (1)
*
* This chunk is used to initiate a SCTP association between two
@@ -779,10 +759,9 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
* association. This reports on which TSN's we've seen to date,
* including duplicates and gaps.
*/
-struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc)
{
struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
- struct sctp_association *aptr = (struct sctp_association *)asoc;
struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
__u16 num_gabs, num_dup_tsns;
struct sctp_transport *trans;
@@ -857,7 +836,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
/* Add the duplicate TSN information. */
if (num_dup_tsns) {
- aptr->stats.idupchunks += num_dup_tsns;
+ asoc->stats.idupchunks += num_dup_tsns;
sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
sctp_tsnmap_get_dups(map));
}
@@ -869,11 +848,11 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
* association so no transport will match after a wrap event like this,
* Until the next sack
*/
- if (++aptr->peer.sack_generation == 0) {
+ if (++asoc->peer.sack_generation == 0) {
list_for_each_entry(trans, &asoc->peer.transport_addr_list,
transports)
trans->sack_generation = 0;
- aptr->peer.sack_generation = 1;
+ asoc->peer.sack_generation = 1;
}
nodata:
return retval;
@@ -1258,20 +1237,26 @@ nodata:
return retval;
}
-/* Create an Operation Error chunk of a fixed size,
- * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
- * This is a helper function to allocate an error chunk for
- * for those invalid parameter codes in which we may not want
- * to report all the errors, if the incoming chunk is large
+/* Create an Operation Error chunk of a fixed size, specifically,
+ * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
+ * This is a helper function to allocate an error chunk for those
+ * invalid parameter codes in which we may not want to report all the
+ * errors, if the incoming chunk is large. If it can't fit in a single
+ * packet, we ignore it.
*/
-static inline struct sctp_chunk *sctp_make_op_error_fixed(
+static inline struct sctp_chunk *sctp_make_op_error_limited(
const struct sctp_association *asoc,
const struct sctp_chunk *chunk)
{
- size_t size = asoc ? asoc->pathmtu : 0;
+ size_t size = SCTP_DEFAULT_MAXSEGMENT;
+ struct sctp_sock *sp = NULL;
- if (!size)
- size = SCTP_DEFAULT_MAXSEGMENT;
+ if (asoc) {
+ size = min_t(size_t, size, asoc->pathmtu);
+ sp = sctp_sk(asoc->base.sk);
+ }
+
+ size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
return sctp_make_op_error_space(asoc, chunk, size);
}
@@ -1523,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
return target;
}
-/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
- * space in the chunk
- */
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
- int len, const void *data)
-{
- if (skb_tailroom(chunk->skb) >= len)
- return sctp_addto_chunk(chunk, len, data);
- else
- return NULL;
-}
-
/* Append bytes from user space to the end of a chunk. Will panic if
* chunk is not big enough.
* Returns a kernel err value.
@@ -1829,6 +1802,9 @@ no_hmac:
kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
+ suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
+ __be32 n = htonl(usecs);
+
/*
* Section 3.3.10.3 Stale Cookie Error (3)
*
@@ -1837,17 +1813,12 @@ no_hmac:
* Stale Cookie Error: Indicates the receipt of a valid State
* Cookie that has expired.
*/
- len = ntohs(chunk->chunk_hdr->length);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
- if (*errp) {
- suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
- __be32 n = htonl(usecs);
-
- sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
- sizeof(n));
- sctp_addto_chunk(*errp, sizeof(n), &n);
+ *errp = sctp_make_op_error(asoc, chunk,
+ SCTP_ERROR_STALE_COOKIE, &n,
+ sizeof(n), 0);
+ if (*errp)
*error = -SCTP_IERROR_STALE_COOKIE;
- } else
+ else
*error = -SCTP_IERROR_NOMEM;
goto fail;
@@ -1998,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
if (*errp)
sctp_chunk_free(*errp);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
-
- if (*errp) {
- sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
- sctp_addto_chunk(*errp, len, param.v);
- }
+ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
+ param.v, len, 0);
/* Stop processing this chunk. */
return 0;
@@ -2128,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param(
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
*/
- if (NULL == *errp)
- *errp = sctp_make_op_error_fixed(asoc, chunk);
-
- if (*errp) {
- if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- SCTP_PAD4(ntohs(param.p->length))))
- sctp_addto_chunk_fixed(*errp,
- SCTP_PAD4(ntohs(param.p->length)),
- param.v);
- } else {
- /* If there is no memory for generating the ERROR
- * report as specified, an ABORT will be triggered
- * to the peer and the association won't be
- * established.
- */
- retval = SCTP_IERROR_NOMEM;
+ if (!*errp) {
+ *errp = sctp_make_op_error_limited(asoc, chunk);
+ if (!*errp) {
+ /* If there is no memory for generating the
+ * ERROR report as specified, an ABORT will be
+ * triggered to the peer and the association
+ * won't be established.
+ */
+ retval = SCTP_IERROR_NOMEM;
+ break;
+ }
}
+
+ if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ ntohs(param.p->length)))
+ sctp_addto_chunk(*errp, ntohs(param.p->length),
+ param.v);
break;
default:
break;
@@ -2220,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
* MUST be aborted. The ABORT chunk SHOULD contain the error
* cause 'Protocol Violation'.
*/
- if (SCTP_AUTH_RANDOM_LENGTH !=
- ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) {
+ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) -
+ sizeof(struct sctp_paramhdr)) {
sctp_process_inv_paramlength(asoc, param.p,
- chunk, err_chunk);
+ chunk, err_chunk);
retval = SCTP_IERROR_ABORT;
}
break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf747094d26b..d20f7addee19 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -644,16 +644,15 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
list_for_each_entry(trans,
&asoc->peer.transport_addr_list, transports) {
- /* Clear the source and route cache */
- sctp_transport_dst_release(trans);
trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
2*asoc->pathmtu, 4380));
trans->ssthresh = asoc->peer.i.a_rwnd;
trans->rto = asoc->rto_initial;
sctp_max_rto(asoc, trans);
trans->rtt = trans->srtt = trans->rttvar = 0;
+ /* Clear the source and route cache */
sctp_transport_route(trans, NULL,
- sctp_sk(asoc->base.sk));
+ sctp_sk(asoc->base.sk));
}
}
retval = sctp_send_asconf(asoc, chunk);
@@ -896,7 +895,6 @@ skip_mkasconf:
*/
list_for_each_entry(transport, &asoc->peer.transport_addr_list,
transports) {
- sctp_transport_dst_release(transport);
sctp_transport_route(transport, NULL,
sctp_sk(asoc->base.sk));
}
@@ -1894,6 +1892,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo)
{
struct sock *sk = asoc->base.sk;
+ struct sctp_sock *sp = sctp_sk(sk);
struct net *net = sock_net(sk);
struct sctp_datamsg *datamsg;
bool wait_connect = false;
@@ -1912,13 +1911,16 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
goto err;
}
- if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) {
+ if (sp->disable_fragments && msg_len > asoc->frag_point) {
err = -EMSGSIZE;
goto err;
}
- if (asoc->pmtu_pending)
- sctp_assoc_pending_pmtu(asoc);
+ if (asoc->pmtu_pending) {
+ if (sp->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(asoc);
+ asoc->pmtu_pending = 0;
+ }
if (sctp_wspace(asoc) < msg_len)
sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
@@ -1935,7 +1937,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (err)
goto err;
- if (sctp_sk(sk)->strm_interleave) {
+ if (sp->strm_interleave) {
timeo = sock_sndtimeo(sk, 0);
err = sctp_wait_for_connect(asoc, &timeo);
if (err)
@@ -2538,7 +2540,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
trans->pathmtu = params->spp_pathmtu;
sctp_assoc_sync_pmtu(asoc);
} else if (asoc) {
- asoc->pathmtu = params->spp_pathmtu;
+ sctp_assoc_set_pmtu(asoc, params->spp_pathmtu);
} else {
sp->pathmtu = params->spp_pathmtu;
}
@@ -3208,7 +3210,6 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign
static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
{
struct sctp_sock *sp = sctp_sk(sk);
- struct sctp_af *af = sp->pf->af;
struct sctp_assoc_value params;
struct sctp_association *asoc;
int val;
@@ -3230,30 +3231,24 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
return -EINVAL;
}
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+
if (val) {
int min_len, max_len;
+ __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
+ sizeof(struct sctp_data_chunk);
- min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
- min_len -= af->ip_options_len(sk);
- min_len -= sizeof(struct sctphdr) +
- sizeof(struct sctp_data_chunk);
-
- max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
+ min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
+ datasize);
+ max_len = SCTP_MAX_CHUNK_LEN - datasize;
if (val < min_len || val > max_len)
return -EINVAL;
}
- asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
- if (val == 0) {
- val = asoc->pathmtu - af->net_header_len;
- val -= af->ip_options_len(sk);
- val -= sizeof(struct sctphdr) +
- sctp_datachk_len(&asoc->stream);
- }
asoc->user_frag = val;
- asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+ sctp_assoc_update_frag_point(asoc);
} else {
if (params.assoc_id && sctp_style(sk, UDP))
return -EINVAL;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 47f82bd794d9..445b7ef61677 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -242,9 +242,18 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
&transport->fl, sk);
}
- if (transport->dst) {
- transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
- } else
+ if (transport->param_flags & SPP_PMTUD_DISABLE) {
+ struct sctp_association *asoc = transport->asoc;
+
+ if (!transport->pathmtu && asoc && asoc->pathmtu)
+ transport->pathmtu = asoc->pathmtu;
+ if (transport->pathmtu)
+ return;
+ }
+
+ if (transport->dst)
+ transport->pathmtu = sctp_dst_mtu(transport->dst);
+ else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -290,6 +299,7 @@ void sctp_transport_route(struct sctp_transport *transport,
struct sctp_association *asoc = transport->asoc;
struct sctp_af *af = transport->af_specific;
+ sctp_transport_dst_release(transport);
af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
if (saddr)
@@ -297,21 +307,14 @@ void sctp_transport_route(struct sctp_transport *transport,
else
af->get_saddr(opt, transport, &transport->fl);
- if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
- return;
- }
- if (transport->dst) {
- transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
+ sctp_transport_pmtu(transport, sctp_opt2sk(opt));
- /* Initialize sk->sk_rcv_saddr, if the transport is the
- * association's active path for getsockname().
- */
- if (asoc && (!asoc->peer.primary_path ||
- (transport == asoc->peer.active_path)))
- opt->pf->to_sk_saddr(&transport->saddr,
- asoc->base.sk);
- } else
- transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+ /* Initialize sk->sk_rcv_saddr, if the transport is the
+ * association's active path for getsockname().
+ */
+ if (transport->dst && asoc &&
+ (!asoc->peer.primary_path || transport == asoc->peer.active_path))
+ opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk);
}
/* Hold a reference to a transport. */
@@ -634,7 +637,7 @@ unsigned long sctp_transport_timeout(struct sctp_transport *trans)
trans->state != SCTP_PF)
timeout += trans->hbinterval;
- return timeout;
+ return max_t(unsigned long, timeout, HZ / 5);
}
/* Reset transport variables to their initial values */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 544bab42f925..973b4471b532 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -8,8 +8,6 @@
*
* Initial restrictions:
* - support for alternate links postponed
- * - partial support for non-blocking sockets only
- * - support for urgent data postponed
*
* Copyright IBM Corp. 2016, 2018
*
@@ -29,6 +27,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
+#include <asm/ioctls.h>
#include "smc.h"
#include "smc_clc.h"
@@ -45,11 +44,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
* creation
*/
-struct smc_lgr_list smc_lgr_list = { /* established link groups */
- .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
- .list = LIST_HEAD_INIT(smc_lgr_list.list),
-};
-
static void smc_tcp_listen_work(struct work_struct *);
static void smc_set_keepalive(struct sock *sk, int val)
@@ -192,8 +186,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
sk->sk_protocol = protocol;
smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
INIT_LIST_HEAD(&smc->accept_q);
spin_lock_init(&smc->accept_q_lock);
+ spin_lock_init(&smc->conn.send_lock);
sk->sk_prot->hash(sk);
sk_refcnt_debug_inc(sk);
@@ -292,19 +288,28 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
-/* register a new rmb */
-static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
+/* register a new rmb, optionally send confirm_rkey msg to register with peer */
+static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
+ bool conf_rkey)
{
/* register memory region for new rmb */
if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
rmb_desc->regerr = 1;
return -EFAULT;
}
+ if (!conf_rkey)
+ return 0;
+ /* exchange confirm_rkey msg with peer */
+ if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
+ rmb_desc->regerr = 1;
+ return -EFAULT;
+ }
return 0;
}
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
+ struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
@@ -332,7 +337,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
smc_wr_remember_qp_attr(link);
- if (smc_reg_rmb(link, smc->conn.rmb_desc))
+ if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK response over RoCE fabric */
@@ -362,7 +367,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
if (rc < 0)
return SMC_CLC_DECL_TCL;
- link->state = SMC_LNK_ACTIVE;
+ smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
return 0;
}
@@ -370,10 +375,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
static void smc_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- smc->conn.peer_conn_idx = clc->conn_idx;
+ int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+
+ smc->conn.peer_rmbe_idx = clc->rmbe_idx;
smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
- smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+ smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+ smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
static void smc_link_save_peer_info(struct smc_link *link,
@@ -386,160 +394,186 @@ static void smc_link_save_peer_info(struct smc_link *link,
link->peer_mtu = clc->qp_mtu;
}
-/* setup for RDMA connection of client */
-static int smc_connect_rdma(struct smc_sock *smc)
+/* fall back during connect */
+static int smc_connect_fallback(struct smc_sock *smc)
{
- struct smc_clc_msg_accept_confirm aclc;
- int local_contact = SMC_FIRST_CONTACT;
- struct smc_ib_device *smcibdev;
- struct smc_link *link;
- u8 srv_first_contact;
- int reason_code = 0;
- int rc = 0;
- u8 ibport;
+ smc->use_fallback = true;
+ smc_copy_sock_settings_to_clc(smc);
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+ return 0;
+}
- sock_hold(&smc->sk); /* sock put in passive closing */
+/* decline and fall back during connect */
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+{
+ int rc;
- if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
- /* peer has not signalled SMC-capability */
- smc->use_fallback = true;
- goto out_connected;
+ if (reason_code < 0) /* error, fallback is not possible */
+ return reason_code;
+ if (reason_code != SMC_CLC_DECL_REPLY) {
+ rc = smc_clc_send_decline(smc, reason_code);
+ if (rc < 0)
+ return rc;
}
+ return smc_connect_fallback(smc);
+}
- /* IPSec connections opt out of SMC-R optimizations */
- if (using_ipsec(smc)) {
- reason_code = SMC_CLC_DECL_IPSEC;
- goto decline_rdma;
- }
+/* abort connecting */
+static int smc_connect_abort(struct smc_sock *smc, int reason_code,
+ int local_contact)
+{
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_lgr_forget(smc->conn.lgr);
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+ if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return reason_code;
+}
+
+/* check if there is a rdma device available for this connection. */
+/* called for connect and listen */
+static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
+ u8 *ibport)
+{
+ int reason_code = 0;
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
- smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
- if (!smcibdev) {
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
+ if (!(*ibdev))
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
- goto decline_rdma;
- }
+
+ return reason_code;
+}
+
+/* CLC handshake during connect */
+static int smc_connect_clc(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_ib_device *ibdev, u8 ibport)
+{
+ int rc = 0;
/* do inband token exchange */
- reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
- if (reason_code < 0) {
- rc = reason_code;
- goto out_err;
- }
- if (reason_code > 0) /* configuration error */
- goto decline_rdma;
+ rc = smc_clc_send_proposal(smc, ibdev, ibport);
+ if (rc)
+ return rc;
/* receive SMC Accept CLC message */
- reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
- SMC_CLC_ACCEPT);
- if (reason_code < 0) {
- rc = reason_code;
- goto out_err;
- }
- if (reason_code > 0)
- goto decline_rdma;
+ return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_ib_device *ibdev, u8 ibport)
+{
+ int local_contact = SMC_FIRST_CONTACT;
+ struct smc_link *link;
+ int reason_code = 0;
- srv_first_contact = aclc.hdr.flag;
mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
- srv_first_contact);
+ local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
+ aclc->hdr.flag);
if (local_contact < 0) {
- rc = local_contact;
- if (rc == -ENOMEM)
+ if (local_contact == -ENOMEM)
reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
- else if (rc == -ENOLINK)
+ else if (local_contact == -ENOLINK)
reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
- goto decline_rdma_unlock;
+ else
+ reason_code = SMC_CLC_DECL_INTERR; /* other error */
+ return smc_connect_abort(smc, reason_code, 0);
}
link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
- smc_conn_save_peer_info(smc, &aclc);
+ smc_conn_save_peer_info(smc, aclc);
/* create send buffer and rmb */
- rc = smc_buf_create(smc);
- if (rc) {
- reason_code = SMC_CLC_DECL_MEM;
- goto decline_rdma_unlock;
- }
+ if (smc_buf_create(smc))
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
if (local_contact == SMC_FIRST_CONTACT)
- smc_link_save_peer_info(link, &aclc);
+ smc_link_save_peer_info(link, aclc);
- rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
- if (rc) {
- reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
- }
+ if (smc_rmb_rtoken_handling(&smc->conn, aclc))
+ return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ local_contact);
smc_close_init(smc);
smc_rx_init(smc);
if (local_contact == SMC_FIRST_CONTACT) {
- rc = smc_ib_ready_link(link);
- if (rc) {
- reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
- }
+ if (smc_ib_ready_link(link))
+ return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ local_contact);
} else {
- if (!smc->conn.rmb_desc->reused) {
- if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
- reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
- }
- }
+ if (!smc->conn.rmb_desc->reused &&
+ smc_reg_rmb(link, smc->conn.rmb_desc, true))
+ return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ local_contact);
}
smc_rmb_sync_sg_for_device(&smc->conn);
- rc = smc_clc_send_confirm(smc);
- if (rc)
- goto out_err_unlock;
+ reason_code = smc_clc_send_confirm(smc);
+ if (reason_code)
+ return smc_connect_abort(smc, reason_code, local_contact);
+
+ smc_tx_init(smc);
if (local_contact == SMC_FIRST_CONTACT) {
/* QP confirmation over RoCE fabric */
reason_code = smc_clnt_conf_first_link(smc);
- if (reason_code < 0) {
- rc = reason_code;
- goto out_err_unlock;
- }
- if (reason_code > 0)
- goto decline_rdma_unlock;
+ if (reason_code)
+ return smc_connect_abort(smc, reason_code,
+ local_contact);
}
-
mutex_unlock(&smc_create_lgr_pending);
- smc_tx_init(smc);
-out_connected:
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
- return rc ? rc : local_contact;
+ return 0;
+}
-decline_rdma_unlock:
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(smc->conn.lgr);
- mutex_unlock(&smc_create_lgr_pending);
- smc_conn_free(&smc->conn);
-decline_rdma:
- /* RDMA setup failed, switch back to TCP */
- smc->use_fallback = true;
- if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
- rc = smc_clc_send_decline(smc, reason_code);
- if (rc < 0)
- goto out_err;
- }
- goto out_connected;
+/* perform steps before actually connecting */
+static int __smc_connect(struct smc_sock *smc)
+{
+ struct smc_clc_msg_accept_confirm aclc;
+ struct smc_ib_device *ibdev;
+ int rc = 0;
+ u8 ibport;
-out_err_unlock:
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(smc->conn.lgr);
- mutex_unlock(&smc_create_lgr_pending);
- smc_conn_free(&smc->conn);
-out_err:
- if (smc->sk.sk_state == SMC_INIT)
- sock_put(&smc->sk); /* passive closing */
- return rc;
+ sock_hold(&smc->sk); /* sock put in passive closing */
+
+ if (smc->use_fallback)
+ return smc_connect_fallback(smc);
+
+ /* if peer has not signalled SMC-capability, fall back */
+ if (!tcp_sk(smc->clcsock->sk)->syn_smc)
+ return smc_connect_fallback(smc);
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(smc))
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+
+ /* check if a RDMA device is available; if not, fall back */
+ if (smc_check_rdma(smc, &ibdev, &ibport))
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+
+ /* perform CLC handshake */
+ rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
+ if (rc)
+ return smc_connect_decline_fallback(smc, rc);
+
+ /* connect using rdma */
+ rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
+ if (rc)
+ return smc_connect_decline_fallback(smc, rc);
+
+ return 0;
}
static int smc_connect(struct socket *sock, struct sockaddr *addr,
@@ -575,8 +609,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
if (rc)
goto out;
- /* setup RDMA connection */
- rc = smc_connect_rdma(smc);
+ rc = __smc_connect(smc);
if (rc < 0)
goto out;
else
@@ -716,6 +749,7 @@ void smc_close_non_accepted(struct sock *sk)
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
+ struct net *net = sock_net(smc->clcsock->sk);
struct smc_link_group *lgr = smc->conn.lgr;
struct smc_link *link;
int rest;
@@ -723,7 +757,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
link = &lgr->lnk[SMC_SINGLE_LINK];
- if (smc_reg_rmb(link, smc->conn.rmb_desc))
+ if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
return SMC_CLC_DECL_INTERR;
/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -768,184 +802,244 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
return rc;
}
- link->state = SMC_LNK_ACTIVE;
+ smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
return 0;
}
-/* setup for RDMA connection of server */
-static void smc_listen_work(struct work_struct *work)
+/* listen worker: finish */
+static void smc_listen_out(struct smc_sock *new_smc)
{
- struct smc_sock *new_smc = container_of(work, struct smc_sock,
- smc_listen_work);
- struct smc_clc_msg_proposal_prefix *pclc_prfx;
- struct socket *newclcsock = new_smc->clcsock;
struct smc_sock *lsmc = new_smc->listen_smc;
- struct smc_clc_msg_accept_confirm cclc;
- int local_contact = SMC_REUSE_CONTACT;
struct sock *newsmcsk = &new_smc->sk;
- struct smc_clc_msg_proposal *pclc;
- struct smc_ib_device *smcibdev;
- u8 buf[SMC_CLC_MAX_LEN];
- struct smc_link *link;
- int reason_code = 0;
- int rc = 0;
- u8 ibport;
- /* check if peer is smc capable */
- if (!tcp_sk(newclcsock->sk)->syn_smc) {
- new_smc->use_fallback = true;
- goto out_connected;
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ } else { /* no longer listening */
+ smc_close_non_accepted(newsmcsk);
}
+ release_sock(&lsmc->sk);
- /* do inband token exchange -
- *wait for and receive SMC Proposal CLC message
- */
- reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
- SMC_CLC_PROPOSAL);
- if (reason_code < 0)
- goto out_err;
- if (reason_code > 0)
- goto decline_rdma;
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+}
- /* IPSec connections opt out of SMC-R optimizations */
- if (using_ipsec(new_smc)) {
- reason_code = SMC_CLC_DECL_IPSEC;
- goto decline_rdma;
- }
+/* listen worker: finish in state connected */
+static void smc_listen_out_connected(struct smc_sock *new_smc)
+{
+ struct sock *newsmcsk = &new_smc->sk;
- /* PNET table look up: search active ib_device and port
- * within same PNETID that also contains the ethernet device
- * used for the internal TCP socket
- */
- smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
- if (!smcibdev) {
- reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
- goto decline_rdma;
+ sk_refcnt_debug_inc(newsmcsk);
+ if (newsmcsk->sk_state == SMC_INIT)
+ newsmcsk->sk_state = SMC_ACTIVE;
+
+ smc_listen_out(new_smc);
+}
+
+/* listen worker: finish in error state */
+static void smc_listen_out_err(struct smc_sock *new_smc)
+{
+ struct sock *newsmcsk = &new_smc->sk;
+
+ if (newsmcsk->sk_state == SMC_INIT)
+ sock_put(&new_smc->sk); /* passive closing */
+ newsmcsk->sk_state = SMC_CLOSED;
+ smc_conn_free(&new_smc->conn);
+
+ smc_listen_out(new_smc);
+}
+
+/* listen worker: decline and fall back if possible */
+static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
+ int local_contact)
+{
+ /* RDMA setup failed, switch back to TCP */
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_lgr_forget(new_smc->conn.lgr);
+ if (reason_code < 0) { /* error, no fallback possible */
+ smc_listen_out_err(new_smc);
+ return;
+ }
+ smc_conn_free(&new_smc->conn);
+ new_smc->use_fallback = true;
+ if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
+ if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+ smc_listen_out_err(new_smc);
+ return;
+ }
}
+ smc_listen_out_connected(new_smc);
+}
+
+/* listen worker: check prefixes */
+static int smc_listen_rdma_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct socket *newclcsock = new_smc->clcsock;
- pclc = (struct smc_clc_msg_proposal *)&buf;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (smc_clc_prfx_match(newclcsock, pclc_prfx))
+ return SMC_CLC_DECL_CNFERR;
- rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
- if (rc) {
- reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
- goto decline_rdma;
- }
+ return 0;
+}
+/* listen worker: initialize connection and buffers */
+static int smc_listen_rdma_init(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_ib_device *ibdev, u8 ibport,
+ int *local_contact)
+{
/* allocate connection / link group */
- mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
- 0);
- if (local_contact < 0) {
- rc = local_contact;
- if (rc == -ENOMEM)
- reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
- goto decline_rdma_unlock;
+ *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
+ if (*local_contact < 0) {
+ if (*local_contact == -ENOMEM)
+ return SMC_CLC_DECL_MEM;/* insufficient memory*/
+ return SMC_CLC_DECL_INTERR; /* other error */
}
- link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
/* create send buffer and rmb */
- rc = smc_buf_create(new_smc);
- if (rc) {
- reason_code = SMC_CLC_DECL_MEM;
- goto decline_rdma_unlock;
- }
+ if (smc_buf_create(new_smc))
+ return SMC_CLC_DECL_MEM;
- smc_close_init(new_smc);
- smc_rx_init(new_smc);
+ return 0;
+}
+
+/* listen worker: register buffers */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+{
+ struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
if (local_contact != SMC_FIRST_CONTACT) {
if (!new_smc->conn.rmb_desc->reused) {
- if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
- reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
- }
+ if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
+ return SMC_CLC_DECL_INTERR;
}
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
- rc = smc_clc_send_accept(new_smc, local_contact);
- if (rc)
- goto out_err_unlock;
+ return 0;
+}
+
+/* listen worker: finish RDMA setup */
+static void smc_listen_rdma_finish(struct smc_sock *new_smc,
+ struct smc_clc_msg_accept_confirm *cclc,
+ int local_contact)
+{
+ struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+ int reason_code = 0;
- /* receive SMC Confirm CLC message */
- reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
- SMC_CLC_CONFIRM);
- if (reason_code < 0)
- goto out_err_unlock;
- if (reason_code > 0)
- goto decline_rdma_unlock;
- smc_conn_save_peer_info(new_smc, &cclc);
if (local_contact == SMC_FIRST_CONTACT)
- smc_link_save_peer_info(link, &cclc);
+ smc_link_save_peer_info(link, cclc);
- rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
- if (rc) {
+ if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
+ goto decline;
}
if (local_contact == SMC_FIRST_CONTACT) {
- rc = smc_ib_ready_link(link);
- if (rc) {
+ if (smc_ib_ready_link(link)) {
reason_code = SMC_CLC_DECL_INTERR;
- goto decline_rdma_unlock;
+ goto decline;
}
/* QP confirmation over RoCE fabric */
reason_code = smc_serv_conf_first_link(new_smc);
- if (reason_code < 0)
- /* peer is not aware of a problem */
- goto out_err_unlock;
- if (reason_code > 0)
- goto decline_rdma_unlock;
+ if (reason_code)
+ goto decline;
}
+ return;
- smc_tx_init(new_smc);
+decline:
mutex_unlock(&smc_create_lgr_pending);
+ smc_listen_decline(new_smc, reason_code, local_contact);
+}
-out_connected:
- sk_refcnt_debug_inc(newsmcsk);
- if (newsmcsk->sk_state == SMC_INIT)
- newsmcsk->sk_state = SMC_ACTIVE;
-enqueue:
- lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
- if (lsmc->sk.sk_state == SMC_LISTEN) {
- smc_accept_enqueue(&lsmc->sk, newsmcsk);
- } else { /* no longer listening */
- smc_close_non_accepted(newsmcsk);
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_clc_msg_accept_confirm cclc;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_ib_device *ibdev;
+ u8 buf[SMC_CLC_MAX_LEN];
+ int local_contact = 0;
+ int reason_code = 0;
+ int rc = 0;
+ u8 ibport;
+
+ if (new_smc->use_fallback) {
+ smc_listen_out_connected(new_smc);
+ return;
}
- release_sock(&lsmc->sk);
- /* Wake up accept */
- lsmc->sk.sk_data_ready(&lsmc->sk);
- sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
- return;
+ /* check if peer is smc capable */
+ if (!tcp_sk(newclcsock->sk)->syn_smc) {
+ new_smc->use_fallback = true;
+ smc_listen_out_connected(new_smc);
+ return;
+ }
-decline_rdma_unlock:
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- mutex_unlock(&smc_create_lgr_pending);
-decline_rdma:
- /* RDMA setup failed, switch back to TCP */
- smc_conn_free(&new_smc->conn);
- new_smc->use_fallback = true;
- if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
- if (smc_clc_send_decline(new_smc, reason_code) < 0)
- goto out_err;
+ /* do inband token exchange -
+ * wait for and receive SMC Proposal CLC message
+ */
+ pclc = (struct smc_clc_msg_proposal *)&buf;
+ reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+ SMC_CLC_PROPOSAL);
+ if (reason_code) {
+ smc_listen_decline(new_smc, reason_code, 0);
+ return;
}
- goto out_connected;
-out_err_unlock:
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(new_smc)) {
+ smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
+ return;
+ }
+
+ mutex_lock(&smc_create_lgr_pending);
+ smc_close_init(new_smc);
+ smc_rx_init(new_smc);
+ smc_tx_init(new_smc);
+
+ /* check if RDMA is available */
+ if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
+ smc_listen_rdma_check(new_smc, pclc) ||
+ smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
+ &local_contact) ||
+ smc_listen_rdma_reg(new_smc, local_contact)) {
+ /* SMC not supported, decline */
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
+ return;
+ }
+
+ /* send SMC Accept CLC message */
+ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc) {
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_listen_decline(new_smc, rc, local_contact);
+ return;
+ }
+
+ /* receive SMC Confirm CLC message */
+ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ SMC_CLC_CONFIRM);
+ if (reason_code) {
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_listen_decline(new_smc, reason_code, local_contact);
+ return;
+ }
+
+ /* finish worker */
+ smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+ smc_conn_save_peer_info(new_smc, &cclc);
mutex_unlock(&smc_create_lgr_pending);
-out_err:
- if (newsmcsk->sk_state == SMC_INIT)
- sock_put(&new_smc->sk); /* passive closing */
- newsmcsk->sk_state = SMC_CLOSED;
- smc_conn_free(&new_smc->conn);
- goto enqueue; /* queue new sock with sk_err set */
+ smc_listen_out_connected(new_smc);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -965,7 +1059,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
continue;
new_smc->listen_smc = lsmc;
- new_smc->use_fallback = false; /* assume rdma capability first*/
+ new_smc->use_fallback = lsmc->use_fallback;
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
@@ -1001,7 +1095,8 @@ static int smc_listen(struct socket *sock, int backlog)
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
- tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ if (!smc->use_fallback)
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
@@ -1034,6 +1129,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
if (lsmc->sk.sk_state != SMC_LISTEN) {
rc = -EINVAL;
+ release_sock(sk);
goto out;
}
@@ -1061,9 +1157,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
if (!rc)
rc = sock_error(nsk);
+ release_sock(sk);
+ if (rc)
+ goto out;
+
+ if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+ /* wait till data arrives on the socket */
+ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+ MSEC_PER_SEC);
+ if (smc_sk(nsk)->use_fallback) {
+ struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+ lock_sock(clcsk);
+ if (skb_queue_empty(&clcsk->sk_receive_queue))
+ sk_wait_data(clcsk, &timeo, NULL);
+ release_sock(clcsk);
+ } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+ lock_sock(nsk);
+ smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
+ release_sock(nsk);
+ }
+ }
out:
- release_sock(sk);
sock_put(sk); /* sock_hold above */
return rc;
}
@@ -1094,6 +1210,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
(sk->sk_state != SMC_APPCLOSEWAIT1) &&
(sk->sk_state != SMC_INIT))
goto out;
+
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ if (sk->sk_state == SMC_INIT) {
+ smc->use_fallback = true;
+ } else {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
if (smc->use_fallback)
rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
else
@@ -1122,10 +1248,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
goto out;
}
- if (smc->use_fallback)
+ if (smc->use_fallback) {
rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
- else
- rc = smc_rx_recvmsg(smc, msg, len, flags);
+ } else {
+ msg->msg_namelen = 0;
+ rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+ }
out:
release_sock(sk);
@@ -1172,7 +1300,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
if (sk->sk_state == SMC_INIT &&
mask & EPOLLOUT &&
smc->clcsock->sk->sk_state != TCP_CLOSE) {
- rc = smc_connect_rdma(smc);
+ rc = __smc_connect(smc);
if (rc < 0)
mask |= EPOLLERR;
/* success cases including fallback */
@@ -1208,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
if (sk->sk_state == SMC_APPCLOSEWAIT1)
mask |= EPOLLIN;
}
+ if (smc->conn.urg_state == SMC_URG_VALID)
+ mask |= EPOLLPRI;
}
release_sock(sk);
@@ -1273,14 +1403,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
+ int val, rc;
smc = smc_sk(sk);
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
- return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ if (smc->clcsock->sk->sk_err) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ sk->sk_error_report(sk);
+ }
+ if (rc)
+ return rc;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ get_user(val, (int __user *)optval);
+
+ lock_sock(sk);
+ switch (optname) {
+ case TCP_ULP:
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ /* option not supported by SMC */
+ if (sk->sk_state == SMC_INIT) {
+ smc->use_fallback = true;
+ } else {
+ if (!smc->use_fallback)
+ rc = -EINVAL;
+ }
+ break;
+ case TCP_NODELAY:
+ if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+ if (val && !smc->use_fallback)
+ mod_delayed_work(system_wq, &smc->conn.tx_work,
+ 0);
+ }
+ break;
+ case TCP_CORK:
+ if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+ if (!val && !smc->use_fallback)
+ mod_delayed_work(system_wq, &smc->conn.tx_work,
+ 0);
+ }
+ break;
+ case TCP_DEFER_ACCEPT:
+ smc->sockopt_defer_accept = val;
+ break;
+ default:
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
}
static int smc_getsockopt(struct socket *sock, int level, int optname,
@@ -1297,13 +1477,71 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
static int smc_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
+ union smc_host_cursor cons, urg;
+ struct smc_connection *conn;
struct smc_sock *smc;
+ int answ;
smc = smc_sk(sock->sk);
- if (smc->use_fallback)
+ conn = &smc->conn;
+ if (smc->use_fallback) {
+ if (!smc->clcsock)
+ return -EBADF;
return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
- else
- return sock_no_ioctl(sock, cmd, arg);
+ }
+ switch (cmd) {
+ case SIOCINQ: /* same as FIONREAD */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = atomic_read(&smc->conn.bytes_to_rcv);
+ break;
+ case SIOCOUTQ:
+ /* output queue size (not sent + not acked) */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = smc->conn.sndbuf_desc->len -
+ atomic_read(&smc->conn.sndbuf_space);
+ break;
+ case SIOCOUTQNSD:
+ /* output queue size (not sent only) */
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = smc_tx_prepared_sends(&smc->conn);
+ break;
+ case SIOCATMARK:
+ if (smc->sk.sk_state == SMC_LISTEN)
+ return -EINVAL;
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED) {
+ answ = 0;
+ } else {
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ smc_curs_write(&urg,
+ smc_curs_read(&conn->urg_curs, conn),
+ conn);
+ answ = smc_curs_diff(conn->rmb_desc->len,
+ &cons, &urg) == 1;
+ }
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg);
}
static ssize_t smc_sendpage(struct socket *sock, struct page *page,
@@ -1330,9 +1568,15 @@ out:
return rc;
}
+/* Map the affected portions of the rmbe into an spd, note the number of bytes
+ * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
+ * updates till whenever a respective page has been fully processed.
+ * Note that subsequent recv() calls have to wait till all splice() processing
+ * has completed.
+ */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+ unsigned int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
@@ -1340,16 +1584,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
smc = smc_sk(sk);
lock_sock(sk);
- if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+
+ if (sk->sk_state == SMC_INIT ||
+ sk->sk_state == SMC_LISTEN ||
+ sk->sk_state == SMC_CLOSED)
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
goto out;
+ }
+
if (smc->use_fallback) {
rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
pipe, len, flags);
} else {
- rc = -EOPNOTSUPP;
+ if (*ppos) {
+ rc = -ESPIPE;
+ goto out;
+ }
+ if (flags & SPLICE_F_NONBLOCK)
+ flags = MSG_DONTWAIT;
+ else
+ flags = 0;
+ rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
}
out:
release_sock(sk);
+
return rc;
}
@@ -1482,18 +1744,7 @@ out_pnet:
static void __exit smc_exit(void)
{
- struct smc_link_group *lgr, *lg;
- LIST_HEAD(lgr_freeing_list);
-
- spin_lock_bh(&smc_lgr_list.lock);
- if (!list_empty(&smc_lgr_list.list))
- list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
- spin_unlock_bh(&smc_lgr_list.lock);
- list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
- list_del_init(&lgr->list);
- cancel_delayed_work_sync(&lgr->free_work);
- smc_lgr_free(lgr); /* free link group */
- }
+ smc_core_exit();
static_branch_disable(&tcp_have_smc);
smc_ib_unregister_client();
sock_unregister(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e4829a2f46ba..51ae1f10d81a 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -114,11 +114,17 @@ struct smc_host_cdc_msg { /* Connection Data Control message */
u8 reserved[18];
} __aligned(8);
+enum smc_urg_state {
+ SMC_URG_VALID, /* data present */
+ SMC_URG_NOTYET, /* data pending */
+ SMC_URG_READ /* data was already read */
+};
+
struct smc_connection {
struct rb_node alert_node;
struct smc_link_group *lgr; /* link group of connection */
u32 alert_token_local; /* unique conn. id */
- u8 peer_conn_idx; /* from tcp handshake */
+ u8 peer_rmbe_idx; /* from tcp handshake */
int peer_rmbe_size; /* size of peer rx buffer */
atomic_t peer_rmbe_space;/* remaining free bytes in peer
* rmbe
@@ -126,9 +132,7 @@ struct smc_connection {
int rtoken_idx; /* idx to peer RMB rkey/addr */
struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
- int sndbuf_size; /* sndbuf size <== sock wmem */
struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
- int rmbe_size; /* RMBE size <== sock rmem */
int rmbe_size_short;/* compressed notation */
int rmbe_update_limit;
/* lower limit for consumer
@@ -153,6 +157,7 @@ struct smc_connection {
u16 tx_cdc_seq; /* sequence # for CDC send */
spinlock_t send_lock; /* protect wr_sends */
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
+ u32 tx_off; /* base offset in peer rmb */
struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
* .prod cf. TCP rcv_nxt
@@ -161,9 +166,21 @@ struct smc_connection {
union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
* source of snd_una ?
*/
+ union smc_host_cursor urg_curs; /* points at urgent byte */
+ enum smc_urg_state urg_state;
+ bool urg_tx_pend; /* urgent data staged */
+ bool urg_rx_skip_pend;
+ /* indicate urgent oob data
+ * read, but previous regular
+ * data still pending
+ */
+ char urg_rx_byte; /* urgent byte */
atomic_t bytes_to_rcv; /* arrived data,
* not yet received
*/
+ atomic_t splice_pending; /* number of spliced bytes
+ * pending processing
+ */
#ifndef KERNEL_HAS_ATOMIC64
spinlock_t acurs_lock; /* protect cursors */
#endif
@@ -180,6 +197,10 @@ struct smc_sock { /* smc sock container */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
bool use_fallback; /* fallback to tcp */
+ int sockopt_defer_accept;
+ /* sockopt TCP_DEFER_ACCEPT
+ * value
+ */
u8 wait_close_tx_prepared : 1;
/* shutdown wr or close
* started, waiting for unsent
@@ -214,41 +235,6 @@ static inline u32 ntoh24(u8 *net)
return be32_to_cpu(t);
}
-#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
-
-#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
-/* theoretically, the RFC states that largest size would be 512K,
- * i.e. compressed 5 and thus 6 sizes (0..5), despite
- * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
- */
-
-/* convert the RMB size into the compressed notation - minimum 16K.
- * In contrast to plain ilog2, this rounds towards the next power of 2,
- * so the socket application gets at least its desired sndbuf / rcvbuf size.
- */
-static inline u8 smc_compress_bufsize(int size)
-{
- u8 compressed;
-
- if (size <= SMC_BUF_MIN_SIZE)
- return 0;
-
- size = (size - 1) >> 14;
- compressed = ilog2(size) + 1;
- if (compressed >= SMC_RMBE_SIZES)
- compressed = SMC_RMBE_SIZES - 1;
- return compressed;
-}
-
-/* convert the RMB size from compressed notation into integer */
-static inline int smc_uncompress_bufsize(u8 compressed)
-{
- u32 size;
-
- size = 0x00000001 << (((int)compressed) + 14);
- return (int)size;
-}
-
#ifdef CONFIG_XFRM
static inline bool using_ipsec(struct smc_sock *smc)
{
@@ -262,12 +248,6 @@ static inline bool using_ipsec(struct smc_sock *smc)
}
#endif
-struct smc_clc_msg_local;
-
-void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc,
- struct smc_ib_device *smcibdev, u8 ibport,
- struct smc_clc_msg_local *lcl, int srv_first_contact);
struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
void smc_close_non_accepted(struct sock *sk);
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index b42395d24cba..a7e8d63fc8ae 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
smc = container_of(cdcpend->conn, struct smc_sock, conn);
bh_lock_sock(&smc->sk);
if (!wc_status) {
- diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+ diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
&cdcpend->conn->tx_curs_fin,
&cdcpend->cursor);
/* sndbuf_space is decreased in smc_sendmsg */
smp_mb__before_atomic();
atomic_add(diff, &cdcpend->conn->sndbuf_space);
- /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
smc_curs_write(&cdcpend->conn->tx_curs_fin,
smc_curs_read(&cdcpend->cursor, cdcpend->conn),
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
BUILD_BUG_ON_MSG(
- offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
BUILD_BUG_ON_MSG(
sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
@@ -164,20 +164,35 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
return (s16)(seq1 - seq2) < 0;
}
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+ int *diff_prod)
+{
+ struct smc_connection *conn = &smc->conn;
+ char *base;
+
+ /* new data included urgent business */
+ smc_curs_write(&conn->urg_curs,
+ smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+ conn);
+ conn->urg_state = SMC_URG_VALID;
+ if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+ /* we'll skip the urgent byte, so don't account for it */
+ (*diff_prod)--;
+ base = (char *)conn->rmb_desc->cpu_addr;
+ if (conn->urg_curs.count)
+ conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+ else
+ conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+ sk_send_sigurg(&smc->sk);
+}
+
static void smc_cdc_msg_recv_action(struct smc_sock *smc,
- struct smc_link *link,
struct smc_cdc_msg *cdc)
{
union smc_host_cursor cons_old, prod_old;
struct smc_connection *conn = &smc->conn;
int diff_cons, diff_prod;
- if (!cdc->prod_flags.failover_validation) {
- if (smc_cdc_before(ntohs(cdc->seqno),
- conn->local_rx_ctrl.seqno))
- /* received seqno is old */
- return;
- }
smc_curs_write(&prod_old,
smc_curs_read(&conn->local_rx_ctrl.prod, conn),
conn);
@@ -198,18 +213,28 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
smp_mb__after_atomic();
}
- diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+ diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
&conn->local_rx_ctrl.prod);
if (diff_prod) {
+ if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+ smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
/* bytes_to_rcv is decreased in smc_recvmsg */
smp_mb__before_atomic();
atomic_add(diff_prod, &conn->bytes_to_rcv);
- /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
smp_mb__after_atomic();
smc->sk.sk_data_ready(&smc->sk);
- } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
- (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
- smc->sk.sk_data_ready(&smc->sk);
+ } else {
+ if (conn->local_rx_ctrl.prod_flags.write_blocked ||
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+ if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+ conn->urg_state = SMC_URG_NOTYET;
+ /* force immediate tx of current consumer cursor, but
+ * under send_lock to guarantee arrival in seqno-order
+ */
+ smc_tx_sndbuf_nonempty(conn);
+ }
}
/* piggy backed tx info */
@@ -219,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* trigger socket release if connection closed */
smc_close_wake_tx_prepared(smc);
}
+ if (diff_cons && conn->urg_tx_pend &&
+ atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+ /* urg data confirmed by peer, indicate we're ready for more */
+ conn->urg_tx_pend = false;
+ smc->sk.sk_write_space(&smc->sk);
+ }
if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
smc->sk.sk_err = ECONNRESET;
@@ -236,26 +267,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
}
/* called under tasklet context */
-static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
- struct smc_link *link, u64 wr_id)
+static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- struct smc_connection *connection;
- struct smc_sock *smc;
-
- /* lookup connection */
- read_lock_bh(&lgr->conns_lock);
- connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
- if (!connection) {
- read_unlock_bh(&lgr->conns_lock);
- return;
- }
- smc = container_of(connection, struct smc_sock, conn);
sock_hold(&smc->sk);
- read_unlock_bh(&lgr->conns_lock);
bh_lock_sock(&smc->sk);
- smc_cdc_msg_recv_action(smc, link, cdc);
+ smc_cdc_msg_recv_action(smc, cdc);
bh_unlock_sock(&smc->sk);
sock_put(&smc->sk); /* no free sk in softirq-context */
}
@@ -266,12 +282,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
{
struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
struct smc_cdc_msg *cdc = buf;
+ struct smc_connection *conn;
+ struct smc_link_group *lgr;
+ struct smc_sock *smc;
if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
return; /* short message */
if (cdc->len != SMC_WR_TX_SIZE)
return; /* invalid message */
- smc_cdc_msg_recv(cdc, link, wc->wr_id);
+
+ /* lookup connection */
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ read_lock_bh(&lgr->conns_lock);
+ conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conn)
+ return;
+ smc = container_of(conn, struct smc_sock, conn);
+
+ if (!cdc->prod_flags.failover_validation) {
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+ }
+ smc_cdc_msg_recv(smc, cdc);
}
static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index ab240b37ad11..f60082fee5b8 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
struct smc_cdc_producer_flags prod_flags;
struct smc_cdc_conn_state_flags conn_state_flags;
u8 reserved[18];
-} __aligned(8);
+} __packed; /* format defined in RFC7609 */
static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
{
@@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size,
return max_t(int, 0, (new->count - old->count));
}
+/* calculate cursor difference between old and new - returns negative
+ * value in case old > new
+ */
+static inline int smc_curs_comp(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap > new->wrap ||
+ (old->wrap == new->wrap && old->count > new->count))
+ return -smc_curs_diff(size, new, old);
+ return smc_curs_diff(size, old, new);
+}
+
static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
union smc_host_cursor *local,
struct smc_connection *conn)
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 3a988c22f627..717449b1da0b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -316,7 +316,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
if (clcm->type == SMC_CLC_DECLINE) {
reason_code = SMC_CLC_DECL_REPLY;
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
- smc->conn.lgr->sync_err = true;
+ smc->conn.lgr->sync_err = 1;
smc_lgr_terminate(smc->conn.lgr);
}
}
@@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
hton24(cclc.qpn, link->roce_qp->qp_num);
cclc.rmb_rkey =
htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
cclc.rmbe_alert_token = htonl(conn->alert_token_local);
cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
cclc.rmbe_size = conn->rmbe_size_short;
@@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
hton24(aclc.qpn, link->roce_qp->qp_num);
aclc.rmb_rkey =
htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
aclc.rmbe_alert_token = htonl(conn->alert_token_local);
aclc.qp_mtu = link->path_mtu;
aclc.rmbe_size = conn->rmbe_size_short,
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 63bf1dc2c1f9..41ff9ea96139 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
struct smc_clc_msg_local lcl;
u8 qpn[3]; /* QP number */
__be32 rmb_rkey; /* RMB rkey */
- u8 conn_idx; /* Connection index, which RMBE in RMB */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
__be32 rmbe_alert_token;/* unique connection id */
#if defined(__BIG_ENDIAN_BITFIELD)
u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d4bd01bb44e1..add82b0266f3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -28,12 +28,16 @@
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
-#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10)
+#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
-static u32 smc_lgr_num; /* unique link group number */
+static struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+ .num = 0,
+};
-static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
- bool is_rmb);
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc);
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
@@ -148,8 +152,11 @@ static void smc_lgr_free_work(struct work_struct *work)
list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
spin_unlock_bh(&smc_lgr_list.lock);
- if (!delayed_work_pending(&lgr->free_work))
+ if (!delayed_work_pending(&lgr->free_work)) {
+ if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
smc_lgr_free(lgr);
+ }
}
/* create a new SMC link group */
@@ -169,7 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc,
goto out;
}
lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- lgr->sync_err = false;
+ lgr->sync_err = 0;
memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
lgr->vlan_id = vlan_id;
rwlock_init(&lgr->sndbufs_lock);
@@ -178,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc,
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
}
- smc_lgr_num += SMC_LGR_NUM_INCR;
- memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
+ smc_lgr_list.num += SMC_LGR_NUM_INCR;
+ memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
lgr->conns_all = RB_ROOT;
@@ -194,9 +201,12 @@ static int smc_lgr_create(struct smc_sock *smc,
smc_ib_setup_per_ibdev(smcibdev);
get_random_bytes(rndvec, sizeof(rndvec));
lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
- rc = smc_wr_alloc_link_mem(lnk);
+ rc = smc_llc_link_init(lnk);
if (rc)
goto free_lgr;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
rc = smc_ib_create_protection_domain(lnk);
if (rc)
goto free_link_mem;
@@ -206,10 +216,6 @@ static int smc_lgr_create(struct smc_sock *smc,
rc = smc_wr_create_link(lnk);
if (rc)
goto destroy_qp;
- init_completion(&lnk->llc_confirm);
- init_completion(&lnk->llc_confirm_resp);
- init_completion(&lnk->llc_add);
- init_completion(&lnk->llc_add_resp);
smc->conn.lgr = lgr;
rwlock_init(&lgr->conns_lock);
@@ -224,6 +230,8 @@ dealloc_pd:
smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
smc_wr_free_link_mem(lnk);
+clear_llc_lnk:
+ smc_llc_link_clear(lnk);
free_lgr:
kfree(lgr);
out:
@@ -232,26 +240,21 @@ out:
static void smc_buf_unuse(struct smc_connection *conn)
{
- if (conn->sndbuf_desc) {
+ if (conn->sndbuf_desc)
conn->sndbuf_desc->used = 0;
- conn->sndbuf_size = 0;
- }
if (conn->rmb_desc) {
if (!conn->rmb_desc->regerr) {
conn->rmb_desc->reused = 1;
conn->rmb_desc->used = 0;
- conn->rmbe_size = 0;
} else {
/* buf registration failed, reuse not possible */
struct smc_link_group *lgr = conn->lgr;
- struct smc_link *lnk;
write_lock_bh(&lgr->rmbs_lock);
list_del(&conn->rmb_desc->list);
write_unlock_bh(&lgr->rmbs_lock);
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- smc_buf_free(conn->rmb_desc, lnk, true);
+ smc_buf_free(lgr, true, conn->rmb_desc);
}
}
}
@@ -269,6 +272,7 @@ void smc_conn_free(struct smc_connection *conn)
static void smc_link_clear(struct smc_link *lnk)
{
lnk->peer_qpn = 0;
+ smc_llc_link_clear(lnk);
smc_ib_modify_qp_reset(lnk);
smc_wr_free_link(lnk);
smc_ib_destroy_queue_pair(lnk);
@@ -276,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk)
smc_wr_free_link_mem(lnk);
}
-static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
- bool is_rmb)
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
{
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
if (is_rmb) {
if (buf_desc->mr_rx[SMC_SINGLE_LINK])
smc_ib_put_memory_region(
@@ -290,14 +296,13 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
DMA_TO_DEVICE);
}
sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
- if (buf_desc->cpu_addr)
- free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order);
+ if (buf_desc->pages)
+ __free_pages(buf_desc->pages, buf_desc->order);
kfree(buf_desc);
}
static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
struct smc_buf_desc *buf_desc, *bf_desc;
struct list_head *buf_list;
int i;
@@ -310,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
list) {
list_del(&buf_desc->list);
- smc_buf_free(buf_desc, lnk, is_rmb);
+ smc_buf_free(lgr, is_rmb, buf_desc);
}
}
}
@@ -341,13 +346,18 @@ void smc_lgr_forget(struct smc_link_group *lgr)
}
/* terminate linkgroup abnormally */
-void smc_lgr_terminate(struct smc_link_group *lgr)
+static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
struct smc_connection *conn;
struct smc_sock *smc;
struct rb_node *node;
- smc_lgr_forget(lgr);
+ if (lgr->terminating)
+ return; /* lgr already terminating */
+ lgr->terminating = 1;
+ if (!list_empty(&lgr->list)) /* forget lgr */
+ list_del_init(&lgr->list);
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
write_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
@@ -368,13 +378,35 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
smc_lgr_schedule_free_work(lgr);
}
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+ spin_lock_bh(&smc_lgr_list.lock);
+ __smc_lgr_terminate(lgr);
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
+/* Called when IB port is terminated */
+void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *l;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+ if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+ lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
+ __smc_lgr_terminate(lgr);
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
/* Determine vlan of internal TCP socket.
* @vlan_id: address to store the determined vlan id into
*/
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
- int rc = 0;
+ struct net_device *ndev;
+ int i, nest_lvl, rc = 0;
*vlan_id = 0;
if (!dst) {
@@ -386,8 +418,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
goto out_rel;
}
- if (is_vlan_dev(dst->dev))
- *vlan_id = vlan_dev_vlan_id(dst->dev);
+ ndev = dst->dev;
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ goto out_rel;
+ }
+
+ rtnl_lock();
+ nest_lvl = dev_get_nest_level(ndev);
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next;
+ ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ break;
+ }
+ }
+ rtnl_unlock();
out_rel:
dst_release(dst);
@@ -432,10 +483,10 @@ int smc_conn_create(struct smc_sock *smc,
struct smc_clc_msg_local *lcl, int srv_first_contact)
{
struct smc_connection *conn = &smc->conn;
+ int local_contact = SMC_FIRST_CONTACT;
struct smc_link_group *lgr;
unsigned short vlan_id;
enum smc_lgr_role role;
- int local_contact = SMC_FIRST_CONTACT;
int rc = 0;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -493,6 +544,7 @@ create:
}
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+ conn->urg_state = SMC_URG_READ;
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&conn->acurs_lock);
#endif
@@ -501,14 +553,39 @@ out:
return rc ? rc : local_contact;
}
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static u8 smc_compress_bufsize(int size)
+{
+ u8 compressed;
+
+ if (size <= SMC_BUF_MIN_SIZE)
+ return 0;
+
+ size = (size - 1) >> 14;
+ compressed = ilog2(size) + 1;
+ if (compressed >= SMC_RMBE_SIZES)
+ compressed = SMC_RMBE_SIZES - 1;
+ return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+int smc_uncompress_bufsize(u8 compressed)
+{
+ u32 size;
+
+ size = 0x00000001 << (((int)compressed) + 14);
+ return (int)size;
+}
+
/* try to reuse a sndbuf or rmb description slot for a certain
* buffer size; if not available, return NULL
*/
-static inline
-struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr,
- int compressed_bufsize,
- rwlock_t *lock,
- struct list_head *buf_list)
+static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
+ rwlock_t *lock,
+ struct list_head *buf_list)
{
struct smc_buf_desc *buf_slot;
@@ -544,23 +621,23 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
if (!buf_desc)
return ERR_PTR(-ENOMEM);
- buf_desc->cpu_addr =
- (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
- __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_ZERO,
- get_order(bufsize));
- if (!buf_desc->cpu_addr) {
+ buf_desc->order = get_order(bufsize);
+ buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | __GFP_COMP |
+ __GFP_NORETRY | __GFP_ZERO,
+ buf_desc->order);
+ if (!buf_desc->pages) {
kfree(buf_desc);
return ERR_PTR(-EAGAIN);
}
- buf_desc->order = get_order(bufsize);
+ buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
/* build the sg table from the pages */
lnk = &lgr->lnk[SMC_SINGLE_LINK];
rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
GFP_KERNEL);
if (rc) {
- smc_buf_free(buf_desc, lnk, is_rmb);
+ smc_buf_free(lgr, is_rmb, buf_desc);
return ERR_PTR(rc);
}
sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
@@ -571,7 +648,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
/* SMC protocol depends on mapping to one DMA address only */
if (rc != 1) {
- smc_buf_free(buf_desc, lnk, is_rmb);
+ smc_buf_free(lgr, is_rmb, buf_desc);
return ERR_PTR(-EAGAIN);
}
@@ -582,19 +659,20 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
IB_ACCESS_LOCAL_WRITE,
buf_desc);
if (rc) {
- smc_buf_free(buf_desc, lnk, is_rmb);
+ smc_buf_free(lgr, is_rmb, buf_desc);
return ERR_PTR(rc);
}
}
+ buf_desc->len = bufsize;
return buf_desc;
}
static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
{
+ struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
- struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
struct list_head *buf_list;
int bufsize, bufsize_short;
int sk_buf_size;
@@ -622,7 +700,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
continue;
/* check for reusable slot in the link group */
- buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list);
+ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
if (buf_desc) {
memset(buf_desc->cpu_addr, 0, bufsize);
break; /* found reusable slot */
@@ -646,14 +724,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
if (is_rmb) {
conn->rmb_desc = buf_desc;
- conn->rmbe_size = bufsize;
conn->rmbe_size_short = bufsize_short;
smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
} else {
conn->sndbuf_desc = buf_desc;
- conn->sndbuf_size = bufsize;
smc->sk.sk_sndbuf = bufsize * 2;
atomic_set(&conn->sndbuf_space, bufsize);
}
@@ -709,8 +785,7 @@ int smc_buf_create(struct smc_sock *smc)
/* create rmb */
rc = __smc_buf_create(smc, true);
if (rc)
- smc_buf_free(smc->conn.sndbuf_desc,
- &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
+ smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
return rc;
}
@@ -777,3 +852,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
return conn->rtoken_idx;
return 0;
}
+
+/* Called (from smc_exit) when module is removed */
+void smc_core_exit(void)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_freeing_list);
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!list_empty(&smc_lgr_list.list))
+ list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+ list_del_init(&lgr->list);
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ cancel_delayed_work_sync(&lgr->free_work);
+ smc_lgr_free(lgr); /* free link group */
+ }
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 5dfcb15d529f..93cb3523bf50 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -23,10 +23,9 @@
struct smc_lgr_list { /* list of link group definition */
struct list_head list;
spinlock_t lock; /* protects list of link groups */
+ u32 num; /* unique link group number */
};
-extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
-
enum smc_lgr_role { /* possible roles of a link group */
SMC_CLNT, /* client */
SMC_SERV /* server */
@@ -79,6 +78,7 @@ struct smc_link {
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
u64 wr_rx_id; /* seq # of last recv WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
+ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
struct ib_reg_wr wr_reg; /* WR register memory region */
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
@@ -95,12 +95,18 @@ struct smc_link {
u8 link_id; /* unique # within link group */
enum smc_link_state state; /* state of link */
+ struct workqueue_struct *llc_wq; /* single thread work queue */
struct completion llc_confirm; /* wait for rx of conf link */
struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
int llc_confirm_rc; /* rc from confirm link msg */
int llc_confirm_resp_rc; /* rc from conf_resp msg */
struct completion llc_add; /* wait for rx of add link */
struct completion llc_add_resp; /* wait for rx of add link rsp*/
+ struct delayed_work llc_testlink_wrk; /* testlink worker */
+ struct completion llc_testlink_resp; /* wait for rx of testlink */
+ int llc_testlink_time; /* testlink interval */
+ struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */
+ int llc_confirm_rkey_rc; /* rc from cnf rkey msg */
};
/* For now we just allow one parallel link per link group. The SMC protocol
@@ -116,6 +122,8 @@ struct smc_link {
struct smc_buf_desc {
struct list_head list;
void *cpu_addr; /* virtual address of buffer */
+ struct page *pages;
+ int len; /* length of buffer */
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
/* for rmb only: memory region
@@ -133,6 +141,12 @@ struct smc_rtoken { /* address/key of remote RMB */
};
#define SMC_LGR_ID_SIZE 4
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
struct smc_link_group {
struct list_head list;
@@ -158,7 +172,8 @@ struct smc_link_group {
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
- bool sync_err; /* lgr no longer fits to peer */
+ u8 sync_err : 1; /* lgr no longer fits to peer */
+ u8 terminating : 1;/* lgr is terminating */
};
/* Find the connection associated with the given alert token in the link group.
@@ -196,11 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn(
struct smc_sock;
struct smc_clc_msg_accept_confirm;
+struct smc_clc_msg_local;
void smc_lgr_free(struct smc_link_group *lgr);
void smc_lgr_forget(struct smc_link_group *lgr);
void smc_lgr_terminate(struct smc_link_group *lgr);
+void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
int smc_buf_create(struct smc_sock *smc);
+int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_clc_msg_accept_confirm *clc);
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey);
@@ -209,4 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact);
+void smc_core_exit(void);
#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 427b91c1c964..839354402215 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
- r->diag_family = sk->sk_family;
if (!smc->clcsock)
return;
r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
r->id.idiag_dport = smc->clcsock->sk->sk_dport;
r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
sock_diag_save_cookie(sk, r->id.idiag_cookie);
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+ if (sk->sk_protocol == SMCPROTO_SMC) {
+ r->diag_family = PF_INET;
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+ r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_protocol == SMCPROTO_SMC6) {
+ r->diag_family = PF_INET6;
+ memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
+ sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
+ memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
+ sizeof(smc->clcsock->sk->sk_v6_daddr));
+#endif
+ }
}
static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
@@ -91,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
struct smc_connection *conn = &smc->conn;
struct smc_diag_conninfo cinfo = {
.token = conn->alert_token_local,
- .sndbuf_size = conn->sndbuf_size,
- .rmbe_size = conn->rmbe_size,
+ .sndbuf_size = conn->sndbuf_desc ?
+ conn->sndbuf_desc->len : 0,
+ .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
.peer_rmbe_size = conn->peer_rmbe_size,
.rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
@@ -153,7 +164,8 @@ errout:
return -EMSGSIZE;
}
-static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct nlattr *bc = NULL;
@@ -161,8 +173,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct sock *sk;
int rc = 0;
- read_lock(&smc_proto.h.smc_hash->lock);
- head = &smc_proto.h.smc_hash->ht;
+ read_lock(&prot->h.smc_hash->lock);
+ head = &prot->h.smc_hash->ht;
if (hlist_empty(head))
goto out;
@@ -175,7 +187,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
}
out:
- read_unlock(&smc_proto.h.smc_hash->lock);
+ read_unlock(&prot->h.smc_hash->lock);
+ return rc;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int rc = 0;
+
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+ if (!rc)
+ rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
return rc;
}
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 26df554f7588..0eed7ab9f28b 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -143,17 +143,6 @@ out:
return rc;
}
-static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
-{
- struct smc_link_group *lgr, *l;
-
- list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
- if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
- lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
- smc_lgr_terminate(lgr);
- }
-}
-
/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
@@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
smc_ib_remember_port_attr(smcibdev, port_idx + 1);
clear_bit(port_idx, &smcibdev->port_event_mask);
if (!smc_ib_port_active(smcibdev, port_idx + 1))
- smc_ib_port_terminate(smcibdev, port_idx + 1);
+ smc_port_terminate(smcibdev, port_idx + 1);
}
}
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index ea4b21981b4b..5800a6b43d83 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -214,12 +214,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
return rc;
}
-/* send ADD LINK request or response */
-int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
- union ib_gid *gid,
- enum smc_llc_reqresp reqresp)
+/* send LLC confirm rkey request */
+static int smc_llc_send_confirm_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
{
- struct smc_llc_msg_add_link *addllc;
+ struct smc_llc_msg_confirm_rkey *rkeyllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
@@ -227,7 +226,25 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
return rc;
- addllc = (struct smc_llc_msg_add_link *)wr_buf;
+ rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
+ memset(rkeyllc, 0, sizeof(*rkeyllc));
+ rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
+ rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
+ rkeyllc->rtoken[0].rmb_key =
+ htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
+ (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+ return rc;
+}
+
+/* prepare an add link message */
+static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
+ struct smc_link *link, u8 mac[],
+ union ib_gid *gid,
+ enum smc_llc_reqresp reqresp)
+{
memset(addllc, 0, sizeof(*addllc));
addllc->hd.common.type = SMC_LLC_ADD_LINK;
addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
@@ -239,16 +256,14 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
}
memcpy(addllc->sender_mac, mac, ETH_ALEN);
memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
- /* send llc message */
- rc = smc_wr_tx_send(link, pend);
- return rc;
}
-/* send DELETE LINK request or response */
-int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp)
+/* send ADD LINK request or response */
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
+ union ib_gid *gid,
+ enum smc_llc_reqresp reqresp)
{
- struct smc_llc_msg_del_link *delllc;
+ struct smc_llc_msg_add_link *addllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
@@ -256,7 +271,18 @@ int smc_llc_send_delete_link(struct smc_link *link,
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
return rc;
- delllc = (struct smc_llc_msg_del_link *)wr_buf;
+ addllc = (struct smc_llc_msg_add_link *)wr_buf;
+ smc_llc_prep_add_link(addllc, link, mac, gid, reqresp);
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+ return rc;
+}
+
+/* prepare a delete link message */
+static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
+ struct smc_link *link,
+ enum smc_llc_reqresp reqresp)
+{
memset(delllc, 0, sizeof(*delllc));
delllc->hd.common.type = SMC_LLC_DELETE_LINK;
delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
@@ -266,14 +292,29 @@ int smc_llc_send_delete_link(struct smc_link *link,
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
delllc->link_num = link->link_id;
+}
+
+/* send DELETE LINK request or response */
+int smc_llc_send_delete_link(struct smc_link *link,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_llc_msg_del_link *delllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ return rc;
+ delllc = (struct smc_llc_msg_del_link *)wr_buf;
+ smc_llc_prep_delete_link(delllc, link, reqresp);
/* send llc message */
rc = smc_wr_tx_send(link, pend);
return rc;
}
-/* send LLC test link request or response */
-int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
- enum smc_llc_reqresp reqresp)
+/* send LLC test link request */
+static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
{
struct smc_llc_msg_test_link *testllc;
struct smc_wr_tx_pend_priv *pend;
@@ -287,28 +328,52 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
memset(testllc, 0, sizeof(*testllc));
testllc->hd.common.type = SMC_LLC_TEST_LINK;
testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
- if (reqresp == SMC_LLC_RESP)
- testllc->hd.flags |= SMC_LLC_FLAG_RESP;
memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
/* send llc message */
rc = smc_wr_tx_send(link, pend);
return rc;
}
-/* send a prepared message */
-static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+struct smc_llc_send_work {
+ struct work_struct work;
+ struct smc_link *link;
+ int llclen;
+ union smc_llc_msg llcbuf;
+};
+
+/* worker that sends a prepared message */
+static void smc_llc_send_message_work(struct work_struct *work)
{
+ struct smc_llc_send_work *llcwrk = container_of(work,
+ struct smc_llc_send_work, work);
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
- rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (llcwrk->link->state == SMC_LNK_INACTIVE)
+ goto out;
+ rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend);
if (rc)
- return rc;
- memcpy(wr_buf, llcbuf, llclen);
- /* send llc message */
- rc = smc_wr_tx_send(link, pend);
- return rc;
+ goto out;
+ memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen);
+ smc_wr_tx_send(llcwrk->link, pend);
+out:
+ kfree(llcwrk);
+}
+
+/* copy llcbuf and schedule an llc send on link */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+{
+ struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
+
+ if (!wrk)
+ return -ENOMEM;
+ INIT_WORK(&wrk->work, smc_llc_send_message_work);
+ wrk->link = link;
+ wrk->llclen = llclen;
+ memcpy(&wrk->llcbuf, llcbuf, llclen);
+ queue_work(link->llc_wq, &wrk->work);
+ return 0;
}
/********************************* receive ***********************************/
@@ -359,17 +424,18 @@ static void smc_llc_rx_add_link(struct smc_link *link,
}
if (lgr->role == SMC_SERV) {
- smc_llc_send_add_link(link,
+ smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
&link->smcibdev->gid[link->ibport - 1],
SMC_LLC_REQ);
} else {
- smc_llc_send_add_link(link,
+ smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
&link->smcibdev->gid[link->ibport - 1],
SMC_LLC_RESP);
}
+ smc_llc_send_message(link, llc, sizeof(*llc));
}
}
@@ -385,9 +451,11 @@ static void smc_llc_rx_delete_link(struct smc_link *link,
} else {
if (lgr->role == SMC_SERV) {
smc_lgr_forget(lgr);
- smc_llc_send_delete_link(link, SMC_LLC_REQ);
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ);
+ smc_llc_send_message(link, llc, sizeof(*llc));
} else {
- smc_llc_send_delete_link(link, SMC_LLC_RESP);
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP);
+ smc_llc_send_message(link, llc, sizeof(*llc));
smc_lgr_terminate(lgr);
}
}
@@ -397,9 +465,11 @@ static void smc_llc_rx_test_link(struct smc_link *link,
struct smc_llc_msg_test_link *llc)
{
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- /* unused as long as we don't send this type of msg */
+ if (link->state == SMC_LNK_ACTIVE)
+ complete(&link->llc_testlink_resp);
} else {
- smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP);
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, llc, sizeof(*llc));
}
}
@@ -412,7 +482,9 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- /* unused as long as we don't send this type of msg */
+ link->llc_confirm_rkey_rc = llc->hd.flags &
+ SMC_LLC_FLAG_RKEY_NEG;
+ complete(&link->llc_confirm_rkey);
} else {
rc = smc_rtoken_add(lgr,
llc->rtoken[0].rmb_vaddr,
@@ -423,7 +495,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
llc->hd.flags |= SMC_LLC_FLAG_RESP;
if (rc < 0)
llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
- smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+ smc_llc_send_message(link, llc, sizeof(*llc));
}
}
@@ -435,7 +507,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
} else {
/* ignore rtokens for other links, we have only one link */
llc->hd.flags |= SMC_LLC_FLAG_RESP;
- smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+ smc_llc_send_message(link, llc, sizeof(*llc));
}
}
@@ -463,7 +535,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link,
}
llc->hd.flags |= SMC_LLC_FLAG_RESP;
- smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+ smc_llc_send_message(link, llc, sizeof(*llc));
}
}
@@ -476,6 +548,8 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
return; /* short message */
if (llc->raw.hdr.length != sizeof(*llc))
return; /* invalid message */
+ if (link->state == SMC_LNK_INACTIVE)
+ return; /* link not active, drop msg */
switch (llc->raw.hdr.common.type) {
case SMC_LLC_TEST_LINK:
@@ -502,6 +576,100 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
}
}
+/***************************** worker, utils *********************************/
+
+static void smc_llc_testlink_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(to_delayed_work(work),
+ struct smc_link, llc_testlink_wrk);
+ unsigned long next_interval;
+ struct smc_link_group *lgr;
+ unsigned long expire_time;
+ u8 user_data[16] = { 0 };
+ int rc;
+
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ if (link->state != SMC_LNK_ACTIVE)
+ return; /* don't reschedule worker */
+ expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
+ if (time_is_after_jiffies(expire_time)) {
+ next_interval = expire_time - jiffies;
+ goto out;
+ }
+ reinit_completion(&link->llc_testlink_resp);
+ smc_llc_send_test_link(link, user_data);
+ /* receive TEST LINK response over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
+ SMC_LLC_WAIT_TIME);
+ if (rc <= 0) {
+ smc_lgr_terminate(lgr);
+ return;
+ }
+ next_interval = link->llc_testlink_time;
+out:
+ queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
+ next_interval);
+}
+
+int smc_llc_link_init(struct smc_link *link)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
+ *((u32 *)lgr->id),
+ link->link_id);
+ if (!link->llc_wq)
+ return -ENOMEM;
+ init_completion(&link->llc_confirm);
+ init_completion(&link->llc_confirm_resp);
+ init_completion(&link->llc_add);
+ init_completion(&link->llc_add_resp);
+ init_completion(&link->llc_confirm_rkey);
+ init_completion(&link->llc_testlink_resp);
+ INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+ return 0;
+}
+
+void smc_llc_link_active(struct smc_link *link, int testlink_time)
+{
+ link->state = SMC_LNK_ACTIVE;
+ if (testlink_time) {
+ link->llc_testlink_time = testlink_time * HZ;
+ queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
+ link->llc_testlink_time);
+ }
+}
+
+/* called in tasklet context */
+void smc_llc_link_inactive(struct smc_link *link)
+{
+ link->state = SMC_LNK_INACTIVE;
+ cancel_delayed_work(&link->llc_testlink_wrk);
+}
+
+/* called in worker context */
+void smc_llc_link_clear(struct smc_link *link)
+{
+ flush_workqueue(link->llc_wq);
+ destroy_workqueue(link->llc_wq);
+}
+
+/* register a new rtoken at the remote peer */
+int smc_llc_do_confirm_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ int rc;
+
+ reinit_completion(&link->llc_confirm_rkey);
+ smc_llc_send_confirm_rkey(link, rmb_desc);
+ /* receive CONFIRM RKEY response from server over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey,
+ SMC_LLC_WAIT_TIME);
+ if (rc <= 0 || link->llc_confirm_rkey_rc)
+ return -EFAULT;
+ return 0;
+}
+
/***************************** init, exit, misc ******************************/
static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index e4a7d5e234d5..65c8645e96a1 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -42,8 +42,12 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
enum smc_llc_reqresp reqresp);
int smc_llc_send_delete_link(struct smc_link *link,
enum smc_llc_reqresp reqresp);
-int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
- enum smc_llc_reqresp reqresp);
+int smc_llc_link_init(struct smc_link *link);
+void smc_llc_link_active(struct smc_link *link, int testlink_time);
+void smc_llc_link_inactive(struct smc_link *link);
+void smc_llc_link_clear(struct smc_link *link);
+int smc_llc_do_confirm_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc);
int smc_llc_init(void) __init;
#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index eff4e0d0bb31..3d77b383cccd 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -22,11 +22,10 @@
#include "smc_tx.h" /* smc_tx_consumer_update() */
#include "smc_rx.h"
-/* callback implementation for sk.sk_data_ready()
- * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+/* callback implementation to wakeup consumers blocked with smc_rx_wait().
* indirectly called by smc_cdc_msg_recv_action().
*/
-static void smc_rx_data_ready(struct sock *sk)
+static void smc_rx_wake_up(struct sock *sk)
{
struct socket_wq *wq;
@@ -44,28 +43,180 @@ static void smc_rx_data_ready(struct sock *sk)
rcu_read_unlock();
}
+/* Update consumer cursor
+ * @smc socket whose connection (smc->conn) is updated
+ * @cons consumer cursor
+ * @len number of Bytes consumed
+ * Returns:
+ * 1 if we should end our receive, 0 otherwise
+ */
+static int smc_rx_update_consumer(struct smc_sock *smc,
+ union smc_host_cursor cons, size_t len)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ bool force = false;
+ int diff, rc = 0;
+
+ smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+ /* did we process urgent data? */
+ if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+ diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+ &conn->urg_curs);
+ if (sock_flag(sk, SOCK_URGINLINE)) {
+ if (diff == 0) {
+ force = true;
+ rc = 1;
+ conn->urg_state = SMC_URG_READ;
+ }
+ } else {
+ if (diff == 1) {
+ /* skip urgent byte */
+ force = true;
+ smc_curs_add(conn->rmb_desc->len, &cons, 1);
+ conn->urg_rx_skip_pend = false;
+ } else if (diff < -1)
+ /* we read past urgent byte */
+ conn->urg_state = SMC_URG_READ;
+ }
+ }
+
+ smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
+ conn);
+
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn, force);
+
+ return rc;
+}
+
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+
+ smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ smc_rx_update_consumer(smc, cons, len);
+}
+
+struct smc_spd_priv {
+ struct smc_sock *smc;
+ size_t len;
+};
+
+static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
+ struct smc_sock *smc = priv->smc;
+ struct smc_connection *conn;
+ struct sock *sk = &smc->sk;
+
+ if (sk->sk_state == SMC_CLOSED ||
+ sk->sk_state == SMC_PEERFINCLOSEWAIT ||
+ sk->sk_state == SMC_APPFINCLOSEWAIT)
+ goto out;
+ conn = &smc->conn;
+ lock_sock(sk);
+ smc_rx_update_cons(smc, priv->len);
+ release_sock(sk);
+ if (atomic_sub_and_test(priv->len, &conn->splice_pending))
+ smc_rx_wake_up(sk);
+out:
+ kfree(priv);
+ put_page(buf->page);
+ sock_put(sk);
+}
+
+static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return 1;
+}
+
+static const struct pipe_buf_operations smc_pipe_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = smc_rx_pipe_buf_release,
+ .steal = smc_rx_pipe_buf_nosteal,
+ .get = generic_pipe_buf_get
+};
+
+static void smc_rx_spd_release(struct splice_pipe_desc *spd,
+ unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
+ struct smc_sock *smc)
+{
+ struct splice_pipe_desc spd;
+ struct partial_page partial;
+ struct smc_spd_priv *priv;
+ struct page *page;
+ int bytes;
+
+ page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ priv->len = len;
+ priv->smc = smc;
+ partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+ partial.len = len;
+ partial.private = (unsigned long)priv;
+
+ spd.nr_pages_max = 1;
+ spd.nr_pages = 1;
+ spd.pages = &page;
+ spd.partial = &partial;
+ spd.ops = &smc_pipe_ops;
+ spd.spd_release = smc_rx_spd_release;
+
+ bytes = splice_to_pipe(pipe, &spd);
+ if (bytes > 0) {
+ sock_hold(&smc->sk);
+ get_page(smc->conn.rmb_desc->pages);
+ atomic_add(bytes, &smc->conn.splice_pending);
+ }
+
+ return bytes;
+}
+
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv) &&
+ !atomic_read(&conn->splice_pending);
+}
+
/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
* @smc smc socket
* @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * @fcrit additional criterion to evaluate, passed as function pointer
* Returns:
* 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
* 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
*/
-static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn))
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
struct sock *sk = &smc->sk;
int rc;
- if (atomic_read(&conn->bytes_to_rcv))
+ if (fcrit(conn))
return 1;
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
add_wait_queue(sk_sleep(sk), &wait);
rc = sk_wait_event(sk, timeo,
sk->sk_err ||
sk->sk_shutdown & RCV_SHUTDOWN ||
- atomic_read(&conn->bytes_to_rcv) ||
+ fcrit(conn) ||
smc_cdc_rxed_any_close_or_senddone(conn),
&wait);
remove_wait_queue(sk_sleep(sk), &wait);
@@ -73,65 +224,115 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
return rc;
}
-/* rcvbuf consumer: main API called by socket layer.
- * called under sk lock.
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+ int flags)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+ struct sock *sk = &smc->sk;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_URGINLINE) ||
+ !(conn->urg_state == SMC_URG_VALID) ||
+ conn->urg_state == SMC_URG_READ)
+ return -EINVAL;
+
+ if (conn->urg_state == SMC_URG_VALID) {
+ if (!(flags & MSG_PEEK))
+ smc->conn.urg_state = SMC_URG_READ;
+ msg->msg_flags |= MSG_OOB;
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+ len = 1;
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons,
+ conn),
+ conn);
+ if (smc_curs_diff(conn->rmb_desc->len, &cons,
+ &conn->urg_curs) > 1)
+ conn->urg_rx_skip_pend = true;
+ /* Urgent Byte was already accounted for, but trigger
+ * skipping the urgent byte in non-inline case
+ */
+ if (!(flags & MSG_PEEK))
+ smc_rx_update_consumer(smc, cons, 0);
+ } else {
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ return rc ? -EFAULT : len;
+ }
+
+ if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+
+ return -EAGAIN;
+}
+
+/* smc_rx_recvmsg - receive data from RMBE
+ * @msg: copy data to receive buffer
+ * @pipe: copy data to pipe if set - indicates splice() call
+ *
+ * rcvbuf consumer: main API called by socket layer.
+ * Called under sk lock.
*/
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
- int flags)
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags)
{
size_t copylen, read_done = 0, read_remaining = len;
size_t chunk_len, chunk_off, chunk_len_sum;
struct smc_connection *conn = &smc->conn;
+ int (*func)(struct smc_connection *conn);
union smc_host_cursor cons;
int readable, chunk;
char *rcvbuf_base;
struct sock *sk;
+ int splbytes;
long timeo;
int target; /* Read at least these many bytes */
int rc;
if (unlikely(flags & MSG_ERRQUEUE))
return -EINVAL; /* future work for sk.sk_family == AF_SMC */
- if (flags & MSG_OOB)
- return -EINVAL; /* future work */
sk = &smc->sk;
if (sk->sk_state == SMC_LISTEN)
return -ENOTCONN;
+ if (flags & MSG_OOB)
+ return smc_rx_recv_urg(smc, msg, len, flags);
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- msg->msg_namelen = 0;
/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
rcvbuf_base = conn->rmb_desc->cpu_addr;
do { /* while (read_remaining) */
- if (read_done >= target)
+ if (read_done >= target || (pipe && read_done))
break;
if (atomic_read(&conn->bytes_to_rcv))
goto copy;
+ else if (conn->urg_state == SMC_URG_VALID)
+ /* we received a single urgent Byte - skip */
+ smc_rx_update_cons(smc, 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ break;
if (read_done) {
if (sk->sk_err ||
sk->sk_state == SMC_CLOSED ||
- sk->sk_shutdown & RCV_SHUTDOWN ||
!timeo ||
- signal_pending(current) ||
- smc_cdc_rxed_any_close_or_senddone(conn) ||
- conn->local_tx_ctrl.conn_state_flags.
- peer_conn_abort)
+ signal_pending(current))
break;
} else {
if (sk->sk_err) {
read_done = sock_error(sk);
break;
}
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- smc_cdc_rxed_any_close_or_senddone(conn) ||
- conn->local_tx_ctrl.conn_state_flags.
- peer_conn_abort)
- break;
if (sk->sk_state == SMC_CLOSED) {
if (!sock_flag(sk, SOCK_DONE)) {
/* This occurs when user tries to read
@@ -150,32 +351,56 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
return -EAGAIN;
}
- if (!atomic_read(&conn->bytes_to_rcv)) {
- smc_rx_wait_data(smc, &timeo);
+ if (!smc_rx_data_available(conn)) {
+ smc_rx_wait(smc, &timeo, smc_rx_data_available);
continue;
}
copy:
/* initialize variables for 1st iteration of subsequent loop */
- /* could be just 1 byte, even after smc_rx_wait_data above */
+ /* could be just 1 byte, even after waiting on data above */
readable = atomic_read(&conn->bytes_to_rcv);
- /* not more than what user space asked for */
- copylen = min_t(size_t, read_remaining, readable);
+ splbytes = atomic_read(&conn->splice_pending);
+ if (!readable || (msg && splbytes)) {
+ if (splbytes)
+ func = smc_rx_data_available_and_no_splice_pend;
+ else
+ func = smc_rx_data_available;
+ smc_rx_wait(smc, &timeo, func);
+ continue;
+ }
+
smc_curs_write(&cons,
smc_curs_read(&conn->local_tx_ctrl.cons, conn),
conn);
+ /* subsequent splice() calls pick up where previous left */
+ if (splbytes)
+ smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+ if (conn->urg_state == SMC_URG_VALID &&
+ sock_flag(&smc->sk, SOCK_URGINLINE) &&
+ readable > 1)
+ readable--; /* always stop at urgent Byte */
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, read_remaining, readable);
/* determine chunks where to read from rcvbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
- chunk_len = min_t(size_t,
- copylen, conn->rmbe_size - cons.count);
+ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
+ cons.count);
chunk_len_sum = chunk_len;
chunk_off = cons.count;
smc_rmb_sync_sg_for_cpu(conn);
for (chunk = 0; chunk < 2; chunk++) {
if (!(flags & MSG_TRUNC)) {
- rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
- chunk_len);
- if (rc) {
+ if (msg) {
+ rc = memcpy_to_msg(msg, rcvbuf_base +
+ chunk_off,
+ chunk_len);
+ } else {
+ rc = smc_rx_splice(pipe, rcvbuf_base +
+ chunk_off, chunk_len,
+ smc);
+ }
+ if (rc < 0) {
if (!read_done)
read_done = -EFAULT;
smc_rmb_sync_sg_for_device(conn);
@@ -196,18 +421,13 @@ copy:
/* update cursors */
if (!(flags & MSG_PEEK)) {
- smc_curs_add(conn->rmbe_size, &cons, copylen);
/* increased in recv tasklet smc_cdc_msg_rcv() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->bytes_to_rcv);
- /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
smp_mb__after_atomic();
- smc_curs_write(&conn->local_tx_ctrl.cons,
- smc_curs_read(&cons, conn),
- conn);
- /* send consumer cursor update if required */
- /* similar to advertising new TCP rcv_wnd if required */
- smc_tx_consumer_update(conn);
+ if (msg && smc_rx_update_consumer(smc, cons, copylen))
+ goto out;
}
} while (read_remaining);
out:
@@ -217,5 +437,7 @@ out:
/* Initialize receive properties on connection establishment. NB: not __init! */
void smc_rx_init(struct smc_sock *smc)
{
- smc->sk.sk_data_ready = smc_rx_data_ready;
+ smc->sk.sk_data_ready = smc_rx_wake_up;
+ atomic_set(&smc->conn.splice_pending, 0);
+ smc->conn.urg_state = SMC_URG_READ;
}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 3a32b59bf06c..db823c97d824 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -18,7 +18,14 @@
#include "smc.h"
void smc_rx_init(struct smc_sock *smc);
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
- int flags);
+
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags);
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn));
+static inline int smc_rx_data_available(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv);
+}
#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 72f004c9c9b1..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -19,6 +19,7 @@
#include <linux/sched/signal.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include "smc.h"
#include "smc_wr.h"
@@ -26,11 +27,12 @@
#include "smc_tx.h"
#define SMC_TX_WORK_DELAY HZ
+#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
/* callback implementation for sk.sk_write_space()
- * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
* called under sk_socket lock.
*/
static void smc_tx_write_space(struct sock *sk)
@@ -54,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
}
}
-/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
* Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
*/
void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -64,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
smc->sk.sk_write_space(&smc->sk);
}
-/* blocks sndbuf producer until at least one byte of free space available */
-static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
@@ -101,20 +105,28 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
break;
}
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- if (atomic_read(&conn->sndbuf_space))
- break; /* at least 1 byte of free space available */
+ if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+ break; /* at least 1 byte of free & no urgent data */
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk_wait_event(sk, &timeo,
sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
smc_cdc_rxed_any_close(conn) ||
- atomic_read(&conn->sndbuf_space),
+ (atomic_read(&conn->sndbuf_space) &&
+ !conn->urg_tx_pend),
&wait);
}
remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+ struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+ return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -148,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
if (smc_cdc_rxed_any_close(conn))
return send_done ?: -ECONNRESET;
- if (!atomic_read(&conn->sndbuf_space)) {
- rc = smc_tx_wait_memory(smc, msg->msg_flags);
+ if (msg->msg_flags & MSG_OOB)
+ conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+ if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+ rc = smc_tx_wait(smc, msg->msg_flags);
if (rc) {
if (send_done)
return send_done;
@@ -159,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
}
/* initialize variables for 1st iteration of subsequent loop */
- /* could be just 1 byte, even after smc_tx_wait_memory above */
+ /* could be just 1 byte, even after smc_tx_wait above */
writespace = atomic_read(&conn->sndbuf_space);
/* not more than what user space asked for */
copylen = min_t(size_t, send_remaining, writespace);
@@ -171,8 +186,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
tx_cnt_prep = prep.count;
/* determine chunks where to write into sndbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
- chunk_len = min_t(size_t,
- copylen, conn->sndbuf_size - tx_cnt_prep);
+ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
+ tx_cnt_prep);
chunk_len_sum = chunk_len;
chunk_off = tx_cnt_prep;
smc_sndbuf_sync_sg_for_cpu(conn);
@@ -197,19 +212,30 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
}
smc_sndbuf_sync_sg_for_device(conn);
/* update cursors */
- smc_curs_add(conn->sndbuf_size, &prep, copylen);
+ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
smc_curs_write(&conn->tx_curs_prep,
smc_curs_read(&prep, conn),
conn);
/* increased in send tasklet smc_cdc_tx_handler() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->sndbuf_space);
- /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
/* since we just produced more new data into sndbuf,
* trigger sndbuf consumer: RDMA write into peer RMBE and CDC
*/
- smc_tx_sndbuf_nonempty(conn);
+ if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+ conn->urg_tx_pend = true;
+ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+ (atomic_read(&conn->sndbuf_space) >
+ (conn->sndbuf_desc->len >> 1)))
+ /* for a corked socket defer the RDMA writes if there
+ * is still sufficient sndbuf_space available
+ */
+ schedule_delayed_work(&conn->tx_work,
+ SMC_TX_CORK_DELAY);
+ else
+ smc_tx_sndbuf_nonempty(conn);
} /* while (msg_data_left(msg)) */
return send_done;
@@ -243,7 +269,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
rdma_wr.remote_addr =
lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
/* RMBE within RMB */
- ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+ conn->tx_off +
/* offset within RMBE */
peer_rmbe_offset;
rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
@@ -268,7 +294,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
atomic_sub(len, &conn->peer_rmbe_space);
/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
smp_mb__after_atomic();
- smc_curs_add(conn->sndbuf_size, sent, len);
+ smc_curs_add(conn->sndbuf_desc->len, sent, len);
}
/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
@@ -281,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
union smc_host_cursor sent, prep, prod, cons;
struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
struct smc_link_group *lgr = conn->lgr;
+ struct smc_cdc_producer_flags *pflags;
int to_send, rmbespace;
struct smc_link *link;
dma_addr_t dma_addr;
@@ -291,7 +318,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
/* cf. wmem_alloc - (snd_max - snd_una) */
- to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
if (to_send <= 0)
return 0;
@@ -308,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
conn);
/* if usable snd_wnd closes ask peer to advertise once it opens again */
- conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ pflags->write_blocked = (to_send >= rmbespace);
/* cf. usable snd_wnd */
len = min(to_send, rmbespace);
@@ -333,12 +361,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
dst_len_sum = dst_len;
src_off = sent.count;
/* dst_len determines the maximum src_len */
- if (sent.count + dst_len <= conn->sndbuf_size) {
+ if (sent.count + dst_len <= conn->sndbuf_desc->len) {
/* unwrapped src case: single chunk of entire dst_len */
src_len = dst_len;
} else {
/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
- src_len = conn->sndbuf_size - sent.count;
+ src_len = conn->sndbuf_desc->len - sent.count;
}
src_len_sum = src_len;
dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
@@ -350,8 +378,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
num_sges++;
src_off += src_len;
- if (src_off >= conn->sndbuf_size)
- src_off -= conn->sndbuf_size;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
/* modulo in send ring */
if (src_len_sum == dst_len)
break; /* either on 1st or 2nd iteration */
@@ -369,10 +397,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
dst_len = len - dst_len; /* remainder */
dst_len_sum += dst_len;
src_len = min_t(int,
- dst_len, conn->sndbuf_size - sent.count);
+ dst_len, conn->sndbuf_desc->len - sent.count);
src_len_sum = src_len;
}
+ if (conn->urg_tx_pend && len == to_send)
+ pflags->urg_data_present = 1;
smc_tx_advance_cursors(conn, &prod, &sent, len);
/* update connection's cursors with advanced local cursors */
smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -392,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
*/
int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
+ struct smc_cdc_producer_flags *pflags;
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
int rc;
@@ -409,20 +440,27 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
}
rc = 0;
if (conn->alert_token_local) /* connection healthy */
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ mod_delayed_work(system_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
}
goto out_unlock;
}
- rc = smc_tx_rdma_writes(conn);
- if (rc) {
- smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
- (struct smc_wr_tx_pend_priv *)pend);
- goto out_unlock;
+ if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
+ rc = smc_tx_rdma_writes(conn);
+ if (rc) {
+ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
}
rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
out_unlock:
spin_unlock_bh(&conn->send_lock);
@@ -432,7 +470,7 @@ out_unlock:
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
-static void smc_tx_work(struct work_struct *work)
+void smc_tx_work(struct work_struct *work)
{
struct smc_connection *conn = container_of(to_delayed_work(work),
struct smc_connection,
@@ -455,7 +493,7 @@ out:
release_sock(&smc->sk);
}
-void smc_tx_consumer_update(struct smc_connection *conn)
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
{
union smc_host_cursor cfed, cons;
int to_confirm;
@@ -466,11 +504,12 @@ void smc_tx_consumer_update(struct smc_connection *conn)
smc_curs_write(&cfed,
smc_curs_read(&conn->rx_curs_confirmed, conn),
conn);
- to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+ to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ force ||
((to_confirm > conn->rmbe_update_limit) &&
- ((to_confirm > (conn->rmbe_size / 2)) ||
+ ((to_confirm > (conn->rmb_desc->len / 2)) ||
conn->local_rx_ctrl.prod_flags.write_blocked))) {
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
conn->alert_token_local) { /* connection healthy */
@@ -494,6 +533,4 @@ void smc_tx_consumer_update(struct smc_connection *conn)
void smc_tx_init(struct smc_sock *smc)
{
smc->sk.sk_write_space = smc_tx_write_space;
- INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
- spin_lock_init(&smc->conn.send_lock);
}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 78255964fa4d..9d2238909fa0 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -24,13 +24,14 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
- return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
+void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
-void smc_tx_consumer_update(struct smc_connection *conn);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1b8af23e6e2b..cc7c1bb60fe8 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
for (i = 0; i < num; i++) {
link = wc[i].qp->qp_context;
if (wc[i].status == IB_WC_SUCCESS) {
+ link->wr_rx_tstamp = jiffies;
smc_wr_rx_demultiplex(&wc[i]);
smc_wr_rx_post(link); /* refill WR RX */
} else {
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 092bebc70048..1a9695183599 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -512,6 +512,19 @@ int strp_init(struct strparser *strp, struct sock *sk,
}
EXPORT_SYMBOL_GPL(strp_init);
+/* Sock process lock held (lock_sock) */
+void __strp_unpause(struct strparser *strp)
+{
+ strp->paused = 0;
+
+ if (strp->need_bytes) {
+ if (strp_peek_len(strp) < strp->need_bytes)
+ return;
+ }
+ strp_read_sock(strp);
+}
+EXPORT_SYMBOL_GPL(__strp_unpause);
+
void strp_unpause(struct strparser *strp)
{
strp->paused = 0;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f7d47c89d658..2dfb492a7c94 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -697,6 +697,9 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
goto prop_msg_full;
+ if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP)
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu))
+ goto prop_msg_full;
nla_nest_end(msg->skb, prop);
@@ -979,12 +982,23 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
if (props[TIPC_NLA_PROP_TOL]) {
b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- tipc_node_apply_tolerance(net, b);
+ tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL);
}
if (props[TIPC_NLA_PROP_PRIO])
b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if (props[TIPC_NLA_PROP_MTU]) {
+ if (b->media->type_id != TIPC_MEDIA_TYPE_UDP)
+ return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (tipc_udp_mtu_bad(nla_get_u32
+ (props[TIPC_NLA_PROP_MTU])))
+ return -EINVAL;
+ b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+ tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU);
+#endif
+ }
}
return 0;
@@ -1029,6 +1043,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
goto prop_msg_full;
+ if (media->type_id == TIPC_MEDIA_TYPE_UDP)
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
+ goto prop_msg_full;
nla_nest_end(msg->skb, prop);
nla_nest_end(msg->skb, attrs);
@@ -1158,6 +1175,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if (props[TIPC_NLA_PROP_MTU]) {
+ if (m->type_id != TIPC_MEDIA_TYPE_UDP)
+ return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ if (tipc_udp_mtu_bad(nla_get_u32
+ (props[TIPC_NLA_PROP_MTU])))
+ return -EINVAL;
+ m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+#endif
+ }
}
return 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 6efcee63a381..394290cbbb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -94,6 +94,8 @@ struct tipc_bearer;
* @priority: default link (and bearer) priority
* @tolerance: default time (in ms) before declaring link failure
* @window: default window (in packets) before declaring link congestion
+ * @mtu: max packet size bearer can support for media type not dependent on
+ * underlying device MTU
* @type_id: TIPC media identifier
* @hwaddr_len: TIPC media address len
* @name: media name
@@ -118,6 +120,7 @@ struct tipc_media {
u32 priority;
u32 tolerance;
u32 window;
+ u32 mtu;
u32 type_id;
u32 hwaddr_len;
char name[TIPC_MAX_MEDIA_NAME];
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index dd1c4fa2eb78..bebe88cae07b 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -136,12 +136,12 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
}
/**
- * tipc_service_find_range - find service range matching a service instance
+ * tipc_service_first_range - find first service range in tree matching instance
*
* Very time-critical, so binary search through range rb tree
*/
-static struct service_range *tipc_service_find_range(struct tipc_service *sc,
- u32 instance)
+static struct service_range *tipc_service_first_range(struct tipc_service *sc,
+ u32 instance)
{
struct rb_node *n = sc->ranges.rb_node;
struct service_range *sr;
@@ -158,6 +158,30 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc,
return NULL;
}
+/* tipc_service_find_range - find service range matching publication parameters
+ */
+static struct service_range *tipc_service_find_range(struct tipc_service *sc,
+ u32 lower, u32 upper)
+{
+ struct rb_node *n = sc->ranges.rb_node;
+ struct service_range *sr;
+
+ sr = tipc_service_first_range(sc, lower);
+ if (!sr)
+ return NULL;
+
+ /* Look for exact match */
+ for (n = &sr->tree_node; n; n = rb_next(n)) {
+ sr = container_of(n, struct service_range, tree_node);
+ if (sr->upper == upper)
+ break;
+ }
+ if (!n || sr->lower != lower || sr->upper != upper)
+ return NULL;
+
+ return sr;
+}
+
static struct service_range *tipc_service_create_range(struct tipc_service *sc,
u32 lower, u32 upper)
{
@@ -238,54 +262,19 @@ err:
/**
* tipc_service_remove_publ - remove a publication from a service
*/
-static struct publication *tipc_service_remove_publ(struct net *net,
- struct tipc_service *sc,
- u32 lower, u32 upper,
- u32 node, u32 key,
- struct service_range **rng)
+static struct publication *tipc_service_remove_publ(struct service_range *sr,
+ u32 node, u32 key)
{
- struct tipc_subscription *sub, *tmp;
- struct service_range *sr;
struct publication *p;
- bool found = false;
- bool last = false;
- struct rb_node *n;
-
- sr = tipc_service_find_range(sc, lower);
- if (!sr)
- return NULL;
- /* Find exact matching service range */
- for (n = &sr->tree_node; n; n = rb_next(n)) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->upper == upper)
- break;
- }
- if (!n || sr->lower != lower || sr->upper != upper)
- return NULL;
-
- /* Find publication, if it exists */
list_for_each_entry(p, &sr->all_publ, all_publ) {
if (p->key != key || (node && node != p->node))
continue;
- found = true;
- break;
+ list_del(&p->all_publ);
+ list_del(&p->local_publ);
+ return p;
}
- if (!found)
- return NULL;
-
- list_del(&p->all_publ);
- list_del(&p->local_publ);
- if (list_empty(&sr->all_publ))
- last = true;
-
- /* Notify any waiting subscriptions */
- list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
- tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
- p->port, p->node, p->scope, last);
- }
- *rng = sr;
- return p;
+ return NULL;
}
/**
@@ -376,17 +365,31 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
u32 node, u32 key)
{
struct tipc_service *sc = tipc_service_find(net, type);
+ struct tipc_subscription *sub, *tmp;
struct service_range *sr = NULL;
struct publication *p = NULL;
+ bool last;
if (!sc)
return NULL;
spin_lock_bh(&sc->lock);
- p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+ sr = tipc_service_find_range(sc, lower, upper);
+ if (!sr)
+ goto exit;
+ p = tipc_service_remove_publ(sr, node, key);
+ if (!p)
+ goto exit;
+
+ /* Notify any waiting subscriptions */
+ last = list_empty(&sr->all_publ);
+ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
+ tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN,
+ p->port, node, p->scope, last);
+ }
/* Remove service range item if this was its last publication */
- if (sr && list_empty(&sr->all_publ)) {
+ if (list_empty(&sr->all_publ)) {
rb_erase(&sr->tree_node, &sc->ranges);
kfree(sr);
}
@@ -396,6 +399,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
hlist_del_init_rcu(&sc->service_list);
kfree_rcu(sc, rcu);
}
+exit:
spin_unlock_bh(&sc->lock);
return p;
}
@@ -437,7 +441,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
goto not_found;
spin_lock_bh(&sc->lock);
- sr = tipc_service_find_range(sc, instance);
+ sr = tipc_service_first_range(sc, instance);
if (unlikely(!sr))
goto no_match;
@@ -484,7 +488,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
spin_lock_bh(&sc->lock);
- sr = tipc_service_find_range(sc, instance);
+ sr = tipc_service_first_range(sc, instance);
if (!sr)
goto no_match;
@@ -756,8 +760,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
spin_lock_bh(&sc->lock);
rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
- tipc_service_remove_publ(net, sc, p->lower, p->upper,
- p->node, p->key, &sr);
+ tipc_service_remove_publ(sr, p->node, p->key);
kfree_rcu(p, rcu);
}
rb_erase(&sr->tree_node, &sc->ranges);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index f29549de9245..6a44eb812baf 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
return mtu;
}
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
+{
+ u8 *own_id = tipc_own_id(net);
+ struct tipc_node *n;
+
+ if (!own_id)
+ return true;
+
+ if (addr == tipc_own_addr(net)) {
+ memcpy(id, own_id, TIPC_NODEID_LEN);
+ return true;
+ }
+ n = tipc_node_find(net, addr);
+ if (!n)
+ return false;
+
+ memcpy(id, &n->peer_id, TIPC_NODEID_LEN);
+ tipc_node_put(n);
+ return true;
+}
+
u16 tipc_node_get_capabilities(struct net *net, u32 addr)
{
struct tipc_node *n;
@@ -1681,7 +1702,8 @@ discard:
kfree_skb(skb);
}
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
+ int prop)
{
struct tipc_net *tn = tipc_net(net);
int bearer_id = b->identity;
@@ -1696,8 +1718,13 @@ void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
list_for_each_entry_rcu(n, &tn->node_list, list) {
tipc_node_write_lock(n);
e = &n->links[bearer_id];
- if (e->link)
- tipc_link_set_tolerance(e->link, b->tolerance, &xmitq);
+ if (e->link) {
+ if (prop == TIPC_NLA_PROP_TOL)
+ tipc_link_set_tolerance(e->link, b->tolerance,
+ &xmitq);
+ else if (prop == TIPC_NLA_PROP_MTU)
+ tipc_link_set_mtu(e->link, b->mtu);
+ }
tipc_node_write_unlock(n);
tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index f24b83500df1..846c8f240872 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,6 +60,7 @@ enum {
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_bearer *bearer,
@@ -67,7 +68,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr);
void tipc_node_delete_links(struct net *net, int bearer_id);
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b);
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3bb45042e833..14a5d055717d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2970,7 +2970,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- struct sock *sk = sock->sk;
+ struct net *net = sock_net(sock->sk);
+ struct tipc_sioc_nodeid_req nr = {0};
struct tipc_sioc_ln_req lnr;
void __user *argp = (void __user *)arg;
@@ -2978,7 +2979,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCGETLINKNAME:
if (copy_from_user(&lnr, argp, sizeof(lnr)))
return -EFAULT;
- if (!tipc_node_get_linkname(sock_net(sk),
+ if (!tipc_node_get_linkname(net,
lnr.bearer_id & 0xffff, lnr.peer,
lnr.linkname, TIPC_MAX_LINK_NAME)) {
if (copy_to_user(argp, &lnr, sizeof(lnr)))
@@ -2986,6 +2987,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return 0;
}
return -EADDRNOTAVAIL;
+ case SIOCGETNODEID:
+ if (copy_from_user(&nr, argp, sizeof(nr)))
+ return -EFAULT;
+ if (!tipc_node_get_id(net, nr.peer, nr.node_id))
+ return -EADDRNOTAVAIL;
+ if (copy_to_user(argp, &nr, sizeof(nr)))
+ return -EFAULT;
+ return 0;
default:
return -ENOIOCTLCMD;
}
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e7d91f5d5cae..9783101bc4a9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
err = -EINVAL;
goto err;
}
- b->mtu = dev->mtu - sizeof(struct iphdr)
- - sizeof(struct udphdr);
+ b->mtu = b->media->mtu;
#if IS_ENABLED(CONFIG_IPV6)
} else if (local.proto == htons(ETH_P_IPV6)) {
udp_conf.family = AF_INET6;
@@ -803,6 +802,7 @@ struct tipc_media udp_media_info = {
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
.window = TIPC_DEF_LINK_WIN,
+ .mtu = TIPC_DEF_LINK_UDP_MTU,
.type_id = TIPC_MEDIA_TYPE_UDP,
.hwaddr_len = 0,
.name = "udp"
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 281bbae87726..e7455cc73e16 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -38,9 +38,23 @@
#ifndef _TIPC_UDP_MEDIA_H
#define _TIPC_UDP_MEDIA_H
+#include <linux/ip.h>
+#include <linux/udp.h>
+
int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
+/* check if configured MTU is too low for tipc headers */
+static inline bool tipc_udp_mtu_bad(u32 mtu)
+{
+ if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ return false;
+
+ pr_warn("MTU too low for tipc bearer\n");
+ return true;
+}
+
#endif
#endif
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 89b8745a986f..73f05ece53d0 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -14,3 +14,13 @@ config TLS
encryption handling of the TLS protocol to be done in-kernel.
If unsure, say N.
+
+config TLS_DEVICE
+ bool "Transport Layer Security HW offload"
+ depends on TLS
+ select SOCK_VALIDATE_XMIT
+ default n
+ help
+ Enable kernel support for HW offload of the TLS protocol.
+
+ If unsure, say N.
diff --git a/net/tls/Makefile b/net/tls/Makefile
index a930fd1c4f7b..4d6b728a67d0 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -5,3 +5,5 @@
obj-$(CONFIG_TLS) += tls.o
tls-y := tls_main.o tls_sw.o
+
+tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
new file mode 100644
index 000000000000..a7a8f8e20ff3
--- /dev/null
+++ b/net/tls/tls_device.c
@@ -0,0 +1,766 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <crypto/aead.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/dst.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+
+/* device_offload_lock is used to synchronize tls_dev_add
+ * against NETDEV_DOWN notifications.
+ */
+static DECLARE_RWSEM(device_offload_lock);
+
+static void tls_device_gc_task(struct work_struct *work);
+
+static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
+static LIST_HEAD(tls_device_gc_list);
+static LIST_HEAD(tls_device_list);
+static DEFINE_SPINLOCK(tls_device_lock);
+
+/* Free the offload context attached to @ctx, then @ctx itself. */
+static void tls_device_free_ctx(struct tls_context *ctx)
+{
+	kfree(tls_offload_ctx(ctx));
+	kfree(ctx);
+}
+
+/* Work handler: tear down every context queued on tls_device_gc_list.
+ * For contexts still bound to a netdev, the driver's tls_dev_del is
+ * called and the device reference dropped before the context is freed.
+ */
+static void tls_device_gc_task(struct work_struct *work)
+{
+	struct tls_context *entry, *next;
+	unsigned long irq_flags;
+	LIST_HEAD(pending);
+
+	/* Detach the queued contexts; the rest runs without the lock. */
+	spin_lock_irqsave(&tls_device_lock, irq_flags);
+	list_splice_init(&tls_device_gc_list, &pending);
+	spin_unlock_irqrestore(&tls_device_lock, irq_flags);
+
+	list_for_each_entry_safe(entry, next, &pending, list) {
+		struct net_device *dev = entry->netdev;
+
+		if (dev) {
+			dev->tlsdev_ops->tls_dev_del(dev, entry,
+						     TLS_OFFLOAD_CTX_DIR_TX);
+			dev_put(dev);
+		}
+
+		list_del(&entry->list);
+		tls_device_free_ctx(entry);
+	}
+}
+
+/* Move @ctx onto tls_device_gc_list and schedule tls_device_gc_work,
+ * both while holding tls_device_lock. The actual teardown happens later
+ * in tls_device_gc_task().
+ */
+static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tls_device_lock, flags);
+ list_move_tail(&ctx->list, &tls_device_gc_list);
+
+ /* schedule_work inside the spinlock
+ * to make sure tls_device_down waits for that work.
+ */
+ schedule_work(&tls_device_gc_work);
+
+ spin_unlock_irqrestore(&tls_device_lock, flags);
+}
+
+/* Return the net_device of the socket's cached dst with a reference
+ * held on it, or NULL when the socket has no dst.
+ * We assume that the socket is already connected.
+ */
+static struct net_device *get_netdev_for_sock(struct sock *sk)
+{
+	struct net_device *dev = NULL;
+	struct dst_entry *route;
+
+	route = sk_dst_get(sk);
+	if (likely(route)) {
+		dev = route->dev;
+		dev_hold(dev);
+	}
+	dst_release(route);
+
+	return dev;
+}
+
+/* Drop the page reference of every fragment in @record, walking from the
+ * last fragment down to the first, then free the record.
+ */
+static void destroy_record(struct tls_record_info *record)
+{
+	int idx;
+
+	for (idx = record->num_frags - 1; idx >= 0; idx--)
+		__skb_frag_unref(&record->frags[idx]);
+
+	kfree(record);
+}
+
+/* Unlink and destroy every record on the context's records_list and
+ * clear the retransmit hint, which would otherwise dangle.
+ */
+static void delete_all_records(struct tls_offload_context *offload_ctx)
+{
+	struct list_head *head = &offload_ctx->records_list;
+	struct tls_record_info *rec;
+
+	while (!list_empty(head)) {
+		rec = list_first_entry(head, struct tls_record_info, list);
+		list_del(&rec->list);
+		destroy_record(rec);
+	}
+
+	offload_ctx->retransmit_hint = NULL;
+}
+
+/* Destroy the records whose end_seq is not after @acked_seq and advance
+ * unacked_record_sn by the number of records destroyed.
+ *
+ * Installed with clean_acked_data_enable() in tls_set_device_offload().
+ * The whole walk runs under the offload context's spinlock.
+ */
+static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_record_info *info, *temp;
+ struct tls_offload_context *ctx;
+ u64 deleted_records = 0;
+ unsigned long flags;
+
+ if (!tls_ctx)
+ return;
+
+ ctx = tls_offload_ctx(tls_ctx);
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ /* Handle the cached hint first: if its record is acked, clear the
+ * hint and destroy that record so the hint never dangles.
+ */
+ info = ctx->retransmit_hint;
+ if (info && !before(acked_seq, info->end_seq)) {
+ ctx->retransmit_hint = NULL;
+ list_del(&info->list);
+ destroy_record(info);
+ deleted_records++;
+ }
+
+ /* Stop at the first record that ends after acked_seq.
+ * NOTE(review): this relies on records_list being ordered by
+ * sequence number (tls_push_record appends with list_add_tail).
+ */
+ list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
+ if (before(acked_seq, info->end_seq))
+ break;
+ list_del(&info->list);
+
+ destroy_record(info);
+ deleted_records++;
+ }
+
+ ctx->unacked_record_sn += deleted_records;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/* At this point, there should be no references on this
+ * socket and no in-flight SKBs associated with this
+ * socket, so it is safe to free all the resources.
+ *
+ * Installed as sk->sk_destruct by tls_set_device_offload(). Destroys the
+ * open record and all closed records, frees the fallback AEAD transform,
+ * calls the sk_destruct saved in the offload context, disables the
+ * clean-acked hook and drops one reference on the TLS context.
+ */
+void tls_device_sk_destruct(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+
+ if (ctx->open_record)
+ destroy_record(ctx->open_record);
+
+ delete_all_records(ctx);
+ crypto_free_aead(ctx->aead_send);
+ /* chain to the destructor that was in place before offload */
+ ctx->sk_destruct(sk);
+ clean_acked_data_disable(inet_csk(sk));
+
+ /* last reference: hand the context to the gc work for teardown */
+ if (refcount_dec_and_test(&tls_ctx->refcount))
+ tls_device_queue_ctx_destruction(tls_ctx);
+}
+EXPORT_SYMBOL(tls_device_sk_destruct);
+
+/* Account @size bytes at the current position of @pfrag to @record.
+ *
+ * If those bytes directly follow the record's last fragment in the same
+ * page, that fragment is grown; otherwise a new fragment is started and a
+ * page reference is taken for it. @pfrag's offset and the record length
+ * are advanced by @size in both cases.
+ */
+static void tls_append_frag(struct tls_record_info *record,
+			    struct page_frag *pfrag,
+			    int size)
+{
+	skb_frag_t *last = &record->frags[record->num_frags - 1];
+	bool contiguous = last->page.p == pfrag->page &&
+			  last->page_offset + last->size == pfrag->offset;
+
+	if (!contiguous) {
+		skb_frag_t *fresh = last + 1;
+
+		fresh->page.p = pfrag->page;
+		fresh->page_offset = pfrag->offset;
+		fresh->size = size;
+		record->num_frags++;
+		get_page(pfrag->page);
+	} else {
+		last->size += size;
+	}
+
+	pfrag->offset += size;
+	record->len += size;
+}
+
+/* Close @record and hand it to the TCP layer.
+ *
+ * Writes the TLS prepend into the first fragment, appends tag_size bytes
+ * as a placeholder for the authentication tag, computes the record's
+ * end_seq, links it onto records_list, advances the TX record sequence
+ * number and finally pushes the fragments through tls_push_sg().
+ *
+ * @pfrag is not used by this function.
+ * Returns the result of tls_push_sg().
+ */
+static int tls_push_record(struct sock *sk,
+ struct tls_context *ctx,
+ struct tls_offload_context *offload_ctx,
+ struct tls_record_info *record,
+ struct page_frag *pfrag,
+ int flags,
+ unsigned char record_type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct page_frag dummy_tag_frag;
+ skb_frag_t *frag;
+ int i;
+
+ /* fill prepend */
+ frag = &record->frags[0];
+ tls_fill_prepend(ctx,
+ skb_frag_address(frag),
+ record->len - ctx->tx.prepend_size,
+ record_type);
+
+ /* HW doesn't care about the data in the tag, because it fills it. */
+ dummy_tag_frag.page = skb_frag_page(frag);
+ dummy_tag_frag.offset = 0;
+
+ tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size);
+ record->end_seq = tp->write_seq + record->len;
+ /* records_list is also walked by tls_icsk_clean_acked under this lock */
+ spin_lock_irq(&offload_ctx->lock);
+ list_add_tail(&record->list, &offload_ctx->records_list);
+ spin_unlock_irq(&offload_ctx->lock);
+ offload_ctx->open_record = NULL;
+ set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+ tls_advance_record_sn(sk, &ctx->tx);
+
+ /* Mirror the record's fragments into sg_tx_data; each entry is
+ * charged to the socket and gets its own page reference.
+ */
+ for (i = 0; i < record->num_frags; i++) {
+ frag = &record->frags[i];
+ sg_unmark_end(&offload_ctx->sg_tx_data[i]);
+ sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
+ frag->size, frag->page_offset);
+ sk_mem_charge(sk, frag->size);
+ get_page(skb_frag_page(frag));
+ }
+ sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
+
+ /* all ready, send */
+ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
+}
+
+/* Allocate a record and make it the context's open record.
+ *
+ * The first fragment is set to cover @prepend_size bytes at the current
+ * position of @pfrag (a page reference is taken and @pfrag's offset is
+ * advanced), reserving room for the TLS prepend.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+ struct page_frag *pfrag,
+ size_t prepend_size)
+{
+ struct tls_record_info *record;
+ skb_frag_t *frag;
+
+ /* kmalloc does not zero: only the fields set below are initialised */
+ record = kmalloc(sizeof(*record), GFP_KERNEL);
+ if (!record)
+ return -ENOMEM;
+
+ frag = &record->frags[0];
+ __skb_frag_set_page(frag, pfrag->page);
+ frag->page_offset = pfrag->offset;
+ skb_frag_size_set(frag, prepend_size);
+
+ get_page(pfrag->page);
+ pfrag->offset += prepend_size;
+
+ record->num_frags = 1;
+ record->len = prepend_size;
+ offload_ctx->open_record = record;
+ return 0;
+}
+
+/* Make sure there is an open record and page-frag space to copy into.
+ *
+ * When no record is open, @pfrag is refilled with at least @prepend_size
+ * bytes and a new record is created there. If @pfrag still has room after
+ * the prepend, that is enough. Otherwise (or when a record was already
+ * open) @pfrag is refilled through sk_page_frag_refill().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int tls_do_allocation(struct sock *sk,
+ struct tls_offload_context *offload_ctx,
+ struct page_frag *pfrag,
+ size_t prepend_size)
+{
+ int ret;
+
+ if (!offload_ctx->open_record) {
+ if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
+ sk->sk_allocation))) {
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return -ENOMEM;
+ }
+
+ ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
+ if (ret)
+ return ret;
+
+ /* space left in the frag after the prepend: nothing more to do */
+ if (pfrag->size > pfrag->offset)
+ return 0;
+ }
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Copy up to @size bytes from @msg_iter into TLS records and push them.
+ *
+ * Data is copied into the socket's page frag and attached to the open
+ * record. A record is pushed when it reaches the maximum length, when it
+ * has MAX_SKB_FRAGS - 1 fragments, or when the last byte has been copied
+ * and neither MSG_MORE nor MSG_SENDPAGE_NOTLAST is set.
+ *
+ * @flags: only MSG_MORE, MSG_DONTWAIT, MSG_NOSIGNAL and
+ * MSG_SENDPAGE_NOTLAST are accepted; anything else yields -ENOTSUPP.
+ * @record_type: TLS record type written into each record's header.
+ *
+ * Returns the number of bytes consumed if that is greater than zero,
+ * otherwise the last status code (0 or a negative errno).
+ */
+static int tls_push_data(struct sock *sk,
+ struct iov_iter *msg_iter,
+ size_t size, int flags,
+ unsigned char record_type)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
+ int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
+ struct tls_record_info *record = ctx->open_record;
+ struct page_frag *pfrag;
+ size_t orig_size = size;
+ u32 max_open_record_len;
+ int copy, rc = 0;
+ bool done = false;
+ long timeo;
+
+ if (flags &
+ ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
+ return -ENOTSUPP;
+
+ if (sk->sk_err)
+ return -sk->sk_err;
+
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
+ if (rc < 0)
+ return rc;
+
+ pfrag = sk_page_frag(sk);
+
+ /* TLS_HEADER_SIZE is not counted as part of the TLS record, and
+ * we need to leave room for an authentication tag.
+ */
+ max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ tls_ctx->tx.prepend_size;
+ do {
+ rc = tls_do_allocation(sk, ctx, pfrag,
+ tls_ctx->tx.prepend_size);
+ if (rc) {
+ /* out of memory: wait for space and retry the allocation */
+ rc = sk_stream_wait_memory(sk, &timeo);
+ if (!rc)
+ continue;
+
+ record = ctx->open_record;
+ if (!record)
+ break;
+ /* also entered by goto when the copy from the iterator fails */
+handle_error:
+ if (record_type != TLS_RECORD_TYPE_DATA) {
+ /* avoid sending partial
+ * record with type !=
+ * application_data
+ */
+ size = orig_size;
+ destroy_record(record);
+ ctx->open_record = NULL;
+ } else if (record->len > tls_ctx->tx.prepend_size) {
+ /* the open record already holds payload: flush it */
+ goto last_record;
+ }
+
+ break;
+ }
+
+ record = ctx->open_record;
+ /* bounded by the remaining input, frag space and record space */
+ copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
+ copy = min_t(size_t, copy, (max_open_record_len - record->len));
+
+ if (copy_from_iter_nocache(page_address(pfrag->page) +
+ pfrag->offset,
+ copy, msg_iter) != copy) {
+ rc = -EFAULT;
+ goto handle_error;
+ }
+ tls_append_frag(record, pfrag, copy);
+
+ size -= copy;
+ if (!size) {
+last_record:
+ /* final push uses the caller's flags unmodified */
+ tls_push_record_flags = flags;
+ if (more) {
+ /* caller promised more data: keep the record open */
+ tls_ctx->pending_open_record_frags =
+ record->num_frags;
+ break;
+ }
+
+ done = true;
+ }
+
+ if (done || record->len >= max_open_record_len ||
+ (record->num_frags >= MAX_SKB_FRAGS - 1)) {
+ rc = tls_push_record(sk,
+ tls_ctx,
+ ctx,
+ record,
+ pfrag,
+ tls_push_record_flags,
+ record_type);
+ if (rc < 0)
+ break;
+ }
+ } while (!done);
+
+ /* report progress in preference to the status code */
+ if (orig_size - size > 0)
+ rc = orig_size - size;
+
+ return rc;
+}
+
+/* sendmsg handler for device-offloaded TLS sockets.
+ *
+ * With the socket locked, parses any control message to obtain the record
+ * type (default TLS_RECORD_TYPE_DATA) and then pushes the payload with
+ * tls_push_data(). Returns tls_push_data()'s result, or the control
+ * message parsing error.
+ */
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	int rc = 0;
+
+	lock_sock(sk);
+
+	if (unlikely(msg->msg_controllen))
+		rc = tls_proccess_cmsg(sk, msg, &record_type);
+
+	if (!rc)
+		rc = tls_push_data(sk, &msg->msg_iter, size,
+				   msg->msg_flags, record_type);
+
+	release_sock(sk);
+	return rc;
+}
+
+/* sendpage handler for device-offloaded TLS sockets.
+ *
+ * Maps @page, wraps @size bytes at @offset in a kvec iterator and pushes
+ * them as application data through tls_push_data().
+ * MSG_SENDPAGE_NOTLAST is treated as MSG_MORE.
+ *
+ * Returns tls_push_data()'s result, or -ENOTSUPP when MSG_OOB is set.
+ */
+int tls_device_sendpage(struct sock *sk, struct page *page,
+			int offset, size_t size, int flags)
+{
+	struct iov_iter msg_iter;
+	struct kvec iov;
+	char *kaddr;
+	int rc;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	lock_sock(sk);
+
+	if (flags & MSG_OOB) {
+		rc = -ENOTSUPP;
+		goto out;
+	}
+
+	/* Map the page only once it is certain to be used, so that the
+	 * kmap() is always paired with the kunmap() below; mapping it
+	 * before the flag check leaked the mapping on the MSG_OOB path.
+	 */
+	kaddr = kmap(page);
+	iov.iov_base = kaddr + offset;
+	iov.iov_len = size;
+	iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
+	rc = tls_push_data(sk, &msg_iter, size,
+			   flags, TLS_RECORD_TYPE_DATA);
+	kunmap(page);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/* Find the record that contains TCP sequence number @seq.
+ *
+ * The search starts at the cached retransmit hint unless there is no
+ * hint or @seq lies before the hint's first byte; in that case it starts
+ * at the head of records_list. On a match the record's sequence number
+ * is stored in *@p_record_sn, and the hint is moved forward if the match
+ * ends after the current hint.
+ *
+ * Returns the matching record, or NULL if no record ends after @seq.
+ * NOTE(review): fill_sg_in() calls this with context->lock held; confirm
+ * that other callers do the same.
+ */
+struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+ u32 seq, u64 *p_record_sn)
+{
+ u64 record_sn = context->hint_record_sn;
+ struct tls_record_info *info;
+
+ info = context->retransmit_hint;
+ if (!info ||
+ before(seq, info->end_seq - info->len)) {
+ /* if retransmit_hint is irrelevant start
+ * from the beginning of the list
+ */
+ info = list_first_entry(&context->records_list,
+ struct tls_record_info, list);
+ record_sn = context->unacked_record_sn;
+ }
+
+ list_for_each_entry_from(info, &context->records_list, list) {
+ if (before(seq, info->end_seq)) {
+ /* only ever move the hint forward in sequence space */
+ if (!context->retransmit_hint ||
+ after(info->end_seq,
+ context->retransmit_hint->end_seq)) {
+ context->hint_record_sn = record_sn;
+ context->retransmit_hint = info;
+ }
+ *p_record_sn = record_sn;
+ return info;
+ }
+ record_sn++;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(tls_get_record);
+
+/* Flush the currently open record by pushing zero bytes of new data.
+ * Installed as ctx->push_pending_record in tls_set_device_offload().
+ */
+static int tls_device_push_pending_record(struct sock *sk, int flags)
+{
+	struct iov_iter empty_iter;
+
+	iov_iter_kvec(&empty_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+
+	return tls_push_data(sk, &empty_iter, 0, flags, TLS_RECORD_TYPE_DATA);
+}
+
+/* Set up TX TLS offload to the socket's net_device.
+ *
+ * Allocates the offload context and a zero-length "start marker" record,
+ * copies IV and record sequence number from ctx->crypto_send, prepares
+ * the software fallback AEAD, then asks the device driver (tls_dev_add)
+ * to take over the connection. On success the context is added to
+ * tls_device_list and the socket's sk_validate_xmit_skb and sk_destruct
+ * hooks are replaced.
+ *
+ * Returns 0 on success. Errors: -EINVAL (no ctx, unsupported cipher, no
+ * netdev, device down), -EEXIST (offload already set), -ENOMEM,
+ * -ENOTSUPP (device lacks NETIF_F_HW_TLS_TX), or the error returned by
+ * tls_sw_fallback_init() / the driver's tls_dev_add.
+ */
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+ u16 nonce_size, tag_size, iv_size, rec_seq_size;
+ struct tls_record_info *start_marker_record;
+ struct tls_offload_context *offload_ctx;
+ struct tls_crypto_info *crypto_info;
+ struct net_device *netdev;
+ char *iv, *rec_seq;
+ struct sk_buff *skb;
+ int rc = -EINVAL;
+ __be64 rcd_sn;
+
+ if (!ctx)
+ goto out;
+
+ if (ctx->priv_ctx_tx) {
+ rc = -EEXIST;
+ goto out;
+ }
+
+ start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
+ if (!start_marker_record) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+ if (!offload_ctx) {
+ rc = -ENOMEM;
+ goto free_marker_record;
+ }
+
+ /* only AES-GCM-128 is handled; any other cipher is rejected */
+ crypto_info = &ctx->crypto_send;
+ switch (crypto_info->cipher_type) {
+ case TLS_CIPHER_AES_GCM_128:
+ nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
+ rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
+ rec_seq =
+ ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
+ break;
+ default:
+ rc = -EINVAL;
+ goto free_offload_ctx;
+ }
+
+ ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
+ ctx->tx.tag_size = tag_size;
+ ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
+ ctx->tx.iv_size = iv_size;
+ ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+ GFP_KERNEL);
+ if (!ctx->tx.iv) {
+ rc = -ENOMEM;
+ goto free_offload_ctx;
+ }
+
+ /* the IV is stored after a salt-sized gap; the gap is not filled here */
+ memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+
+ ctx->tx.rec_seq_size = rec_seq_size;
+ ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ if (!ctx->tx.rec_seq) {
+ rc = -ENOMEM;
+ goto free_iv;
+ }
+ memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
+
+ rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
+ if (rc)
+ goto free_rec_seq;
+
+ /* start at rec_seq - 1 to account for the start marker record */
+ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn));
+ offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1;
+
+ /* the marker is an empty record ending at the current write_seq */
+ start_marker_record->end_seq = tcp_sk(sk)->write_seq;
+ start_marker_record->len = 0;
+ start_marker_record->num_frags = 0;
+
+ INIT_LIST_HEAD(&offload_ctx->records_list);
+ list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
+ spin_lock_init(&offload_ctx->lock);
+ sg_init_table(offload_ctx->sg_tx_data,
+ ARRAY_SIZE(offload_ctx->sg_tx_data));
+
+ clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
+ ctx->push_pending_record = tls_device_push_pending_record;
+ /* remember the previous destructor; tls_device_sk_destruct calls it */
+ offload_ctx->sk_destruct = sk->sk_destruct;
+
+ /* TLS offload is greatly simplified if we don't send
+ * SKBs where only part of the payload needs to be encrypted.
+ * So mark the last skb in the write queue as end of record.
+ */
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ TCP_SKB_CB(skb)->eor = 1;
+
+ refcount_set(&ctx->refcount, 1);
+
+ /* We support starting offload on multiple sockets
+ * concurrently, so we only need a read lock here.
+ * This lock must precede get_netdev_for_sock to prevent races between
+ * NETDEV_DOWN and setsockopt.
+ */
+ down_read(&device_offload_lock);
+ netdev = get_netdev_for_sock(sk);
+ if (!netdev) {
+ pr_err_ratelimited("%s: netdev not found\n", __func__);
+ rc = -EINVAL;
+ goto release_lock;
+ }
+
+ if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
+ rc = -ENOTSUPP;
+ goto release_netdev;
+ }
+
+ /* Avoid offloading if the device is down
+ * We don't want to offload new flows after
+ * the NETDEV_DOWN event
+ */
+ if (!(netdev->flags & IFF_UP)) {
+ rc = -EINVAL;
+ goto release_netdev;
+ }
+
+ /* set before tls_dev_add; reset to NULL on the error path below */
+ ctx->priv_ctx_tx = offload_ctx;
+ rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
+ &ctx->crypto_send,
+ tcp_sk(sk)->write_seq);
+ if (rc)
+ goto release_netdev;
+
+ /* the device reference from get_netdev_for_sock is kept in ctx */
+ ctx->netdev = netdev;
+
+ spin_lock_irq(&tls_device_lock);
+ list_add_tail(&ctx->list, &tls_device_list);
+ spin_unlock_irq(&tls_device_lock);
+
+ sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
+ /* following this assignment tls_is_sk_tx_device_offloaded
+ * will return true and the context might be accessed
+ * by the netdev's xmit function.
+ */
+ smp_store_release(&sk->sk_destruct,
+ &tls_device_sk_destruct);
+ up_read(&device_offload_lock);
+ goto out;
+
+ /* error unwinding: each label releases what was acquired before it */
+release_netdev:
+ dev_put(netdev);
+release_lock:
+ up_read(&device_offload_lock);
+ clean_acked_data_disable(inet_csk(sk));
+ crypto_free_aead(offload_ctx->aead_send);
+free_rec_seq:
+ kfree(ctx->tx.rec_seq);
+free_iv:
+ kfree(ctx->tx.iv);
+free_offload_ctx:
+ kfree(offload_ctx);
+ ctx->priv_ctx_tx = NULL;
+free_marker_record:
+ kfree(start_marker_record);
+out:
+ return rc;
+}
+
+/* Remove TLS offload from every context bound to @netdev.
+ *
+ * Holds device_offload_lock for writing while it collects the affected
+ * contexts (taking a reference on each), calls the driver's tls_dev_del
+ * for them, clears their netdev pointer and drops the device reference.
+ * Finally waits for pending gc work. Always returns NOTIFY_DONE.
+ */
+static int tls_device_down(struct net_device *netdev)
+{
+ struct tls_context *ctx, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ /* Request a write lock to block new offload attempts */
+ down_write(&device_offload_lock);
+
+ spin_lock_irqsave(&tls_device_lock, flags);
+ list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
+ /* skip other devices and contexts whose refcount already hit zero */
+ if (ctx->netdev != netdev ||
+ !refcount_inc_not_zero(&ctx->refcount))
+ continue;
+
+ list_move(&ctx->list, &list);
+ }
+ spin_unlock_irqrestore(&tls_device_lock, flags);
+
+ list_for_each_entry_safe(ctx, tmp, &list, list) {
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ ctx->netdev = NULL;
+ dev_put(netdev);
+ list_del_init(&ctx->list);
+
+ /* drop the reference taken above; free if it was the last one */
+ if (refcount_dec_and_test(&ctx->refcount))
+ tls_device_free_ctx(ctx);
+ }
+
+ up_write(&device_offload_lock);
+
+ flush_work(&tls_device_gc_work);
+
+ return NOTIFY_DONE;
+}
+
+/* Netdevice notifier callback.
+ *
+ * Devices without NETIF_F_HW_TLS_TX are ignored. On NETDEV_REGISTER and
+ * NETDEV_FEAT_CHANGE the device must provide tlsdev_ops with both
+ * tls_dev_add and tls_dev_del, otherwise NOTIFY_BAD is returned. On
+ * NETDEV_DOWN the result of tls_device_down() is returned. Every other
+ * case yields NOTIFY_DONE.
+ */
+static int tls_dev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (!(dev->features & NETIF_F_HW_TLS_TX))
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_DOWN)
+		return tls_device_down(dev);
+
+	if (event == NETDEV_REGISTER || event == NETDEV_FEAT_CHANGE) {
+		bool ops_complete = dev->tlsdev_ops &&
+				    dev->tlsdev_ops->tls_dev_add &&
+				    dev->tlsdev_ops->tls_dev_del;
+
+		return ops_complete ? NOTIFY_DONE : NOTIFY_BAD;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to tls_dev_event(). */
+static struct notifier_block tls_dev_notifier = {
+ .notifier_call = tls_dev_event,
+};
+
+/* Register the TLS netdevice notifier.
+ * NOTE(review): the return value of register_netdevice_notifier() is
+ * not checked here.
+ */
+void __init tls_device_init(void)
+{
+ register_netdevice_notifier(&tls_dev_notifier);
+}
+
+/* Unregister the TLS netdevice notifier and wait for any queued context
+ * destruction work to finish.
+ */
+void __exit tls_device_cleanup(void)
+{
+ unregister_netdevice_notifier(&tls_dev_notifier);
+ flush_work(&tls_device_gc_work);
+}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
new file mode 100644
index 000000000000..748914abdb60
--- /dev/null
+++ b/net/tls/tls_device_fallback.c
@@ -0,0 +1,450 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/tls.h>
+#include <crypto/aead.h>
+#include <crypto/scatterwalk.h>
+#include <net/ip6_checksum.h>
+
+/* Point @sg at the not-yet-consumed part of the scatterlist entry that
+ * @walk is currently positioned in, then chain the entry that follows
+ * it via scatterwalk_crypto_chain().
+ */
+static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
+{
+	struct scatterlist *cur = walk->sg;
+	int consumed = walk->offset - cur->offset;
+
+	sg_set_page(sg, sg_page(cur), cur->length - consumed, walk->offset);
+	scatterwalk_crypto_chain(sg, sg_next(cur), 0, 2);
+}
+
+/* Encrypt one TLS record from the @in walk into the @out walk.
+ *
+ * The record's header and explicit IV are copied through unchanged; the
+ * record length and type are read from that header to build the AAD, and
+ * the explicit IV is placed after the salt in @iv. *@in_len is the number
+ * of input bytes still available and is decremented by what this record
+ * consumes; a record that is only partly present is encrypted as far as
+ * the input reaches.
+ *
+ * Returns 0 when the input ends within the header, otherwise the result
+ * of crypto_aead_encrypt().
+ */
+static int tls_enc_record(struct aead_request *aead_req,
+ struct crypto_aead *aead, char *aad,
+ char *iv, __be64 rcd_sn,
+ struct scatter_walk *in,
+ struct scatter_walk *out, int *in_len)
+{
+ unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
+ struct scatterlist sg_in[3];
+ struct scatterlist sg_out[3];
+ u16 len;
+ int rc;
+
+ len = min_t(int, *in_len, ARRAY_SIZE(buf));
+
+ /* copy header + explicit IV from the input to the output via buf */
+ scatterwalk_copychunks(buf, in, len, 0);
+ scatterwalk_copychunks(buf, out, len, 1);
+
+ *in_len -= len;
+ if (!*in_len)
+ return 0;
+
+ scatterwalk_pagedone(in, 0, 1);
+ scatterwalk_pagedone(out, 1, 1);
+
+ /* header bytes 3 and 4 hold the record length, high byte first */
+ len = buf[4] | (buf[3] << 8);
+ len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
+
+ tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
+ (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
+
+ memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
+ TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+ /* entry 0 is the AAD; entries 1.. continue at the walk positions */
+ sg_init_table(sg_in, ARRAY_SIZE(sg_in));
+ sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+ sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE);
+ sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE);
+ chain_to_walk(sg_in + 1, in);
+ chain_to_walk(sg_out + 1, out);
+
+ *in_len -= len;
+ if (*in_len < 0) {
+ *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ /* the input buffer doesn't contain the entire record.
+ * trim len accordingly. The resulting authentication tag
+ * will contain garbage, but we don't care, so we won't
+ * include any of it in the output skb
+ * Note that we assume the output buffer length
+ * is larger than input buffer length + tag size
+ */
+ if (*in_len < 0)
+ len += *in_len;
+
+ *in_len = 0;
+ }
+
+ /* more records follow: advance both walks past this record */
+ if (*in_len) {
+ scatterwalk_copychunks(NULL, in, len, 2);
+ scatterwalk_pagedone(in, 0, 1);
+ scatterwalk_copychunks(NULL, out, len, 2);
+ scatterwalk_pagedone(out, 1, 1);
+ }
+
+ /* the crypt length passed to the AEAD excludes the tag */
+ len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv);
+
+ rc = crypto_aead_encrypt(aead_req);
+
+ return rc;
+}
+
+/* Bind @aead_req to the transform @aead and set its associated-data
+ * length to TLS_AAD_SPACE_SIZE.
+ */
+static void tls_init_aead_request(struct aead_request *aead_req,
+ struct crypto_aead *aead)
+{
+ aead_request_set_tfm(aead_req, aead);
+ aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+}
+
+/* Allocate a zeroed AEAD request large enough for @aead's request
+ * context and initialise it for that transform.
+ * Returns NULL if the allocation fails.
+ */
+static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead,
+						   gfp_t flags)
+{
+	unsigned int total = sizeof(struct aead_request) +
+			     crypto_aead_reqsize(aead);
+	struct aead_request *req = kzalloc(total, flags);
+
+	if (!req)
+		return NULL;
+
+	tls_init_aead_request(req, aead);
+	return req;
+}
+
+/* Encrypt consecutive TLS records from @sg_in into @sg_out, starting
+ * with record sequence number @rcd_sn, until @len input bytes have been
+ * consumed or tls_enc_record() reports an error.
+ * Returns the status of the last tls_enc_record() call.
+ */
+static int tls_enc_records(struct aead_request *aead_req,
+			   struct crypto_aead *aead, struct scatterlist *sg_in,
+			   struct scatterlist *sg_out, char *aad, char *iv,
+			   u64 rcd_sn, int len)
+{
+	struct scatter_walk src_walk, dst_walk;
+	int status;
+
+	scatterwalk_start(&src_walk, sg_in);
+	scatterwalk_start(&dst_walk, sg_out);
+
+	for (;;) {
+		status = tls_enc_record(aead_req, aead, aad, iv,
+					cpu_to_be64(rcd_sn), &src_walk,
+					&dst_walk, &len);
+		rcd_sn++;
+
+		if (status != 0 || !len)
+			break;
+	}
+
+	scatterwalk_done(&src_walk, 0, 0);
+	scatterwalk_done(&dst_walk, 1, 0);
+
+	return status;
+}
+
+/* Prepare @skb for checksum offload after its payload was rewritten.
+ * @headln is the number of header bytes preceding the TCP payload.
+ *
+ * icsk->icsk_af_ops->send_check can't be used here because the ip
+ * addresses might have been changed by NAT.
+ */
+static void update_chksum(struct sk_buff *skb, int headln)
+{
+	struct tcphdr *tcph = tcp_hdr(skb);
+	int payload = skb->len - headln;
+
+	/* Only the payload changed, so a skb that already uses
+	 * CHECKSUM_PARTIAL needs no update at all.
+	 */
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
+		return;
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+
+	if (skb->sk->sk_family != AF_INET6) {
+		const struct iphdr *ip4 = ip_hdr(skb);
+
+		tcph->check = ~csum_tcpudp_magic(ip4->saddr, ip4->daddr,
+						 payload, IPPROTO_TCP, 0);
+	} else {
+		const struct ipv6hdr *ip6 = ipv6_hdr(skb);
+
+		tcph->check = ~csum_ipv6_magic(&ip6->saddr, &ip6->daddr,
+					       payload, IPPROTO_TCP, 0);
+	}
+}
+
+/* Finish the replacement skb @nskb so it can be sent instead of @skb.
+ *
+ * Copies the skb header state and the first @headln bytes (the protocol
+ * headers) from @skb, fixes up the checksum fields, and moves socket
+ * ownership (sk and destructor) from @skb to @nskb, adjusting the
+ * socket's sk_wmem_alloc by the truesize difference.
+ */
+static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
+{
+ skb_copy_header(nskb, skb);
+
+ skb_put(nskb, skb->len);
+ memcpy(nskb->data, skb->data, headln);
+ update_chksum(nskb, headln);
+
+ /* hand ownership over; the old skb must no longer run the destructor */
+ nskb->destructor = skb->destructor;
+ nskb->sk = skb->sk;
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ refcount_add(nskb->truesize - skb->truesize,
+ &nskb->sk->sk_wmem_alloc);
+}
+
+/* This function may be called after the user socket is already
+ * closed so make sure we don't use anything freed during
+ * tls_sk_proto_close here
+ *
+ * Build the input scatterlist for re-encrypting @skb in software.
+ *
+ * Looks up the record containing the skb's TCP sequence number and stores
+ * its record sequence number in *@rcd_sn. *@sync_size is set to the
+ * number of bytes between the start of that record and the skb's first
+ * payload byte; those bytes are added to @sg_in from the record's
+ * fragments (one page reference each, counted in *@resync_sgs), followed
+ * by the skb's own payload.
+ *
+ * Returns 0 on success, -EINVAL if no record is found, if the skb starts
+ * before the record, or if skb_to_sgvec() fails.
+ */
+
+static int fill_sg_in(struct scatterlist *sg_in,
+ struct sk_buff *skb,
+ struct tls_offload_context *ctx,
+ u64 *rcd_sn,
+ s32 *sync_size,
+ int *resync_sgs)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int payload_len = skb->len - tcp_payload_offset;
+ u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ struct tls_record_info *record;
+ unsigned long flags;
+ int remaining;
+ int i;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ record = tls_get_record(ctx, tcp_seq, rcd_sn);
+ if (!record) {
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ WARN(1, "Record not found for seq %u\n", tcp_seq);
+ return -EINVAL;
+ }
+
+ *sync_size = tcp_seq - tls_record_start_seq(record);
+ if (*sync_size < 0) {
+ int is_start_marker = tls_record_is_start_marker(record);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ /* This should only occur if the relevant record was
+ * already acked. In that case it should be ok
+ * to drop the packet and avoid retransmission.
+ *
+ * There is a corner case where the packet contains
+ * both an acked and a non-acked record.
+ * We currently don't handle that case and rely
+ * on TCP to retransmit a packet that doesn't contain
+ * already acked payload.
+ */
+ /* a negative sync_size is only reported for the start marker */
+ if (!is_start_marker)
+ *sync_size = 0;
+ return -EINVAL;
+ }
+
+ remaining = *sync_size;
+ for (i = 0; remaining > 0; i++) {
+ skb_frag_t *frag = &record->frags[i];
+
+ __skb_frag_ref(frag);
+ sg_set_page(sg_in + i, skb_frag_page(frag),
+ skb_frag_size(frag), frag->page_offset);
+
+ remaining -= skb_frag_size(frag);
+
+ /* last fragment overshoots: trim the entry to the bytes needed */
+ if (remaining < 0)
+ sg_in[i].length += remaining;
+ }
+ *resync_sgs = i;
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Lay out the three-entry output scatterlist for the fallback encryption:
+ * entry 0 receives the @sync_size bytes that precede the skb payload
+ * (written to @dummy_buf and discarded), entry 1 is the payload area of
+ * @nskb, entry 2 is scratch space after the sync bytes for the tag.
+ *
+ * @buf and @tls_ctx are not used by this function.
+ */
+static void fill_sg_out(struct scatterlist sg_out[3], void *buf,
+ struct tls_context *tls_ctx,
+ struct sk_buff *nskb,
+ int tcp_payload_offset,
+ int payload_len,
+ int sync_size,
+ void *dummy_buf)
+{
+ sg_set_buf(&sg_out[0], dummy_buf, sync_size);
+ sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len);
+ /* Add room for authentication tag produced by crypto */
+ dummy_buf += sync_size;
+ sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+}
+
+/* Produce an encrypted copy of @skb using the software fallback AEAD.
+ *
+ * A scratch buffer holds, in order: salt + IV, the AAD, @sync_size bytes
+ * of discarded output and room for one authentication tag. The new skb
+ * gets the same headroom and length as @skb; its payload is filled by
+ * tls_enc_records() and its headers by complete_skb().
+ *
+ * Returns the new skb, or NULL on allocation or encryption failure.
+ * @skb itself is not freed here.
+ */
+static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
+ struct scatterlist sg_out[3],
+ struct scatterlist *sg_in,
+ struct sk_buff *skb,
+ s32 sync_size, u64 rcd_sn)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int payload_len = skb->len - tcp_payload_offset;
+ void *buf, *iv, *aad, *dummy_buf;
+ struct aead_request *aead_req;
+ struct sk_buff *nskb = NULL;
+ int buf_len;
+
+ aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC);
+ if (!aead_req)
+ return NULL;
+
+ buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE +
+ TLS_AAD_SPACE_SIZE +
+ sync_size +
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf)
+ goto free_req;
+
+ /* carve the scratch buffer: iv | aad | dummy output (+ tag) */
+ iv = buf;
+ memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
+ TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+ aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE;
+ dummy_buf = aad + TLS_AAD_SPACE_SIZE;
+
+ nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC);
+ if (!nskb)
+ goto free_buf;
+
+ skb_reserve(nskb, skb_headroom(skb));
+
+ fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset,
+ payload_len, sync_size, dummy_buf);
+
+ if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv,
+ rcd_sn, sync_size + payload_len) < 0)
+ goto free_nskb;
+
+ complete_skb(nskb, skb, tcp_payload_offset);
+
+ /* validate_xmit_skb_list assumes that if the skb wasn't segmented
+ * nskb->prev will point to the skb itself
+ */
+ nskb->prev = nskb;
+
+ /* the success path also falls through here to free the scratch data */
+free_buf:
+ kfree(buf);
+free_req:
+ kfree(aead_req);
+ return nskb;
+free_nskb:
+ kfree_skb(nskb);
+ nskb = NULL;
+ goto free_buf;
+}
+
+/* Encrypt @skb in software when it cannot be handled by the offloading
+ * device.
+ *
+ * An skb without payload is returned unchanged. Otherwise the original
+ * skb's reference is dropped and the function returns either a newly
+ * allocated encrypted skb, the same skb again (payload lies entirely
+ * before the start marker, i.e. before TLS was enabled), or NULL when
+ * the packet has to be dropped.
+ */
+static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
+{
+ int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ int payload_len = skb->len - tcp_payload_offset;
+ struct scatterlist *sg_in, sg_out[3];
+ struct sk_buff *nskb = NULL;
+ int sg_in_max_elements;
+ int resync_sgs = 0;
+ s32 sync_size = 0;
+ u64 rcd_sn;
+
+ /* worst case is:
+ * MAX_SKB_FRAGS in tls_record_info
+ * MAX_SKB_FRAGS + 1 in SKB head and frags.
+ */
+ sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1;
+
+ if (!payload_len)
+ return skb;
+
+ sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC);
+ if (!sg_in)
+ goto free_orig;
+
+ sg_init_table(sg_in, sg_in_max_elements);
+ sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+
+ if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) {
+ /* bypass packets before kernel TLS socket option was set */
+ /* skb_get balances the kfree_skb at free_orig below */
+ if (sync_size < 0 && payload_len <= -sync_size)
+ nskb = skb_get(skb);
+ goto put_sg;
+ }
+
+ nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn);
+
+put_sg:
+ /* release the page references fill_sg_in took on record fragments */
+ while (resync_sgs)
+ put_page(sg_page(&sg_in[--resync_sgs]));
+ kfree(sg_in);
+free_orig:
+ kfree_skb(skb);
+ return nskb;
+}
+
+/* sk_validate_xmit_skb hook for offloaded sockets: a packet leaving
+ * through the device that holds the TLS context is passed on untouched;
+ * a packet leaving through any other device is encrypted in software.
+ */
+struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
+				      struct net_device *dev,
+				      struct sk_buff *skb)
+{
+	if (dev != tls_get_ctx(sk)->netdev)
+		return tls_sw_fallback(sk, skb);
+
+	return skb;
+}
+
+/* Allocate and key the "gcm(aes)" AEAD transform used by the software
+ * fallback path and store it in offload_ctx->aead_send.
+ *
+ * @crypto_info is read as a struct tls12_crypto_info_aes_gcm_128.
+ * Returns 0 on success or a negative errno from the crypto API. After an
+ * allocation failure aead_send is NULL.
+ * NOTE(review): after a setkey/setauthsize failure the transform is
+ * freed but aead_send is left pointing at it.
+ */
+int tls_sw_fallback_init(struct sock *sk,
+ struct tls_offload_context *offload_ctx,
+ struct tls_crypto_info *crypto_info)
+{
+ const u8 *key;
+ int rc;
+
+ offload_ctx->aead_send =
+ crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(offload_ctx->aead_send)) {
+ rc = PTR_ERR(offload_ctx->aead_send);
+ pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc);
+ offload_ctx->aead_send = NULL;
+ goto err_out;
+ }
+
+ key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
+
+ rc = crypto_aead_setkey(offload_ctx->aead_send, key,
+ TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+ if (rc)
+ goto free_aead;
+
+ rc = crypto_aead_setauthsize(offload_ctx->aead_send,
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+ if (rc)
+ goto free_aead;
+
+ return 0;
+free_aead:
+ crypto_free_aead(offload_ctx->aead_send);
+err_out:
+ return rc;
+}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 20cd93be6236..301f22430469 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,12 +51,12 @@ enum {
TLSV6,
TLS_NUM_PROTS,
};
-
enum {
TLS_BASE,
- TLS_SW_TX,
- TLS_SW_RX,
- TLS_SW_RXTX,
+ TLS_SW,
+#ifdef CONFIG_TLS_DEVICE
+ TLS_HW,
+#endif
TLS_HW_RECORD,
TLS_NUM_CONFIG,
};
@@ -65,14 +65,14 @@ static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_mutex);
-static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
-static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
- sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
+ sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
}
int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -254,7 +254,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
lock_sock(sk);
sk_proto_close = ctx->sk_proto_close;
- if (ctx->conf == TLS_BASE || ctx->conf == TLS_HW_RECORD) {
+ if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) ||
+ (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) {
free_ctx = true;
goto skip_tx_cleanup;
}
@@ -275,15 +276,26 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
}
- kfree(ctx->tx.rec_seq);
- kfree(ctx->tx.iv);
- kfree(ctx->rx.rec_seq);
- kfree(ctx->rx.iv);
+ /* We need these for tls_sw_fallback handling of other packets */
+ if (ctx->tx_conf == TLS_SW) {
+ kfree(ctx->tx.rec_seq);
+ kfree(ctx->tx.iv);
+ tls_sw_free_resources_tx(sk);
+ }
- if (ctx->conf == TLS_SW_TX ||
- ctx->conf == TLS_SW_RX ||
- ctx->conf == TLS_SW_RXTX) {
- tls_sw_free_resources(sk);
+ if (ctx->rx_conf == TLS_SW) {
+ kfree(ctx->rx.rec_seq);
+ kfree(ctx->rx.iv);
+ tls_sw_free_resources_rx(sk);
+ }
+
+#ifdef CONFIG_TLS_DEVICE
+ if (ctx->tx_conf != TLS_HW) {
+#else
+ {
+#endif
+ kfree(ctx);
+ ctx = NULL;
}
skip_tx_cleanup:
@@ -446,25 +458,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
goto err_crypto_info;
}
- /* currently SW is default, we will have ethtool in future */
if (tx) {
- rc = tls_set_sw_offload(sk, ctx, 1);
- if (ctx->conf == TLS_SW_RX)
- conf = TLS_SW_RXTX;
- else
- conf = TLS_SW_TX;
+#ifdef CONFIG_TLS_DEVICE
+ rc = tls_set_device_offload(sk, ctx);
+ conf = TLS_HW;
+ if (rc) {
+#else
+ {
+#endif
+ rc = tls_set_sw_offload(sk, ctx, 1);
+ conf = TLS_SW;
+ }
} else {
rc = tls_set_sw_offload(sk, ctx, 0);
- if (ctx->conf == TLS_SW_TX)
- conf = TLS_SW_RXTX;
- else
- conf = TLS_SW_RX;
+ conf = TLS_SW;
}
if (rc)
goto err_crypto_info;
- ctx->conf = conf;
+ if (tx)
+ ctx->tx_conf = conf;
+ else
+ ctx->rx_conf = conf;
update_sk_prot(sk, ctx);
if (tx) {
ctx->sk_write_space = sk->sk_write_space;
@@ -540,7 +556,8 @@ static int tls_hw_prot(struct sock *sk)
ctx->hash = sk->sk_prot->hash;
ctx->unhash = sk->sk_prot->unhash;
ctx->sk_proto_close = sk->sk_prot->close;
- ctx->conf = TLS_HW_RECORD;
+ ctx->rx_conf = TLS_HW_RECORD;
+ ctx->tx_conf = TLS_HW_RECORD;
update_sk_prot(sk, ctx);
rc = 1;
break;
@@ -584,29 +601,40 @@ static int tls_hw_hash(struct sock *sk)
return err;
}
-static void build_protos(struct proto *prot, struct proto *base)
+static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
+ struct proto *base)
{
- prot[TLS_BASE] = *base;
- prot[TLS_BASE].setsockopt = tls_setsockopt;
- prot[TLS_BASE].getsockopt = tls_getsockopt;
- prot[TLS_BASE].close = tls_sk_proto_close;
-
- prot[TLS_SW_TX] = prot[TLS_BASE];
- prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
- prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
-
- prot[TLS_SW_RX] = prot[TLS_BASE];
- prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW_RX].close = tls_sk_proto_close;
-
- prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
- prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW_RXTX].close = tls_sk_proto_close;
-
- prot[TLS_HW_RECORD] = *base;
- prot[TLS_HW_RECORD].hash = tls_hw_hash;
- prot[TLS_HW_RECORD].unhash = tls_hw_unhash;
- prot[TLS_HW_RECORD].close = tls_sk_proto_close;
+ prot[TLS_BASE][TLS_BASE] = *base;
+ prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
+ prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
+ prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
+
+ prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg;
+ prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
+
+ prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
+
+ prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
+ prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
+
+#ifdef CONFIG_TLS_DEVICE
+ prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+ prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg;
+ prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage;
+
+ prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
+ prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
+ prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
+#endif
+
+ prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].close = tls_sk_proto_close;
}
static int tls_init(struct sock *sk)
@@ -637,7 +665,7 @@ static int tls_init(struct sock *sk)
ctx->getsockopt = sk->sk_prot->getsockopt;
ctx->sk_proto_close = sk->sk_prot->close;
- /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
+ /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
if (ip_ver == TLSV6 &&
unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
mutex_lock(&tcpv6_prot_mutex);
@@ -648,7 +676,8 @@ static int tls_init(struct sock *sk)
mutex_unlock(&tcpv6_prot_mutex);
}
- ctx->conf = TLS_BASE;
+ ctx->tx_conf = TLS_BASE;
+ ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
out:
return rc;
@@ -686,6 +715,9 @@ static int __init tls_register(void)
tls_sw_proto_ops.poll = tls_sw_poll;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+#ifdef CONFIG_TLS_DEVICE
+ tls_device_init();
+#endif
tcp_register_ulp(&tcp_tls_ulp_ops);
return 0;
@@ -694,6 +726,9 @@ static int __init tls_register(void)
static void __exit tls_unregister(void)
{
tcp_unregister_ulp(&tcp_tls_ulp_ops);
+#ifdef CONFIG_TLS_DEVICE
+ tls_device_cleanup();
+#endif
}
module_init(tls_register);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index e1c93ce74e0f..8ca57d01b18f 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -52,7 +52,7 @@ static int tls_do_decryption(struct sock *sk,
gfp_t flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = strp_msg(skb);
struct aead_request *aead_req;
@@ -122,7 +122,7 @@ out:
static void trim_both_sgl(struct sock *sk, int target_size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
trim_sg(sk, ctx->sg_plaintext_data,
&ctx->sg_plaintext_num_elem,
@@ -141,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
static int alloc_encrypted_sg(struct sock *sk, int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc = 0;
rc = sk_alloc_sg(sk, len,
@@ -155,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
static int alloc_plaintext_sg(struct sock *sk, int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc = 0;
rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
@@ -181,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg,
static void tls_free_both_sg(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
&ctx->sg_encrypted_size);
@@ -191,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk)
}
static int tls_do_encryption(struct tls_context *tls_ctx,
- struct tls_sw_context *ctx, size_t data_len,
+ struct tls_sw_context_tx *ctx, size_t data_len,
gfp_t flags)
{
unsigned int req_size = sizeof(struct aead_request) +
@@ -227,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int rc;
sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
@@ -339,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
int bytes)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct scatterlist *sg = ctx->sg_plaintext_data;
int copy, i, rc = 0;
@@ -367,7 +367,7 @@ out:
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int ret = 0;
int required_size;
long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -522,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
int ret = 0;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
bool eor;
@@ -636,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
long timeo, int *err)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct sk_buff *skb;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -674,7 +674,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
struct scatterlist *sgout)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
struct scatterlist *sgin = &sgin_arr[0];
@@ -692,8 +692,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
if (!sgout) {
nsg = skb_cow_data(skb, 0, &unused) + 1;
sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
- if (!sgout)
- sgout = sgin;
+ sgout = sgin;
}
sg_init_table(sgin, nsg);
@@ -723,7 +722,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
unsigned int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = strp_msg(skb);
if (len < rxm->full_len) {
@@ -736,7 +735,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
/* Finished with message */
ctx->recv_pkt = NULL;
kfree_skb(skb);
- strp_unpause(&ctx->strp);
+ __strp_unpause(&ctx->strp);
return true;
}
@@ -749,7 +748,7 @@ int tls_sw_recvmsg(struct sock *sk,
int *addr_len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
unsigned char control;
struct strp_msg *rxm;
struct sk_buff *skb;
@@ -869,7 +868,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
size_t len, unsigned int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm = NULL;
struct sock *sk = sock->sk;
struct sk_buff *skb;
@@ -922,7 +921,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
unsigned int ret;
struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
/* Grab POLLOUT and POLLHUP from the underlying socket */
ret = ctx->sk_poll(file, sock, wait);
@@ -938,7 +937,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
char header[tls_ctx->rx.prepend_size];
struct strp_msg *rxm = strp_msg(skb);
size_t cipher_overhead;
@@ -987,7 +986,7 @@ read_failure:
static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct strp_msg *rxm;
rxm = strp_msg(skb);
@@ -1003,18 +1002,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
static void tls_data_ready(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
strp_data_ready(&ctx->strp);
}
-void tls_sw_free_resources(struct sock *sk)
+void tls_sw_free_resources_tx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
if (ctx->aead_send)
crypto_free_aead(ctx->aead_send);
+ tls_free_both_sg(sk);
+
+ kfree(ctx);
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
if (ctx->aead_recv) {
if (ctx->recv_pkt) {
kfree_skb(ctx->recv_pkt);
@@ -1030,10 +1039,7 @@ void tls_sw_free_resources(struct sock *sk)
lock_sock(sk);
}
- tls_free_both_sg(sk);
-
kfree(ctx);
- kfree(tls_ctx);
}
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
@@ -1041,7 +1047,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
struct tls_crypto_info *crypto_info;
struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
- struct tls_sw_context *sw_ctx;
+ struct tls_sw_context_tx *sw_ctx_tx = NULL;
+ struct tls_sw_context_rx *sw_ctx_rx = NULL;
struct cipher_context *cctx;
struct crypto_aead **aead;
struct strp_callbacks cb;
@@ -1054,27 +1061,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto out;
}
- if (!ctx->priv_ctx) {
- sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
- if (!sw_ctx) {
+ if (tx) {
+ sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+ if (!sw_ctx_tx) {
rc = -ENOMEM;
goto out;
}
- crypto_init_wait(&sw_ctx->async_wait);
+ crypto_init_wait(&sw_ctx_tx->async_wait);
+ ctx->priv_ctx_tx = sw_ctx_tx;
} else {
- sw_ctx = ctx->priv_ctx;
+ sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+ if (!sw_ctx_rx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ crypto_init_wait(&sw_ctx_rx->async_wait);
+ ctx->priv_ctx_rx = sw_ctx_rx;
}
- ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
-
if (tx) {
crypto_info = &ctx->crypto_send;
cctx = &ctx->tx;
- aead = &sw_ctx->aead_send;
+ aead = &sw_ctx_tx->aead_send;
} else {
crypto_info = &ctx->crypto_recv;
cctx = &ctx->rx;
- aead = &sw_ctx->aead_recv;
+ aead = &sw_ctx_rx->aead_recv;
}
switch (crypto_info->cipher_type) {
@@ -1121,22 +1133,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
- if (tx) {
- sg_init_table(sw_ctx->sg_encrypted_data,
- ARRAY_SIZE(sw_ctx->sg_encrypted_data));
- sg_init_table(sw_ctx->sg_plaintext_data,
- ARRAY_SIZE(sw_ctx->sg_plaintext_data));
-
- sg_init_table(sw_ctx->sg_aead_in, 2);
- sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
- sizeof(sw_ctx->aad_space));
- sg_unmark_end(&sw_ctx->sg_aead_in[1]);
- sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
- sg_init_table(sw_ctx->sg_aead_out, 2);
- sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
- sizeof(sw_ctx->aad_space));
- sg_unmark_end(&sw_ctx->sg_aead_out[1]);
- sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+ if (sw_ctx_tx) {
+ sg_init_table(sw_ctx_tx->sg_encrypted_data,
+ ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
+ sg_init_table(sw_ctx_tx->sg_plaintext_data,
+ ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
+
+ sg_init_table(sw_ctx_tx->sg_aead_in, 2);
+ sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
+ sizeof(sw_ctx_tx->aad_space));
+ sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
+ sg_chain(sw_ctx_tx->sg_aead_in, 2,
+ sw_ctx_tx->sg_plaintext_data);
+ sg_init_table(sw_ctx_tx->sg_aead_out, 2);
+ sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
+ sizeof(sw_ctx_tx->aad_space));
+ sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
+ sg_chain(sw_ctx_tx->sg_aead_out, 2,
+ sw_ctx_tx->sg_encrypted_data);
}
if (!*aead) {
@@ -1161,22 +1175,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (rc)
goto free_aead;
- if (!tx) {
+ if (sw_ctx_rx) {
/* Set up strparser */
memset(&cb, 0, sizeof(cb));
cb.rcv_msg = tls_queue;
cb.parse_msg = tls_read_size;
- strp_init(&sw_ctx->strp, sk, &cb);
+ strp_init(&sw_ctx_rx->strp, sk, &cb);
write_lock_bh(&sk->sk_callback_lock);
- sw_ctx->saved_data_ready = sk->sk_data_ready;
+ sw_ctx_rx->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = tls_data_ready;
write_unlock_bh(&sk->sk_callback_lock);
- sw_ctx->sk_poll = sk->sk_socket->ops->poll;
+ sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
- strp_check_rcv(&sw_ctx->strp);
+ strp_check_rcv(&sw_ctx_rx->strp);
}
goto out;
@@ -1188,11 +1202,16 @@ free_rec_seq:
kfree(cctx->rec_seq);
cctx->rec_seq = NULL;
free_iv:
- kfree(ctx->tx.iv);
- ctx->tx.iv = NULL;
+ kfree(cctx->iv);
+ cctx->iv = NULL;
free_priv:
- kfree(ctx->priv_ctx);
- ctx->priv_ctx = NULL;
+ if (tx) {
+ kfree(ctx->priv_ctx_tx);
+ ctx->priv_ctx_tx = NULL;
+ } else {
+ kfree(ctx->priv_ctx_rx);
+ ctx->priv_ctx_rx = NULL;
+ }
out:
return rc;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c0fd8a85e7f7..5fe35aafdd9c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -725,6 +725,10 @@ int wiphy_register(struct wiphy *wiphy)
(!rdev->ops->set_pmk || !rdev->ops->del_pmk)))
return -EINVAL;
+ if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
+ rdev->ops->update_connect_params))
+ return -EINVAL;
+
if (wiphy->addresses)
memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7c5135a92d76..07514ca011b2 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4,6 +4,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*/
#include <linux/if.h>
@@ -423,6 +424,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
+
+ [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
+ [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
+ [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
};
/* policy for the key attributes */
@@ -645,7 +650,43 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
}
-static int nl80211_msg_put_channel(struct sk_buff *msg,
+static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
+ const struct ieee80211_reg_rule *rule)
+{
+ int j;
+ struct nlattr *nl_wmm_rules =
+ nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM);
+
+ if (!nl_wmm_rules)
+ goto nla_put_failure;
+
+ for (j = 0; j < IEEE80211_NUM_ACS; j++) {
+ struct nlattr *nl_wmm_rule = nla_nest_start(msg, j);
+
+ if (!nl_wmm_rule)
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
+ rule->wmm_rule->client[j].cw_min) ||
+ nla_put_u16(msg, NL80211_WMMR_CW_MAX,
+ rule->wmm_rule->client[j].cw_max) ||
+ nla_put_u8(msg, NL80211_WMMR_AIFSN,
+ rule->wmm_rule->client[j].aifsn) ||
+ nla_put_u8(msg, NL80211_WMMR_TXOP,
+ rule->wmm_rule->client[j].cot))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_wmm_rule);
+ }
+ nla_nest_end(msg, nl_wmm_rules);
+
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
struct ieee80211_channel *chan,
bool large)
{
@@ -721,12 +762,55 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
DBM_TO_MBM(chan->max_power)))
goto nla_put_failure;
+ if (large) {
+ const struct ieee80211_reg_rule *rule =
+ freq_reg_info(wiphy, chan->center_freq);
+
+ if (!IS_ERR(rule) && rule->wmm_rule) {
+ if (nl80211_msg_put_wmm_rules(msg, rule))
+ goto nla_put_failure;
+ }
+ }
+
return 0;
nla_put_failure:
return -ENOBUFS;
}
+static bool nl80211_put_txq_stats(struct sk_buff *msg,
+ struct cfg80211_txq_stats *txqstats,
+ int attrtype)
+{
+ struct nlattr *txqattr;
+
+#define PUT_TXQVAL_U32(attr, memb) do { \
+ if (txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr) && \
+ nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr, txqstats->memb)) \
+ return false; \
+ } while (0)
+
+ txqattr = nla_nest_start(msg, attrtype);
+ if (!txqattr)
+ return false;
+
+ PUT_TXQVAL_U32(BACKLOG_BYTES, backlog_bytes);
+ PUT_TXQVAL_U32(BACKLOG_PACKETS, backlog_packets);
+ PUT_TXQVAL_U32(FLOWS, flows);
+ PUT_TXQVAL_U32(DROPS, drops);
+ PUT_TXQVAL_U32(ECN_MARKS, ecn_marks);
+ PUT_TXQVAL_U32(OVERLIMIT, overlimit);
+ PUT_TXQVAL_U32(OVERMEMORY, overmemory);
+ PUT_TXQVAL_U32(COLLISIONS, collisions);
+ PUT_TXQVAL_U32(TX_BYTES, tx_bytes);
+ PUT_TXQVAL_U32(TX_PACKETS, tx_packets);
+ PUT_TXQVAL_U32(MAX_FLOWS, max_flows);
+ nla_nest_end(msg, txqattr);
+
+#undef PUT_TXQVAL_U32
+ return true;
+}
+
/* netlink command implementations */
struct key_parse {
@@ -1631,7 +1715,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
chan = &sband->channels[i];
if (nl80211_msg_put_channel(
- msg, chan,
+ msg, &rdev->wiphy, chan,
state->split))
goto nla_put_failure;
@@ -1926,6 +2010,28 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->wiphy.nan_supported_bands))
goto nla_put_failure;
+ if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_TXQS)) {
+ struct cfg80211_txq_stats txqstats = {};
+ int res;
+
+ res = rdev_get_txq_stats(rdev, NULL, &txqstats);
+ if (!res &&
+ !nl80211_put_txq_stats(msg, &txqstats,
+ NL80211_ATTR_TXQ_STATS))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT,
+ rdev->wiphy.txq_limit))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT,
+ rdev->wiphy.txq_memory_limit))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM,
+ rdev->wiphy.txq_quantum))
+ goto nla_put_failure;
+ }
+
/* done */
state->split_start = 0;
break;
@@ -2303,6 +2409,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
u8 retry_short = 0, retry_long = 0;
u32 frag_threshold = 0, rts_threshold = 0;
u8 coverage_class = 0;
+ u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;
ASSERT_RTNL();
@@ -2509,10 +2616,38 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
changed |= WIPHY_PARAM_DYN_ACK;
}
+ if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+ txq_limit = nla_get_u32(
+ info->attrs[NL80211_ATTR_TXQ_LIMIT]);
+ changed |= WIPHY_PARAM_TXQ_LIMIT;
+ }
+
+ if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+ txq_memory_limit = nla_get_u32(
+ info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
+ changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
+ }
+
+ if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+ txq_quantum = nla_get_u32(
+ info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
+ changed |= WIPHY_PARAM_TXQ_QUANTUM;
+ }
+
if (changed) {
u8 old_retry_short, old_retry_long;
u32 old_frag_threshold, old_rts_threshold;
u8 old_coverage_class;
+ u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
if (!rdev->ops->set_wiphy_params)
return -EOPNOTSUPP;
@@ -2522,6 +2657,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
old_frag_threshold = rdev->wiphy.frag_threshold;
old_rts_threshold = rdev->wiphy.rts_threshold;
old_coverage_class = rdev->wiphy.coverage_class;
+ old_txq_limit = rdev->wiphy.txq_limit;
+ old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
+ old_txq_quantum = rdev->wiphy.txq_quantum;
if (changed & WIPHY_PARAM_RETRY_SHORT)
rdev->wiphy.retry_short = retry_short;
@@ -2533,6 +2671,12 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rdev->wiphy.rts_threshold = rts_threshold;
if (changed & WIPHY_PARAM_COVERAGE_CLASS)
rdev->wiphy.coverage_class = coverage_class;
+ if (changed & WIPHY_PARAM_TXQ_LIMIT)
+ rdev->wiphy.txq_limit = txq_limit;
+ if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT)
+ rdev->wiphy.txq_memory_limit = txq_memory_limit;
+ if (changed & WIPHY_PARAM_TXQ_QUANTUM)
+ rdev->wiphy.txq_quantum = txq_quantum;
result = rdev_set_wiphy_params(rdev, changed);
if (result) {
@@ -2541,6 +2685,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rdev->wiphy.frag_threshold = old_frag_threshold;
rdev->wiphy.rts_threshold = old_rts_threshold;
rdev->wiphy.coverage_class = old_coverage_class;
+ rdev->wiphy.txq_limit = old_txq_limit;
+ rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
+ rdev->wiphy.txq_quantum = old_txq_quantum;
return result;
}
}
@@ -2662,6 +2809,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
}
wdev_unlock(wdev);
+ if (rdev->ops->get_txq_stats) {
+ struct cfg80211_txq_stats txqstats = {};
+ int ret = rdev_get_txq_stats(rdev, wdev, &txqstats);
+
+ if (ret == 0 &&
+ !nl80211_put_txq_stats(msg, &txqstats,
+ NL80211_ATTR_TXQ_STATS))
+ goto nla_put_failure;
+ }
+
genlmsg_end(msg, hdr);
return 0;
@@ -4494,11 +4651,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO_U64(BEACON_RX, rx_beacon);
PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+ if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
+ PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
#undef PUT_SINFO
#undef PUT_SINFO_U64
- if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) {
+ if (sinfo->pertid) {
struct nlattr *tidsattr;
int tid;
@@ -4532,6 +4692,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);
#undef PUT_TIDVAL_U64
+ if ((tidstats->filled &
+ BIT(NL80211_TID_STATS_TXQ_STATS)) &&
+ !nl80211_put_txq_stats(msg, &tidstats->txq_stats,
+ NL80211_TID_STATS_TXQ_STATS))
+ goto nla_put_failure;
+
nla_nest_end(msg, tidattr);
}
@@ -4545,10 +4711,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
sinfo->assoc_req_ies))
goto nla_put_failure;
+ cfg80211_sinfo_release_content(sinfo);
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
+ cfg80211_sinfo_release_content(sinfo);
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
@@ -4630,8 +4798,10 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
return err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
+ if (!msg) {
+ cfg80211_sinfo_release_content(&sinfo);
return -ENOMEM;
+ }
if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
info->snd_portid, info->snd_seq, 0,
@@ -7930,7 +8100,15 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
wdev_lock(wdev);
spin_lock_bh(&rdev->bss_lock);
- cfg80211_bss_expire(rdev);
+
+ /*
+ * dump_scan will be called multiple times to break up the scan results
+ * into multiple messages. It is unlikely that any more bss-es will be
+ * expired after the first call, so only call this on the
+ * first dump_scan invocation.
+ */
+ if (start == 0)
+ cfg80211_bss_expire(rdev);
cb->seq = rdev->bss_generation;
@@ -8336,6 +8514,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
const u8 *bssid, *ssid;
int err, ssid_len = 0;
+ if (dev->ieee80211_ptr->conn_owner_nlportid &&
+ dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+ return -EPERM;
+
if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
return -EINVAL;
@@ -8458,6 +8640,10 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
u16 reason_code;
bool local_state_change;
+ if (dev->ieee80211_ptr->conn_owner_nlportid &&
+ dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+ return -EPERM;
+
if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
return -EINVAL;
@@ -8505,6 +8691,10 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
u16 reason_code;
bool local_state_change;
+ if (dev->ieee80211_ptr->conn_owner_nlportid &&
+ dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+ return -EPERM;
+
if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
return -EINVAL;
@@ -9251,6 +9441,8 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct wireless_dev *wdev = dev->ieee80211_ptr;
+ bool fils_sk_offload;
+ u32 auth_type;
u32 changed = 0;
int ret;
@@ -9265,6 +9457,56 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
changed |= UPDATE_ASSOC_IES;
}
+ fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);
+
+ /*
+ * When the driver supports fils-sk offload, all attributes must be
+ * provided. So the else covers "fils-sk-not-all" and
+ * "no-fils-sk-any".
+ */
+ if (fils_sk_offload &&
+ info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
+ info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
+ info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
+ info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+ connect.fils_erp_username =
+ nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+ connect.fils_erp_username_len =
+ nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+ connect.fils_erp_realm =
+ nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+ connect.fils_erp_realm_len =
+ nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+ connect.fils_erp_next_seq_num =
+ nla_get_u16(
+ info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
+ connect.fils_erp_rrk =
+ nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+ connect.fils_erp_rrk_len =
+ nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+ changed |= UPDATE_FILS_ERP_INFO;
+ } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
+ info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
+ info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
+ info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
+ auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+ if (!nl80211_valid_auth_type(rdev, auth_type,
+ NL80211_CMD_CONNECT))
+ return -EINVAL;
+
+ if (auth_type == NL80211_AUTHTYPE_FILS_SK &&
+ fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO))
+ return -EINVAL;
+
+ connect.auth_type = auth_type;
+ changed |= UPDATE_AUTH_TYPE;
+ }
+
wdev_lock(dev->ieee80211_ptr);
if (!wdev->current_bss)
ret = -ENOLINK;
@@ -9282,6 +9524,10 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
u16 reason;
int ret;
+ if (dev->ieee80211_ptr->conn_owner_nlportid &&
+ dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+ return -EPERM;
+
if (!info->attrs[NL80211_ATTR_REASON_CODE])
reason = WLAN_REASON_DEAUTH_LEAVING;
else
@@ -14028,8 +14274,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
void *hdr;
msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len +
- cr->fils_kek_len + cr->pmk_len +
- (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp);
+ cr->fils.kek_len + cr->fils.pmk_len +
+ (cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
if (!msg)
return;
@@ -14055,17 +14301,17 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
(cr->resp_ie &&
nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len,
cr->resp_ie)) ||
- (cr->update_erp_next_seq_num &&
+ (cr->fils.update_erp_next_seq_num &&
nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
- cr->fils_erp_next_seq_num)) ||
+ cr->fils.erp_next_seq_num)) ||
(cr->status == WLAN_STATUS_SUCCESS &&
- ((cr->fils_kek &&
- nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len,
- cr->fils_kek)) ||
- (cr->pmk &&
- nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) ||
- (cr->pmkid &&
- nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid)))))
+ ((cr->fils.kek &&
+ nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils.kek_len,
+ cr->fils.kek)) ||
+ (cr->fils.pmk &&
+ nla_put(msg, NL80211_ATTR_PMK, cr->fils.pmk_len, cr->fils.pmk)) ||
+ (cr->fils.pmkid &&
+ nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid)))))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -14086,7 +14332,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
void *hdr;
const u8 *bssid = info->bss ? info->bss->bssid : info->bssid;
- msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp);
+ msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
+ info->fils.kek_len + info->fils.pmk_len +
+ (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
if (!msg)
return;
@@ -14104,7 +14352,17 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
info->req_ie)) ||
(info->resp_ie &&
nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
- info->resp_ie)))
+ info->resp_ie)) ||
+ (info->fils.update_erp_next_seq_num &&
+ nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
+ info->fils.erp_next_seq_num)) ||
+ (info->fils.kek &&
+ nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len,
+ info->fils.kek)) ||
+ (info->fils.pmk &&
+ nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) ||
+ (info->fils.pmkid &&
+ nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -14321,7 +14579,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE);
if (!nl_freq)
goto nla_put_failure;
- if (nl80211_msg_put_channel(msg, channel_before, false))
+
+ if (nl80211_msg_put_channel(msg, wiphy, channel_before, false))
goto nla_put_failure;
nla_nest_end(msg, nl_freq);
@@ -14329,7 +14588,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER);
if (!nl_freq)
goto nla_put_failure;
- if (nl80211_msg_put_channel(msg, channel_after, false))
+
+ if (nl80211_msg_put_channel(msg, wiphy, channel_after, false))
goto nla_put_failure;
nla_nest_end(msg, nl_freq);
@@ -14456,8 +14716,10 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
trace_cfg80211_del_sta(dev, mac_addr);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
- if (!msg)
+ if (!msg) {
+ cfg80211_sinfo_release_content(sinfo);
return;
+ }
if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
rdev, dev, mac_addr, sinfo) < 0) {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 87479a53411b..364f5d67f05b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -586,6 +586,18 @@ rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
return ret;
}
+/*
+ * Call the driver's get_txq_stats operation for @wdev and store the
+ * result in @txqstats. The rdev tracepoints are emitted right before and
+ * right after the driver call. Returns the driver's return value.
+ */
+static inline int
+rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
+		   struct wireless_dev *wdev,
+		   struct cfg80211_txq_stats *txqstats)
+{
+	struct wiphy *wiphy = &rdev->wiphy;
+	int rc;
+
+	trace_rdev_get_txq_stats(wiphy, wdev);
+	rc = rdev->ops->get_txq_stats(wiphy, wdev, txqstats);
+	trace_rdev_return_int(wiphy, rc);
+
+	return rc;
+}
+
static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
trace_rdev_rfkill_poll(&rdev->wiphy);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 5fcec5c94eb7..bbe6298e4bb9 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1656,7 +1656,7 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
case NL80211_REGDOM_SET_BY_DRIVER:
return "driver";
case NL80211_REGDOM_SET_BY_COUNTRY_IE:
- return "country IE";
+ return "country element";
default:
WARN_ON(1);
return "bug";
@@ -2622,7 +2622,7 @@ reg_process_hint_country_ie(struct wiphy *wiphy,
* This doesn't happen yet, not sure we
* ever want to support it for this case.
*/
- WARN_ONCE(1, "Unexpected intersection for country IEs");
+ WARN_ONCE(1, "Unexpected intersection for country elements");
return REG_REQ_IGNORE;
}
@@ -2772,6 +2772,21 @@ out_free:
reg_free_request(reg_request);
}
+/*
+ * Pass @request to the regulatory notifier of every registered wiphy that
+ * has REGULATORY_WIPHY_SELF_MANAGED set. Only requests initiated by the
+ * user with the cell-base hint type are forwarded; anything else is a
+ * no-op.
+ */
+static void notify_self_managed_wiphys(struct regulatory_request *request)
+{
+	struct cfg80211_registered_device *rdev;
+
+	/* The request itself does not depend on the wiphy: test it once. */
+	if (request->initiator != NL80211_REGDOM_SET_BY_USER ||
+	    request->user_reg_hint_type != NL80211_USER_REG_HINT_CELL_BASE)
+		return;
+
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		struct wiphy *wiphy = &rdev->wiphy;
+
+		if (!(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED))
+			continue;
+
+		reg_call_notifier(wiphy, request);
+	}
+}
+
static bool reg_only_self_managed_wiphys(void)
{
struct cfg80211_registered_device *rdev;
@@ -2823,6 +2838,7 @@ static void reg_process_pending_hints(void)
spin_unlock(&reg_requests_lock);
+ notify_self_managed_wiphys(reg_request);
if (reg_only_self_managed_wiphys()) {
reg_free_request(reg_request);
return;
@@ -3387,7 +3403,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
case NL80211_DFS_JP:
return true;
default:
- pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region);
+ pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region);
return false;
}
}
@@ -3702,17 +3718,26 @@ EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl);
void wiphy_regulatory_register(struct wiphy *wiphy)
{
- struct regulatory_request *lr;
+ struct regulatory_request *lr = get_last_request();
- /* self-managed devices ignore external hints */
- if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ /* self-managed devices ignore beacon hints and country IE */
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS |
REGULATORY_COUNTRY_IE_IGNORE;
+ /*
+ * The last request may have been received before this
+ * registration call. Call the driver notifier if
+ * initiator is USER and user type is CELL_BASE.
+ */
+ if (lr->initiator == NL80211_REGDOM_SET_BY_USER &&
+ lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE)
+ reg_call_notifier(wiphy, lr);
+ }
+
if (!reg_dev_ignore_cell_hint(wiphy))
reg_num_devs_support_basehint++;
- lr = get_last_request();
wiphy_update_regulatory(wiphy, lr->initiator);
wiphy_all_share_dfs_chan_state(wiphy);
}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5df6b33db786..d536b07582f8 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -803,8 +803,8 @@ void cfg80211_connect_done(struct net_device *dev,
ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) +
params->req_ie_len + params->resp_ie_len +
- params->fils_kek_len + params->pmk_len +
- (params->pmkid ? WLAN_PMKID_LEN : 0), gfp);
+ params->fils.kek_len + params->fils.pmk_len +
+ (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
if (!ev) {
cfg80211_put_bss(wdev->wiphy, params->bss);
return;
@@ -831,27 +831,29 @@ void cfg80211_connect_done(struct net_device *dev,
params->resp_ie_len);
next += params->resp_ie_len;
}
- if (params->fils_kek_len) {
- ev->cr.fils_kek = next;
- ev->cr.fils_kek_len = params->fils_kek_len;
- memcpy((void *)ev->cr.fils_kek, params->fils_kek,
- params->fils_kek_len);
- next += params->fils_kek_len;
+ if (params->fils.kek_len) {
+ ev->cr.fils.kek = next;
+ ev->cr.fils.kek_len = params->fils.kek_len;
+ memcpy((void *)ev->cr.fils.kek, params->fils.kek,
+ params->fils.kek_len);
+ next += params->fils.kek_len;
}
- if (params->pmk_len) {
- ev->cr.pmk = next;
- ev->cr.pmk_len = params->pmk_len;
- memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len);
- next += params->pmk_len;
+ if (params->fils.pmk_len) {
+ ev->cr.fils.pmk = next;
+ ev->cr.fils.pmk_len = params->fils.pmk_len;
+ memcpy((void *)ev->cr.fils.pmk, params->fils.pmk,
+ params->fils.pmk_len);
+ next += params->fils.pmk_len;
}
- if (params->pmkid) {
- ev->cr.pmkid = next;
- memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN);
+ if (params->fils.pmkid) {
+ ev->cr.fils.pmkid = next;
+ memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid,
+ WLAN_PMKID_LEN);
next += WLAN_PMKID_LEN;
}
- ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num;
- if (params->update_erp_next_seq_num)
- ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num;
+ ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num;
+ if (params->fils.update_erp_next_seq_num)
+ ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num;
if (params->bss)
cfg80211_hold_bss(bss_from_pub(params->bss));
ev->cr.bss = params->bss;
@@ -930,6 +932,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_event *ev;
unsigned long flags;
+ u8 *next;
if (!info->bss) {
info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
@@ -942,19 +945,52 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
if (WARN_ON(!info->bss))
return;
- ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp);
+ ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len +
+ info->fils.kek_len + info->fils.pmk_len +
+ (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
if (!ev) {
cfg80211_put_bss(wdev->wiphy, info->bss);
return;
}
ev->type = EVENT_ROAMED;
- ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev);
- ev->rm.req_ie_len = info->req_ie_len;
- memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
- ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + info->req_ie_len;
- ev->rm.resp_ie_len = info->resp_ie_len;
- memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len);
+ next = ((u8 *)ev) + sizeof(*ev);
+ if (info->req_ie_len) {
+ ev->rm.req_ie = next;
+ ev->rm.req_ie_len = info->req_ie_len;
+ memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
+ next += info->req_ie_len;
+ }
+ if (info->resp_ie_len) {
+ ev->rm.resp_ie = next;
+ ev->rm.resp_ie_len = info->resp_ie_len;
+ memcpy((void *)ev->rm.resp_ie, info->resp_ie,
+ info->resp_ie_len);
+ next += info->resp_ie_len;
+ }
+ if (info->fils.kek_len) {
+ ev->rm.fils.kek = next;
+ ev->rm.fils.kek_len = info->fils.kek_len;
+ memcpy((void *)ev->rm.fils.kek, info->fils.kek,
+ info->fils.kek_len);
+ next += info->fils.kek_len;
+ }
+ if (info->fils.pmk_len) {
+ ev->rm.fils.pmk = next;
+ ev->rm.fils.pmk_len = info->fils.pmk_len;
+ memcpy((void *)ev->rm.fils.pmk, info->fils.pmk,
+ info->fils.pmk_len);
+ next += info->fils.pmk_len;
+ }
+ if (info->fils.pmkid) {
+ ev->rm.fils.pmkid = next;
+ memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid,
+ WLAN_PMKID_LEN);
+ next += WLAN_PMKID_LEN;
+ }
+ ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num;
+ if (info->fils.update_erp_next_seq_num)
+ ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num;
ev->rm.bss = info->bss;
spin_lock_irqsave(&wdev->event_lock, flags);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 55fb279a5196..2b417a2fe63f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3243,6 +3243,20 @@ TRACE_EVENT(rdev_set_multicast_to_unicast,
WIPHY_PR_ARG, NETDEV_PR_ARG,
BOOL_TO_STR(__entry->enabled))
);
+
+/*
+ * Tracepoint emitted by the rdev_get_txq_stats() wrapper before it calls
+ * into the driver. Records the wiphy and the wireless device only.
+ */
+TRACE_EVENT(rdev_get_txq_stats,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d112e9a89364..b5bb1c309914 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1787,6 +1787,17 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
return false;
}
+/*
+ * Allocate the zero-initialised per-TID statistics array of @sinfo.
+ *
+ * The array holds IEEE80211_NUM_TIDS + 1 entries. Returns 0 on success or
+ * -ENOMEM if the allocation failed, in which case sinfo->pertid is NULL.
+ */
+int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
+{
+	/* kcalloc() takes (count, element size, flags), in that order. */
+	sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
+				sizeof(*(sinfo->pertid)), gfp);
+	if (!sinfo->pertid)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats);
+
/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
const unsigned char rfc1042_header[] __aligned(2) =
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
+config XDP_SOCKETS
+ bool "XDP sockets"
+ depends on BPF_SYSCALL
+ default n
+ help
+ XDP sockets allow a channel between XDP programs and
+ userspace applications.
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..04f073146256
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..7eb4948a38d2
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+
+#include "xdp_umem.h"
+#include "xsk_queue.h"
+
+#define XDP_UMEM_MIN_CHUNK_SIZE 2048
+
+/*
+ * Link socket @xs into the list of sockets attached to @umem. The list is
+ * modified under umem->xsk_list_lock with interrupts saved/disabled, using
+ * the RCU list primitive.
+ */
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long irq_state;
+
+	spin_lock_irqsave(&umem->xsk_list_lock, irq_state);
+	list_add_rcu(&xs->list, &umem->xsk_list);
+	spin_unlock_irqrestore(&umem->xsk_list_lock, irq_state);
+}
+
+/*
+ * Unlink socket @xs from @umem's socket list. Nothing is done for a socket
+ * that has no device (xs->dev is NULL). For a zero-copy umem the function
+ * additionally waits with synchronize_net() after the removal.
+ */
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long irq_state;
+
+	if (!xs->dev)
+		return;
+
+	spin_lock_irqsave(&umem->xsk_list_lock, irq_state);
+	list_del_rcu(&xs->list);
+	spin_unlock_irqrestore(&umem->xsk_list_lock, irq_state);
+
+	if (umem->zc)
+		synchronize_net();
+}
+
+/*
+ * Try to attach @umem to queue @queue_id of @dev in zero-copy mode.
+ *
+ * @flags may carry XDP_COPY or XDP_ZEROCOPY (both together is -EINVAL):
+ *  - XDP_COPY: zero-copy is not attempted, 0 is returned right away.
+ *  - XDP_ZEROCOPY: failure to set up zero-copy is returned as an error.
+ *  - neither: failure to set up zero-copy silently falls back (returns 0).
+ *
+ * On zero-copy success a reference on @dev is kept in umem->dev and
+ * umem->zc is set; on every other path the reference taken here is
+ * dropped again.
+ */
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags)
+{
+	bool force_zc, force_copy;
+	struct netdev_bpf bpf;
+	int err;
+
+	force_zc = flags & XDP_ZEROCOPY;
+	force_copy = flags & XDP_COPY;
+
+	if (force_zc && force_copy)
+		return -EINVAL;
+
+	if (force_copy)
+		return 0;
+
+	dev_hold(dev);
+
+	if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
+		/*
+		 * Hand the driver a fully initialised query: the request
+		 * lives on the stack, so leaving xsk.queue_id / xsk.umem
+		 * unset would expose the driver to stack garbage.
+		 */
+		bpf.command = XDP_QUERY_XSK_UMEM;
+		bpf.xsk.umem = NULL;
+		bpf.xsk.queue_id = queue_id;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? -ENOTSUPP : 0;
+		}
+
+		/* The query may have written to bpf.xsk; set it up afresh. */
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = umem;
+		bpf.xsk.queue_id = queue_id;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? err : 0; /* fail or fallback */
+		}
+
+		umem->dev = dev;
+		umem->queue_id = queue_id;
+		umem->zc = true;
+		return 0;
+	}
+
+	dev_put(dev);
+	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+}
+
+/*
+ * Undo xdp_umem_assign_dev(): ask the driver to drop the umem from the
+ * queue it was installed on and release the device reference. A no-op if
+ * the umem was never bound to a device. A driver failure only triggers a
+ * warning; the reference is dropped regardless.
+ */
+static void xdp_umem_clear_dev(struct xdp_umem *umem)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	if (!umem->dev)
+		return;
+
+	/* A NULL umem in a setup request means "remove". */
+	bpf.command = XDP_SETUP_XSK_UMEM;
+	bpf.xsk.umem = NULL;
+	bpf.xsk.queue_id = umem->queue_id;
+
+	rtnl_lock();
+	err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
+	rtnl_unlock();
+
+	WARN(err, "failed to disable umem!\n");
+
+	dev_put(umem->dev);
+	umem->dev = NULL;
+}
+
+/*
+ * Release the first umem->npgs pages recorded in umem->pgs: each page is
+ * marked dirty and its reference dropped. The page-pointer array itself is
+ * freed afterwards and umem->pgs reset to NULL.
+ */
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+	struct page **pgs = umem->pgs;
+	unsigned int idx;
+
+	for (idx = 0; idx < umem->npgs; idx++) {
+		set_page_dirty_lock(pgs[idx]);
+		put_page(pgs[idx]);
+	}
+
+	kfree(pgs);
+	umem->pgs = NULL;
+}
+
+/*
+ * Give back the locked_vm charge taken by xdp_umem_account_pages() and drop
+ * the reference on the charged user.
+ *
+ * xdp_umem_account_pages() returns success without charging anyone (and
+ * leaves umem->user NULL) for callers with CAP_IPC_LOCK, so there may be
+ * nothing to undo.
+ */
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+	if (!umem->user)
+		return;
+
+	atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+	free_uid(umem->user);
+	/* Make a repeated call harmless. */
+	umem->user = NULL;
+}
+
+/*
+ * Final teardown of a umem once its last user is gone: detach it from the
+ * device, destroy the fill and completion queues, unpin the user pages,
+ * drop the pid reference, free the per-page table, return the locked_vm
+ * charge and free the umem itself.
+ *
+ * None of these steps needs the creating task or its mm, so the teardown
+ * is unconditional. Making it depend on the task still being alive would
+ * leak umem->pages and the locked_vm charge whenever the creator has
+ * already exited.
+ */
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+	xdp_umem_clear_dev(umem);
+
+	if (umem->fq) {
+		xskq_destroy(umem->fq);
+		umem->fq = NULL;
+	}
+
+	if (umem->cq) {
+		xskq_destroy(umem->cq);
+		umem->cq = NULL;
+	}
+
+	xdp_umem_unpin_pages(umem);
+
+	/* Reference taken with get_task_pid() when the umem was registered. */
+	put_pid(umem->pid);
+
+	kfree(umem->pages);
+	umem->pages = NULL;
+
+	/* No user was charged if the creator had CAP_IPC_LOCK. */
+	if (umem->user)
+		xdp_umem_unaccount_pages(umem);
+
+	kfree(umem);
+}
+
+/* Work item callback: release the umem that embeds @work. */
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+	xdp_umem_release(container_of(work, struct xdp_umem, work));
+}
+
+/* Take an additional reference on @umem (umem->users). */
+void xdp_get_umem(struct xdp_umem *umem)
+{
+ refcount_inc(&umem->users);
+}
+
+/*
+ * Drop one reference on @umem; a NULL @umem is tolerated. When the last
+ * reference goes away the actual release is not done here but handed to a
+ * work item (xdp_umem_release_deferred).
+ */
+void xdp_put_umem(struct xdp_umem *umem)
+{
+	if (!umem || !refcount_dec_and_test(&umem->users))
+		return;
+
+	INIT_WORK(&umem->work, xdp_umem_release_deferred);
+	schedule_work(&umem->work);
+}
+
+/*
+ * Pin the umem->npgs user pages starting at umem->address and record them
+ * in a freshly allocated umem->pgs array.
+ *
+ * Returns 0 on success. On failure nothing stays pinned, umem->pgs is NULL
+ * and umem->npgs is left untouched, so that the caller can still undo the
+ * accounting for the full number of pages it charged. Errors: -ENOMEM if
+ * the array cannot be allocated or only part of the range could be pinned,
+ * otherwise the negative value returned by get_user_pages().
+ */
+static int xdp_umem_pin_pages(struct xdp_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs, i;
+	int err;
+
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	down_write(&current->mm->mmap_sem);
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	up_write(&current->mm->mmap_sem);
+
+	if (npgs == umem->npgs)
+		return 0;
+
+	if (npgs >= 0) {
+		/*
+		 * Partial pin: release exactly the pages we got. Do not
+		 * shrink umem->npgs to the partial count, as the caller
+		 * un-accounts umem->npgs pages on this error.
+		 */
+		for (i = 0; i < npgs; i++) {
+			set_page_dirty_lock(umem->pgs[i]);
+			put_page(umem->pgs[i]);
+		}
+		err = -ENOMEM;
+	} else {
+		err = npgs;
+	}
+
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+	return err;
+}
+
+/*
+ * Charge umem->npgs pages against the current user's locked_vm counter,
+ * honouring RLIMIT_MEMLOCK.
+ *
+ * Callers with CAP_IPC_LOCK are not charged at all: 0 is returned and
+ * umem->user is not set here. Otherwise a reference on the current user
+ * is stored in umem->user. Returns -ENOBUFS (and clears umem->user) if the
+ * charge would exceed the limit.
+ */
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+	unsigned long limit, charged, wanted;
+
+	if (capable(CAP_IPC_LOCK))
+		return 0;
+
+	limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	umem->user = get_uid(current_user());
+
+	/* Lock-free update: retry until the counter did not change under us. */
+	for (;;) {
+		charged = atomic_long_read(&umem->user->locked_vm);
+		wanted = charged + umem->npgs;
+		if (wanted > limit) {
+			free_uid(umem->user);
+			umem->user = NULL;
+			return -ENOBUFS;
+		}
+
+		if (atomic_long_cmpxchg(&umem->user->locked_vm, charged,
+					wanted) == charged)
+			return 0;
+	}
+}
+
+/*
+ * Validate the registration request @mr and initialise @umem from it:
+ * account and pin the user pages and build the per-page address table.
+ *
+ * Requirements checked here: chunk_size is a power of two between
+ * XDP_UMEM_MIN_CHUNK_SIZE and PAGE_SIZE, addr is page aligned, addr + len
+ * does not wrap, the area holds a whole number of pages worth of chunks,
+ * and the (64-byte aligned) headroom plus XDP_PACKET_HEADROOM fits into a
+ * chunk.
+ *
+ * Returns 0 on success. On failure every resource taken here (pid
+ * reference, locked_vm charge, pinned pages) is released again and a
+ * negative errno is returned; @umem itself is not freed.
+ */
+static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	unsigned int chunks, chunks_per_page;
+	u64 addr = mr->addr, size = mr->len;
+	unsigned int i;
+	int size_chk, err;
+
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(chunk_size))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return -EINVAL;
+	}
+
+	/* Reject an area that wraps around the address space. */
+	if ((addr + size) < addr)
+		return -EINVAL;
+
+	chunks = (unsigned int)div_u64(size, chunk_size);
+	if (chunks == 0)
+		return -EINVAL;
+
+	/* Only whole pages worth of chunks are accepted. */
+	chunks_per_page = PAGE_SIZE / chunk_size;
+	if (chunks < chunks_per_page || chunks % chunks_per_page)
+		return -EINVAL;
+
+	headroom = ALIGN(headroom, 64);
+
+	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
+	if (size_chk < 0)
+		return -EINVAL;
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->address = (unsigned long)addr;
+	umem->props.chunk_mask = ~((u64)chunk_size - 1);
+	umem->props.size = size;
+	umem->headroom = headroom;
+	umem->chunk_size_nohr = chunk_size - headroom;
+	umem->npgs = size / PAGE_SIZE;
+	umem->pgs = NULL;
+	umem->user = NULL;
+	INIT_LIST_HEAD(&umem->xsk_list);
+	spin_lock_init(&umem->xsk_list_lock);
+
+	refcount_set(&umem->users, 1);
+
+	err = xdp_umem_account_pages(umem);
+	if (err)
+		goto out;
+
+	err = xdp_umem_pin_pages(umem);
+	if (err)
+		goto out_account;
+
+	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+	if (!umem->pages) {
+		err = -ENOMEM;
+		/* The pages are pinned by now and must be released too. */
+		goto out_pin;
+	}
+
+	for (i = 0; i < umem->npgs; i++)
+		umem->pages[i].addr = page_address(umem->pgs[i]);
+
+	return 0;
+
+out_pin:
+	xdp_umem_unpin_pages(umem);
+out_account:
+	xdp_umem_unaccount_pages(umem);
+out:
+	put_pid(umem->pid);
+	return err;
+}
+
+/*
+ * Allocate a zeroed umem and register the user memory described by @mr.
+ * Returns the new umem, or an ERR_PTR() carrying -ENOMEM or the error from
+ * xdp_umem_reg(); the allocation is freed again on failure.
+ */
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
+{
+	struct xdp_umem *umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	int err;
+
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	err = xdp_umem_reg(umem, mr);
+	if (!err)
+		return umem;
+
+	kfree(umem);
+	return ERR_PTR(err);
+}
+
+/* True when @umem has both its fill queue (fq) and completion queue (cq). */
+bool xdp_umem_validate_queues(struct xdp_umem *umem)
+{
+ return umem->fq && umem->cq;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..f11560334f88
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <net/xdp_sock.h>
+
+/*
+ * Translate umem-relative address @addr into a kernel pointer: the page
+ * number selects the entry in umem->pages, the low bits are the offset
+ * inside that page. No bounds check is done here.
+ */
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
+{
+	u64 page_idx = addr >> PAGE_SHIFT;
+	u64 page_off = addr & (PAGE_SIZE - 1);
+
+	return umem->pages[page_idx].addr + page_off;
+}
+
+/*
+ * Translate umem-relative address @addr into a DMA address: the page
+ * number selects the entry in umem->pages, the low bits are the offset
+ * inside that page. No bounds check is done here.
+ */
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	u64 page_idx = addr >> PAGE_SHIFT;
+	u64 page_off = addr & (PAGE_SIZE - 1);
+
+	return umem->pages[page_idx].dma + page_off;
+}
+
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+ u32 queue_id, u16 flags);
+bool xdp_umem_validate_queues(struct xdp_umem *umem);
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem);
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..40eab10dfc49
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_PROPS_H_
+#define XDP_UMEM_PROPS_H_
+
+/* Properties of a umem that are handed to the queues (xskq_set_umem()). */
+struct xdp_umem_props {
+ /* Set to ~(chunk_size - 1) at registration: masks an address down to its chunk start. */
+ u64 chunk_mask;
+ /* Length in bytes of the registered memory area. */
+ u64 size;
+};
+
+#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..c6ed2454f7ce
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,788 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allow a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ * Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <net/xdp_sock.h>
+#include <net/xdp.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+
+#define TX_BATCH_SIZE 16
+
+/*
+ * Reinterpret a generic socket as an AF_XDP socket by a plain pointer cast.
+ * NOTE(review): presumably relies on struct sock being the first member of
+ * struct xdp_sock - confirm against the structure definition.
+ */
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+ return (struct xdp_sock *)sk;
+}
+
+/*
+ * True when @xs has an Rx ring, a umem, and that umem has a fill queue.
+ * Each pointer is sampled with READ_ONCE(); the && chain guarantees that
+ * xs->umem is only dereferenced after it was seen non-NULL.
+ */
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+ return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
+ READ_ONCE(xs->umem->fq);
+}
+
+/*
+ * Exported wrapper around xskq_peek_addr() on the umem's fill queue;
+ * returns whatever that helper returns for @addr.
+ */
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+ return xskq_peek_addr(umem->fq, addr);
+}
+EXPORT_SYMBOL(xsk_umem_peek_addr);
+
+/* Exported wrapper around xskq_discard_addr() on the umem's fill queue. */
+void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+ xskq_discard_addr(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_discard_addr);
+
+/*
+ * Copy-mode receive: copy @len bytes of the XDP frame into a umem chunk
+ * taken from the fill queue and post a descriptor for it on the Rx ring.
+ *
+ * Returns 0 on success, in which case the fill-queue entry is consumed and
+ * @xdp is returned to its owner. Returns -ENOSPC when no fill-queue
+ * address is available or the frame does not fit a chunk, or the error
+ * from xskq_produce_batch_desc(). Every failure bumps xs->rx_dropped.
+ */
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	struct xdp_umem *umem = xs->umem;
+	u64 addr;
+	int err;
+
+	if (!xskq_peek_addr(umem->fq, &addr) ||
+	    len > umem->chunk_size_nohr) {
+		err = -ENOSPC;
+		goto drop;
+	}
+
+	/* The payload starts after the configured headroom. */
+	addr += umem->headroom;
+	memcpy(xdp_umem_get_data(umem, addr), xdp->data, len);
+
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
+	if (err)
+		goto drop;
+
+	xskq_discard_addr(umem->fq);
+	xdp_return_buff(xdp);
+	return 0;
+
+drop:
+	xs->rx_dropped++;
+	return err;
+}
+
+/*
+ * Zero-copy receive: the frame already lives in the umem, so only a
+ * descriptor for xdp->handle is posted on the Rx ring. If that fails the
+ * buffer is returned, xs->rx_dropped is bumped and the error is passed on.
+ */
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+	int err;
+
+	err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
+	if (!err)
+		return 0;
+
+	xdp_return_buff(xdp);
+	xs->rx_dropped++;
+	return err;
+}
+
+/*
+ * Receive an XDP frame on socket @xs. The frame must come from the device
+ * and queue the socket is bound to, otherwise -EINVAL is returned. The
+ * zero-copy or the copying path is chosen from the memory type of the Rx
+ * queue the frame arrived on.
+ */
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	len = xdp->data_end - xdp->data;
+
+	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return __xsk_rcv_zc(xs, xdp, len);
+
+	return __xsk_rcv(xs, xdp, len);
+}
+
+/*
+ * Flush the descriptors batched on the Rx ring, then invoke the socket's
+ * sk_data_ready callback.
+ */
+void xsk_flush(struct xdp_sock *xs)
+{
+ xskq_produce_flush_desc(xs->rx);
+ xs->sk.sk_data_ready(&xs->sk);
+}
+
+/*
+ * Receive path that copies the frame into a umem chunk, posts the Rx
+ * descriptor and flushes the ring right away. Unlike __xsk_rcv() it does
+ * not return @xdp to its owner.
+ *
+ * Returns 0 on success, -ENOSPC when no fill-queue address is available or
+ * the frame does not fit a chunk, or the error from
+ * xskq_produce_batch_desc(). Every failure bumps xs->rx_dropped.
+ */
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	struct xdp_umem *umem = xs->umem;
+	u32 len = xdp->data_end - xdp->data;
+	u64 addr;
+	int err;
+
+	if (!xskq_peek_addr(umem->fq, &addr) ||
+	    len > umem->chunk_size_nohr) {
+		err = -ENOSPC;
+		goto drop;
+	}
+
+	/* The payload starts after the configured headroom. */
+	addr += umem->headroom;
+	memcpy(xdp_umem_get_data(umem, addr), xdp->data, len);
+
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
+	if (err)
+		goto drop;
+
+	xskq_discard_addr(umem->fq);
+	xsk_flush(xs);
+	return 0;
+
+drop:
+	xs->rx_dropped++;
+	return err;
+}
+
+/*
+ * Exported helper: flush @nb_entries addresses on the umem's completion
+ * queue via xskq_produce_flush_addr_n().
+ */
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+ xskq_produce_flush_addr_n(umem->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_umem_complete_tx);
+
+/*
+ * Exported helper: call the sk_write_space callback of every socket on the
+ * umem's socket list. The list is walked under rcu_read_lock().
+ */
+void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->sk.sk_write_space(&xs->sk);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+
+/*
+ * Exported helper: fetch one Tx descriptor from the sockets sharing @umem.
+ *
+ * Walks the umem's socket list under rcu_read_lock() and takes the first
+ * descriptor found on any socket's Tx ring. On success the DMA address and
+ * length are stored in @dma / @len, the descriptor is discarded from the
+ * Tx ring and true is returned. Returns false when no socket has a
+ * descriptor, or when xskq_produce_addr_lazy() on the completion queue
+ * fails.
+ */
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+ struct xdp_desc desc;
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ if (!xskq_peek_desc(xs->tx, &desc))
+ continue;
+
+ /* A failure here ends the search; the descriptor stays on the Tx ring. */
+ if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+ goto out;
+
+ *dma = xdp_umem_get_dma(umem, desc.addr);
+ *len = desc.len;
+
+ xskq_discard_desc(xs->tx);
+ rcu_read_unlock();
+ return true;
+ }
+
+out:
+ rcu_read_unlock();
+ return false;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx);
+
+/*
+ * Zero-copy transmit: call the bound device's ndo_xsk_async_xmit for the
+ * socket's queue and return the driver's result.
+ */
+static int xsk_zc_xmit(struct sock *sk)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net_device *dev = xs->dev;
+
+ return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+}
+
+/*
+ * skb destructor installed by xsk_generic_xmit(): post the umem address
+ * stashed in destructor_arg on the completion queue (warning once if that
+ * fails), then hand the skb to sock_wfree().
+ */
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+ u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
+ struct xdp_sock *xs = xdp_sk(skb->sk);
+
+ WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+
+ sock_wfree(skb);
+}
+
+/*
+ * Copy-mode transmit: for each descriptor on the Tx ring, up to
+ * TX_BATCH_SIZE per call, copy the frame out of the umem into a freshly
+ * allocated skb and send it with dev_direct_xmit() on the socket's queue.
+ * Runs under xs->mutex.
+ *
+ * Returns 0 when the ring was drained, or:
+ *  -ENOBUFS   the socket has no Tx ring
+ *  -EAGAIN    batch limit hit, no completion-queue slot could be reserved,
+ *             skb allocation failed, or the device dropped/was busy
+ *  -EMSGSIZE  descriptor length exceeds the device MTU
+ *  -ENXIO     the socket's queue id is not a real Tx queue of the device
+ *  or the error from skb_store_bits().
+ * If at least one frame went out, sk_write_space is called before
+ * returning. @m and @total_len are not used.
+ */
+static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
+ size_t total_len)
+{
+ u32 max_batch = TX_BATCH_SIZE;
+ struct xdp_sock *xs = xdp_sk(sk);
+ bool sent_frame = false;
+ struct xdp_desc desc;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (unlikely(!xs->tx))
+ return -ENOBUFS;
+
+ mutex_lock(&xs->mutex);
+
+ while (xskq_peek_desc(xs->tx, &desc)) {
+ char *buffer;
+ u64 addr;
+ u32 len;
+
+ if (max_batch-- == 0) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /* Reserve the completion slot up front; the skb destructor fills it later. */
+ if (xskq_reserve_addr(xs->umem->cq)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ len = desc.len;
+ if (unlikely(len > xs->dev->mtu)) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ if (xs->queue_id >= xs->dev->real_num_tx_queues) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ /* NOTE(review): the error stored by sock_alloc_send_skb() is overwritten with -EAGAIN below. */
+ skb = sock_alloc_send_skb(sk, len, 1, &err);
+ if (unlikely(!skb)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ skb_put(skb, len);
+ addr = desc.addr;
+ buffer = xdp_umem_get_data(xs->umem, addr);
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ skb->dev = xs->dev;
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ /* Remember the umem address so xsk_destruct_skb() can complete it. */
+ skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb->destructor = xsk_destruct_skb;
+
+ err = dev_direct_xmit(skb, xs->queue_id);
+ /* Ignore NET_XMIT_CN as packet might have been sent */
+ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+ err = -EAGAIN;
+ /* SKB consumed by dev_direct_xmit() */
+ goto out;
+ }
+
+ /* The descriptor leaves the Tx ring only after a successful send. */
+ sent_frame = true;
+ xskq_discard_desc(xs->tx);
+ }
+
+out:
+ if (sent_frame)
+ sk->sk_write_space(sk);
+
+ mutex_unlock(&xs->mutex);
+ return err;
+}
+
+/*
+ * sendmsg() entry point. Kicks transmission of whatever is queued on the
+ * Tx ring, through the zero-copy or the copying path depending on xs->zc.
+ *
+ * Returns -ENXIO if the socket has no device, -ENETDOWN if the device is
+ * not up, and -EOPNOTSUPP for blocking calls (MSG_DONTWAIT not set);
+ * otherwise the result of the selected transmit path.
+ */
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (unlikely(!xs->dev))
+		return -ENXIO;
+	if (unlikely(!(xs->dev->flags & IFF_UP)))
+		return -ENETDOWN;
+	/* Only non-blocking sends are accepted. */
+	if (!(m->msg_flags & MSG_DONTWAIT))
+		return -EOPNOTSUPP;
+
+	if (xs->zc)
+		return xsk_zc_xmit(sk);
+
+	return xsk_generic_xmit(sk, m, total_len);
+}
+
+/*
+ * Poll mask for an AF_XDP socket: starts from datagram_poll_mask() and
+ * adds readability when the Rx ring exists and is not empty, and
+ * writability when the Tx ring exists and is not full.
+ */
+static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
+{
+ __poll_t mask = datagram_poll_mask(sock, events);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (xs->rx && !xskq_empty_desc(xs->rx))
+ mask |= POLLIN | POLLRDNORM;
+ if (xs->tx && !xskq_full_desc(xs->tx))
+ mask |= POLLOUT | POLLWRNORM;
+
+ return mask;
+}
+
+/*
+ * Create a queue with @entries slots and publish it through @queue.
+ *
+ * Returns -EINVAL if @entries is zero or not a power of two, or if *@queue
+ * is already set (a queue can only be created once); -ENOMEM if
+ * xskq_create() fails; 0 on success. @umem_queue is passed through to
+ * xskq_create().
+ */
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+ bool umem_queue)
+{
+ struct xsk_queue *q;
+
+ if (entries == 0 || *queue || !is_power_of_2(entries))
+ return -EINVAL;
+
+ q = xskq_create(entries, umem_queue);
+ if (!q)
+ return -ENOMEM;
+
+ /* Make sure queue is ready before it can be seen by others */
+ smp_wmb();
+ *queue = q;
+ return 0;
+}
+
+/*
+ * release() handler of the socket. Drops the protocol in-use count, waits
+ * with synchronize_net() and releases the device reference if the socket
+ * was bound, then orphans the sock and drops the socket's reference on it.
+ * Always returns 0; a socket without a sock is ignored.
+ */
+static int xsk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ /* xdp_sk() is only a cast, so this is harmless even when sk is NULL. */
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+
+ local_bh_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ local_bh_enable();
+
+ if (xs->dev) {
+ /* Wait for driver to stop using the xdp socket. */
+ synchronize_net();
+ dev_put(xs->dev);
+ xs->dev = NULL;
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ sk_refcnt_debug_release(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+/*
+ * Resolve file descriptor @fd to an AF_XDP socket.
+ *
+ * Returns the socket with the reference taken by sockfd_lookup() still
+ * held (the caller releases it with sockfd_put()), ERR_PTR(-ENOTSOCK) if
+ * the lookup fails, or ERR_PTR(-ENOPROTOOPT) if the socket is not of
+ * family PF_XDP.
+ */
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+	struct socket *sock;
+	int err;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return ERR_PTR(-ENOTSOCK);
+
+	if (sock->sk->sk_family == PF_XDP)
+		return sock;
+
+	sockfd_put(sock);
+	return ERR_PTR(-ENOPROTOOPT);
+}
+
+/*
+ * bind() handler: attach the socket to queue sxdp_queue_id of the device
+ * with index sxdp_ifindex.
+ *
+ * The socket needs at least one of the Rx/Tx rings, and the queue id must
+ * be valid for every ring it has. With XDP_SHARED_UMEM the umem is
+ * inherited from the socket behind sxdp_shared_umem_fd, which must be
+ * bound to the same device and queue; otherwise the socket must own a umem
+ * with both fill and completion queues, which is then assigned to the
+ * device.
+ *
+ * Returns 0 on success, in which case the device reference is kept in
+ * xs->dev. Errors: -EINVAL (bad address or family, no rings, bad queue id,
+ * conflicting flags, missing or incomplete umem, shared umem on another
+ * device/queue), -EBUSY (already bound), -ENODEV (no such device), -EBADF
+ * (shared socket has no umem), or the error from the fd lookup or from
+ * xdp_umem_assign_dev().
+ */
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net_device *dev;
+	u32 flags, qid;
+	int err = 0;
+
+	if (addr_len < sizeof(struct sockaddr_xdp))
+		return -EINVAL;
+	if (sxdp->sxdp_family != AF_XDP)
+		return -EINVAL;
+
+	mutex_lock(&xs->mutex);
+	if (xs->dev) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+	if (!dev) {
+		err = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (!xs->rx && !xs->tx) {
+		err = -EINVAL;
+		goto out_put_dev;
+	}
+
+	qid = sxdp->sxdp_queue_id;
+
+	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
+	    (xs->tx && qid >= dev->real_num_tx_queues)) {
+		err = -EINVAL;
+		goto out_put_dev;
+	}
+
+	flags = sxdp->sxdp_flags;
+
+	if (flags & XDP_SHARED_UMEM) {
+		struct socket *umem_sock;
+		struct xdp_sock *umem_xs;
+
+		if (flags & (XDP_COPY | XDP_ZEROCOPY)) {
+			/* Cannot specify flags for shared sockets. */
+			err = -EINVAL;
+			goto out_put_dev;
+		}
+
+		if (xs->umem) {
+			/* We have already our own. */
+			err = -EINVAL;
+			goto out_put_dev;
+		}
+
+		umem_sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+		if (IS_ERR(umem_sock)) {
+			err = PTR_ERR(umem_sock);
+			goto out_put_dev;
+		}
+
+		umem_xs = xdp_sk(umem_sock->sk);
+		if (!umem_xs->umem) {
+			/* No umem to inherit. */
+			err = -EBADF;
+		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+			err = -EINVAL;
+		} else {
+			xdp_get_umem(umem_xs->umem);
+			xs->umem = umem_xs->umem;
+		}
+
+		/* The looked-up socket is not needed past this point. */
+		sockfd_put(umem_sock);
+		if (err)
+			goto out_put_dev;
+	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+		err = -EINVAL;
+		goto out_put_dev;
+	} else {
+		/* This xsk has its own umem. */
+		xskq_set_umem(xs->umem->fq, &xs->umem->props);
+		xskq_set_umem(xs->umem->cq, &xs->umem->props);
+
+		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
+		if (err)
+			goto out_put_dev;
+	}
+
+	xs->dev = dev;
+	xs->zc = xs->umem->zc;
+	xs->queue_id = qid;
+	xskq_set_umem(xs->rx, &xs->umem->props);
+	xskq_set_umem(xs->tx, &xs->umem->props);
+	xdp_add_sk_umem(xs->umem, xs);
+
+out_put_dev:
+	/* On success the device reference now belongs to xs->dev. */
+	if (err)
+		dev_put(dev);
+out_unlock:
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
+/*
+ * ->setsockopt() handler for level SOL_XDP.
+ *
+ * XDP_RX_RING / XDP_TX_RING: create the socket's Rx or Tx ring; the option
+ * value is the number of entries (int).
+ * XDP_UMEM_REG: create a umem from a struct xdp_umem_reg; -EBUSY if the
+ * socket already has one.
+ * XDP_UMEM_FILL_RING / XDP_UMEM_COMPLETION_RING: create the umem's fill or
+ * completion ring; requires a registered umem (-EINVAL otherwise).
+ *
+ * Every option rejects an @optlen shorter than the value it reads with
+ * -EINVAL, so no bytes beyond the caller's buffer are copied. Other levels
+ * and unknown options return -ENOPROTOOPT.
+ */
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_RX_RING:
+	case XDP_TX_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
+		err = xsk_init_queue(entries, q, false);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
+	case XDP_UMEM_REG:
+	{
+		struct xdp_umem_reg mr;
+		struct xdp_umem *umem;
+
+		/* Do not read past the buffer the caller described. */
+		if (optlen < sizeof(mr))
+			return -EINVAL;
+		if (copy_from_user(&mr, optval, sizeof(mr)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		if (xs->umem) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
+
+		umem = xdp_umem_create(&mr);
+		if (IS_ERR(umem)) {
+			mutex_unlock(&xs->mutex);
+			return PTR_ERR(umem);
+		}
+
+		/* Make sure umem is ready before it can be seen by others */
+		smp_wmb();
+		xs->umem = umem;
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
+	case XDP_UMEM_FILL_RING:
+	case XDP_UMEM_COMPLETION_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		/* Same length check as for the Rx/Tx rings. */
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		if (!xs->umem) {
+			mutex_unlock(&xs->mutex);
+			return -EINVAL;
+		}
+
+		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
+			&xs->umem->cq;
+		err = xsk_init_queue(entries, q, true);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
+	default:
+		break;
+	}
+
+	return -ENOPROTOOPT;
+}
+
+/*
+ * ->getsockopt() handler for level SOL_XDP.
+ *
+ * XDP_STATISTICS copies a struct xdp_statistics (Rx drops and the invalid
+ * descriptor counts of the Rx and Tx rings) to @optval.
+ * XDP_MMAP_OFFSETS copies a struct xdp_mmap_offsets holding the offsets of
+ * the producer index, consumer index and descriptor array inside each of
+ * the four rings.
+ *
+ * The caller's buffer length must be non-negative and at least the size of
+ * the structure (-EINVAL otherwise); on success *@optlen is set to the
+ * number of bytes written. Other levels return -ENOPROTOOPT and unknown
+ * options -EOPNOTSUPP.
+ */
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	int len;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	if (optname == XDP_STATISTICS) {
+		struct xdp_statistics stats;
+
+		if (len < sizeof(stats))
+			return -EINVAL;
+
+		/* Snapshot the counters under the socket mutex. */
+		mutex_lock(&xs->mutex);
+		stats.rx_dropped = xs->rx_dropped;
+		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+		mutex_unlock(&xs->mutex);
+
+		if (copy_to_user(optval, &stats, sizeof(stats)))
+			return -EFAULT;
+		if (put_user(sizeof(stats), optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	if (optname == XDP_MMAP_OFFSETS) {
+		struct xdp_mmap_offsets off;
+
+		if (len < sizeof(off))
+			return -EINVAL;
+
+		/* Rx and Tx rings share the xdp_rxtx_ring layout. */
+		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
+		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
+
+		/* Fill and completion rings share the xdp_umem_ring layout. */
+		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.fr.desc = offsetof(struct xdp_umem_ring, desc);
+		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.cr.desc = offsetof(struct xdp_umem_ring, desc);
+
+		len = sizeof(off);
+		if (copy_to_user(optval, &off, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * ->mmap() handler: map one of the socket's rings into user space.
+ *
+ * The ring is selected by the mmap offset: XDP_PGOFF_RX_RING,
+ * XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING or
+ * XDP_UMEM_PGOFF_COMPLETION_RING. Returns -EINVAL when the offset matches
+ * no ring, the selected ring (or the umem) has not been created yet, or the
+ * requested length exceeds the pages backing the ring; otherwise the result
+ * of remap_pfn_range().
+ *
+ * The ring and umem pointers are read without the socket mutex, so each
+ * READ_ONCE() is followed by smp_rmb() to pair with the smp_wmb() issued
+ * before the pointer was published.
+ */
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	/*
+	 * Computed in 64 bits so the shift cannot wrap where unsigned long is
+	 * 32 bits wide; NOTE(review): assumes the umem ring offsets exceed
+	 * 32 bits - confirm against the uapi header.
+	 */
+	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct xsk_queue *q = NULL;
+	struct xdp_umem *umem;
+	unsigned long pfn;
+	struct page *qpg;
+
+	if (offset == XDP_PGOFF_RX_RING) {
+		q = READ_ONCE(xs->rx);
+	} else if (offset == XDP_PGOFF_TX_RING) {
+		q = READ_ONCE(xs->tx);
+	} else {
+		umem = READ_ONCE(xs->umem);
+		if (!umem)
+			return -EINVAL;
+
+		/* Matches the smp_wmb() in XDP_UMEM_REG */
+		smp_rmb();
+		if (offset == XDP_UMEM_PGOFF_FILL_RING)
+			q = READ_ONCE(umem->fq);
+		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+			q = READ_ONCE(umem->cq);
+	}
+
+	if (!q)
+		return -EINVAL;
+
+	/* Matches the smp_wmb() in xsk_init_queue */
+	smp_rmb();
+	qpg = virt_to_head_page(q->ring);
+	if (size > (PAGE_SIZE << compound_order(qpg)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn,
+			       size, vma->vm_page_prot);
+}
+
+/*
+ * Protocol descriptor for AF_XDP sockets: registered by xsk_init() and passed
+ * to sk_alloc() in xsk_create(). obj_size is the size of struct xdp_sock.
+ */
+static struct proto xsk_proto = {
+ .name = "XDP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct xdp_sock),
+};
+
+/*
+ * Socket operations for PF_XDP sockets; installed as sock->ops by
+ * xsk_create(). Operations without an xsk_*() handler are wired to the
+ * sock_no_*() helpers.
+ */
+static const struct proto_ops xsk_proto_ops = {
+ .family = PF_XDP,
+ .owner = THIS_MODULE,
+ .release = xsk_release,
+ .bind = xsk_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll_mask = xsk_poll_mask,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = xsk_setsockopt,
+ .getsockopt = xsk_getsockopt,
+ .sendmsg = xsk_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = xsk_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+/*
+ * sk_destruct callback installed by xsk_create().
+ *
+ * Does nothing unless the sock is flagged SOCK_DEAD. Otherwise destroys the
+ * Rx and Tx rings, removes the socket from its umem and drops the umem
+ * reference.
+ */
+static void xsk_destruct(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		xskq_destroy(xs->rx);
+		xskq_destroy(xs->tx);
+		xdp_del_sk_umem(xs->umem, xs);
+		xdp_put_umem(xs->umem);
+
+		sk_refcnt_debug_dec(sk);
+	}
+}
+
+/*
+ * ->create() hook of the PF_XDP family: set up a new AF_XDP socket.
+ *
+ * The caller needs CAP_NET_RAW in the user namespace of @net (-EPERM), the
+ * socket type must be SOCK_RAW (-ESOCKTNOSUPPORT) and @protocol must be 0
+ * (-EPROTONOSUPPORT). Returns -ENOBUFS when the sock cannot be allocated
+ * and 0 on success.
+ */
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct xdp_sock *xs;
+	struct sock *sk;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+	if (protocol != 0)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (sk == NULL)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+	sk->sk_destruct = xsk_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	xs = xdp_sk(sk);
+	mutex_init(&xs->mutex);
+
+	/* Account the new socket in the per-protocol in-use counter. */
+	local_bh_disable();
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+	local_bh_enable();
+
+	return 0;
+}
+
+/*
+ * Address family descriptor handed to sock_register() by xsk_init(); sockets
+ * of family PF_XDP are created through xsk_create().
+ */
+static const struct net_proto_family xsk_family_ops = {
+ .family = PF_XDP,
+ .create = xsk_create,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Boot-time initialisation: register the XDP protocol and then the PF_XDP
+ * socket family. If the family cannot be registered the protocol is
+ * unregistered again. Returns 0 or the error of the failing registration.
+ */
+static int __init xsk_init(void)
+{
+	int err;
+
+	err = proto_register(&xsk_proto, 0 /* no slab */);
+	if (err)
+		return err;
+
+	err = sock_register(&xsk_family_ops);
+	if (err)
+		proto_unregister(&xsk_proto);
+
+	return err;
+}
+
+fs_initcall(xsk_init);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..6c32e92e98fc
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+/*
+ * Copy *@umem_props into @q so the queue can validate entries without
+ * dereferencing the umem. A NULL @q is ignored.
+ */
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+{
+	if (q != NULL)
+		q->umem_props = *umem_props;
+}
+
+/*
+ * Size in bytes of a fill/completion ring with q->nentries slots: the
+ * xdp_umem_ring header followed by one u64 per slot. Returns 0 if the size
+ * does not fit in size_t (0 is never a valid size as the header is not
+ * empty).
+ */
+static size_t xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	const size_t hdr = sizeof(struct xdp_umem_ring);
+
+	if (q->nentries > ((size_t)-1 - hdr) / sizeof(u64))
+		return 0;
+
+	return hdr + q->nentries * sizeof(u64);
+}
+
+/*
+ * Size in bytes of an Rx/Tx ring with q->nentries slots: the xdp_rxtx_ring
+ * header (the layout reported to user space through
+ * offsetof(struct xdp_rxtx_ring, desc)) followed by one struct xdp_desc per
+ * slot. Returns 0 if the size does not fit in size_t.
+ */
+static size_t xskq_rxtx_get_ring_size(struct xsk_queue *q)
+{
+	const size_t hdr = sizeof(struct xdp_rxtx_ring);
+
+	if (q->nentries > ((size_t)-1 - hdr) / sizeof(struct xdp_desc))
+		return 0;
+
+	return hdr + q->nentries * sizeof(struct xdp_desc);
+}
+
+/*
+ * Allocate a queue with @nentries slots and its zeroed ring memory.
+ *
+ * @nentries is expected to be a power of two (ring_mask is nentries - 1);
+ * xsk_init_queue() enforces this. @umem_queue selects the fill/completion
+ * ring layout instead of the Rx/Tx one. The ring is allocated as a compound
+ * page allocation so that it can be mapped to user space as one unit.
+ *
+ * Returns the new queue, or NULL if memory cannot be allocated or the ring
+ * size cannot be represented. The size is kept in size_t end to end: it
+ * used to be truncated to 32 bits, which could yield a ring smaller than
+ * nentries slots.
+ */
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	size = umem_queue ? xskq_umem_get_ring_size(q) :
+	       xskq_rxtx_get_ring_size(q);
+	if (!size) {
+		/* Requested ring is too large to describe. */
+		kfree(q);
+		return NULL;
+	}
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP | __GFP_NORETRY;
+
+	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+						      get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+/* Free the ring memory and the queue itself; a NULL @q is ignored. */
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (q != NULL) {
+		page_frag_free(q->ring);
+		kfree(q);
+	}
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..ef6a6f0ec949
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+#include <net/xdp_sock.h>
+
+/* Most entries xskq_peek_addr()/xskq_peek_desc() claim in one batch */
+#define RX_BATCH_SIZE 16
+/*
+ * Free-slot count the address producers pass to xskq_nb_free(): with fewer
+ * free slots than this, the cached consumer index is re-read from the ring.
+ */
+#define LAZY_UPDATE_THRESHOLD 128
+
+/*
+ * Producer and consumer indices at the start of every ring; embedded as
+ * "ptrs" in the ring layouts. The queue code stores free-running u32
+ * counters here and masks them with ring_mask when indexing a slot.
+ */
+struct xdp_ring {
+ u32 producer ____cacheline_aligned_in_smp;
+ u32 consumer ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	/* Descriptor slots; their number is fixed when the ring is allocated */
+	struct xdp_desc desc[] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	/* Address slots; their number is fixed when the ring is allocated */
+	u64 desc[] ____cacheline_aligned_in_smp;
+};
+
+/* Kernel-side state of one ring (Rx, Tx, fill or completion). */
+struct xsk_queue {
+ struct xdp_umem_props umem_props; /* copy made by xskq_set_umem(), used for validation */
+ u32 ring_mask; /* nentries - 1, turns an index into a slot number */
+ u32 nentries; /* number of slots in the ring */
+ u32 prod_head; /* next slot to fill; not yet published */
+ u32 prod_tail; /* producer index last published to / read from the ring */
+ u32 cons_head; /* end of the batch currently claimed for consumption */
+ u32 cons_tail; /* next entry to consume; last consumer index published / read */
+ struct xdp_ring *ring; /* ring memory allocated by xskq_create() */
+ u64 invalid_descs; /* entries rejected by the validation helpers */
+};
+
+/* Common functions operating on both RXTX and umem queues */
+
+/* Number of entries rejected so far on @q; 0 when @q is NULL. */
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+	if (!q)
+		return 0;
+
+	return q->invalid_descs;
+}
+
+/*
+ * Consumer side: number of entries ready to be consumed, capped at @dcnt.
+ * The producer index is only re-read from the ring when the cached copy
+ * shows nothing pending.
+ */
+static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
+{
+	u32 pending = q->prod_tail - q->cons_tail;
+
+	if (!pending) {
+		/* Refresh the local pointer */
+		q->prod_tail = READ_ONCE(q->ring->producer);
+		pending = q->prod_tail - q->cons_tail;
+	}
+
+	if (pending > dcnt)
+		return dcnt;
+
+	return pending;
+}
+
+/*
+ * Producer side: free slots for a producer at index @producer, judged by
+ * the cached consumer index only (the ring is not re-read).
+ */
+static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
+{
+	u32 in_use = producer - q->cons_tail;
+
+	return q->nentries - in_use;
+}
+
+/*
+ * Producer side: free slots for a producer at index @producer. If the
+ * cached consumer index shows fewer than @dcnt free slots, the consumer
+ * index is re-read from the ring and the count recomputed.
+ */
+static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
+{
+	u32 room = xskq_nb_free_lazy(q, producer);
+
+	if (room < dcnt) {
+		/* Refresh the local tail pointer */
+		q->cons_tail = READ_ONCE(q->ring->consumer);
+		room = q->nentries - (producer - q->cons_tail);
+	}
+
+	return room;
+}
+
+/* UMEM queue */
+
+/*
+ * True when @addr lies below the umem size recorded in @q; otherwise the
+ * entry is counted in invalid_descs and false is returned.
+ */
+static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
+{
+	if (addr < q->umem_props.size)
+		return true;
+
+	q->invalid_descs++;
+	return false;
+}
+
+/*
+ * Walk the claimed batch until a valid address is found. Each entry is read
+ * from the ring, masked with chunk_mask and stored in *@addr; invalid
+ * entries are skipped (and counted). Returns @addr with cons_tail still
+ * pointing at the valid entry, or NULL when the batch is exhausted.
+ */
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	for (; q->cons_tail != q->cons_head; q->cons_tail++) {
+		u64 raw = READ_ONCE(ring->desc[q->cons_tail & q->ring_mask]);
+
+		*addr = raw & q->umem_props.chunk_mask;
+		if (xskq_is_valid_addr(q, *addr))
+			return addr;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the next valid address without consuming it (see
+ * xskq_discard_addr()). When the current batch is used up, the consumer
+ * index is published to the ring and a new batch of at most RX_BATCH_SIZE
+ * entries is claimed first.
+ */
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
+{
+	if (q->cons_tail != q->cons_head)
+		return xskq_validate_addr(q, addr);
+
+	WRITE_ONCE(q->ring->consumer, q->cons_tail);
+	q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+	/* Order consumer and data */
+	smp_rmb();
+
+	return xskq_validate_addr(q, addr);
+}
+
+/* Consume the entry last returned by xskq_peek_addr(). */
+static inline void xskq_discard_addr(struct xsk_queue *q)
+{
+	q->cons_tail += 1;
+}
+
+/*
+ * Write @addr into the next slot and publish it immediately by updating the
+ * ring's producer index. Returns -ENOSPC when the ring is full, else 0.
+ */
+static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+	u32 slot;
+
+	if (!xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD))
+		return -ENOSPC;
+
+	slot = q->prod_tail & q->ring_mask;
+	q->prod_tail++;
+	ring->desc[slot] = addr;
+
+	/* Order producer and data */
+	smp_wmb();
+
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+	return 0;
+}
+
+/*
+ * Write @addr into the slot at prod_head and advance prod_head, without
+ * publishing; xskq_produce_flush_addr_n() makes the entries visible.
+ * Returns -ENOSPC when the ring is full, else 0.
+ */
+static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+	u32 slot;
+
+	if (!xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD))
+		return -ENOSPC;
+
+	slot = q->prod_head & q->ring_mask;
+	q->prod_head++;
+	ring->desc[slot] = addr;
+
+	return 0;
+}
+
+/*
+ * Publish @nb_entries previously written entries: advance prod_tail by that
+ * amount and store it as the ring's producer index.
+ */
+static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
+					     u32 nb_entries)
+{
+	u32 new_tail;
+
+	/* Order producer and data */
+	smp_wmb();
+
+	new_tail = q->prod_tail + nb_entries;
+	q->prod_tail = new_tail;
+	WRITE_ONCE(q->ring->producer, new_tail);
+}
+
+/*
+ * Reserve one slot by advancing prod_head without writing an entry.
+ * Returns -ENOSPC when no slot is free, else 0.
+ */
+static inline int xskq_reserve_addr(struct xsk_queue *q)
+{
+	if (!xskq_nb_free(q, q->prod_head, 1))
+		return -ENOSPC;
+
+	q->prod_head += 1;
+	return 0;
+}
+
+/* Rx/Tx queue */
+
+/*
+ * A descriptor is valid when its address passes xskq_is_valid_addr() and
+ * addr + len yields the same chunk_mask bits as addr. Each failed check
+ * bumps invalid_descs.
+ */
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+{
+	u64 start_chunk, end_chunk;
+
+	if (!xskq_is_valid_addr(q, d->addr))
+		return false;
+
+	end_chunk = (d->addr + d->len) & q->umem_props.chunk_mask;
+	start_chunk = d->addr & q->umem_props.chunk_mask;
+	if (end_chunk == start_chunk)
+		return true;
+
+	q->invalid_descs++;
+	return false;
+}
+
+/*
+ * Walk the claimed batch until a valid descriptor is found. Each entry is
+ * copied from the ring into *@desc; invalid ones are skipped (and counted).
+ * Returns @desc with cons_tail still pointing at the valid entry, or NULL
+ * when the batch is exhausted.
+ */
+static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
+						  struct xdp_desc *desc)
+{
+	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+
+	for (; q->cons_tail != q->cons_head; q->cons_tail++) {
+		unsigned int slot = q->cons_tail & q->ring_mask;
+
+		*desc = READ_ONCE(ring->desc[slot]);
+		if (xskq_is_valid_desc(q, desc))
+			return desc;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the next valid descriptor without consuming it (see
+ * xskq_discard_desc()). When the current batch is used up, the consumer
+ * index is published to the ring and a new batch of at most RX_BATCH_SIZE
+ * entries is claimed first.
+ */
+static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
+					      struct xdp_desc *desc)
+{
+	if (q->cons_tail != q->cons_head)
+		return xskq_validate_desc(q, desc);
+
+	WRITE_ONCE(q->ring->consumer, q->cons_tail);
+	q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+	/* Order consumer and data */
+	smp_rmb();
+
+	return xskq_validate_desc(q, desc);
+}
+
+/* Consume the descriptor last returned by xskq_peek_desc(). */
+static inline void xskq_discard_desc(struct xsk_queue *q)
+{
+	q->cons_tail += 1;
+}
+
+/*
+ * Fill the slot at prod_head with (@addr, @len) and advance prod_head,
+ * without publishing; xskq_produce_flush_desc() makes the entries visible.
+ * Returns -ENOSPC when no slot is free, else 0.
+ */
+static inline int xskq_produce_batch_desc(struct xsk_queue *q,
+					  u64 addr, u32 len)
+{
+	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+	struct xdp_desc *slot;
+
+	if (!xskq_nb_free(q, q->prod_head, 1))
+		return -ENOSPC;
+
+	slot = &ring->desc[q->prod_head & q->ring_mask];
+	q->prod_head++;
+	slot->addr = addr;
+	slot->len = len;
+
+	return 0;
+}
+
+/*
+ * Publish every descriptor written since the last flush: prod_tail catches
+ * up with prod_head and is stored as the ring's producer index. The write
+ * barrier makes the descriptor contents visible before the new index.
+ */
+static inline void xskq_produce_flush_desc(struct xsk_queue *q)
+{
+	/* Order producer and data */
+	smp_wmb();
+
+	q->prod_tail = q->prod_head;
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
+/* True when as many entries are waiting to be consumed as the ring has slots. */
+static inline bool xskq_full_desc(struct xsk_queue *q)
+{
+	u32 pending = xskq_nb_avail(q, q->nentries);
+
+	return pending == q->nentries;
+}
+
+/* True when every slot is free as seen from the published producer index. */
+static inline bool xskq_empty_desc(struct xsk_queue *q)
+{
+	u32 room = xskq_nb_free(q, q->prod_tail, 1);
+
+	return room == q->nentries;
+}
+
+/* Implemented in xsk_queue.c */
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
+void xskq_destroy(struct xsk_queue *q);
+
+#endif /* _LINUX_XSK_QUEUE_H */
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 6c177ae7a6d9..8308281f3253 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -42,6 +42,7 @@ static void xfrm_state_gc_task(struct work_struct *work);
static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+static struct kmem_cache *xfrm_state_cache __ro_after_init;
static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
static HLIST_HEAD(xfrm_state_gc_list);
@@ -451,7 +452,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
}
xfrm_dev_state_free(x);
security_xfrm_state_free(x);
- kfree(x);
+ kmem_cache_free(xfrm_state_cache, x);
}
static void xfrm_state_gc_task(struct work_struct *work)
@@ -563,7 +564,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
{
struct xfrm_state *x;
- x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
+ x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
if (x) {
write_pnet(&x->xs_net, net);
@@ -2313,6 +2314,10 @@ int __net_init xfrm_state_init(struct net *net)
{
unsigned int sz;
+ if (net_eq(net, &init_net))
+ xfrm_state_cache = KMEM_CACHE(xfrm_state,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
INIT_LIST_HEAD(&net->xfrm.state_all);
sz = sizeof(struct hlist_head) * 8;