diff options
Diffstat (limited to 'net')
116 files changed, 3528 insertions, 2907 deletions
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 69244e4598f5..1dd70f048e7b 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -674,7 +674,7 @@ static void batadv_mcast_mla_update(struct work_struct *work) */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { - if (ip_mc_check_igmp(skb, NULL) < 0) + if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { @@ -741,7 +741,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { - if (ipv6_mc_check_mld(skb, NULL) < 0) + if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fa2644d276ef..2c5172b33209 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -240,3 +240,85 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct bpf_flow_keys flow_keys; + u64 time_start, time_spent = 0; + struct bpf_skb_data_end *cb; + u32 retval, duration; + struct sk_buff *skb; + struct sock *sk; + void *data; + int ret; + u32 i; + + if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) + return -EINVAL; + + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + sk = kzalloc(sizeof(*sk), GFP_USER); + if (!sk) { + kfree(data); + return -ENOMEM; + } + sock_net_set(sk, current->nsproxy->net_ns); + sock_init_data(NULL, sk); + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + kfree(sk); + return -ENOMEM; + } + skb->sk = sk; + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + __skb_put(skb, size); + skb->protocol = eth_type_trans(skb, + current->nsproxy->net_ns->loopback_dev); + skb_reset_network_header(skb); + + cb = (struct bpf_skb_data_end *)skb->cb; + cb->qdisc_cb.flow_keys = &flow_keys; + + if (!repeat) + repeat = 1; + + time_start = ktime_get_ns(); + for (i = 0; i < repeat; i++) { + preempt_disable(); + rcu_read_lock(); + retval = __skb_flow_bpf_dissect(prog, skb, + &flow_keys_dissector, + &flow_keys); + rcu_read_unlock(); + preempt_enable(); + + if (need_resched()) { + if (signal_pending(current)) + break; + time_spent += ktime_get_ns() - time_start; + cond_resched(); + time_start = ktime_get_ns(); + } + } + time_spent += ktime_get_ns() - time_start; + do_div(time_spent, repeat); + duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + + ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), + retval, duration); + + kfree_skb(skb); + kfree(sk); + return ret; +} diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 9e14767500ea..00573cc46c98 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -915,7 +915,8 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, u16 nlh_flags) + const unsigned char *addr, u16 vid, u16 nlh_flags, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3aeff0895669..780757b7a82f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -14,6 +14,7 @@ #include <linux/export.h> #include <linux/if_ether.h> #include <linux/igmp.h> +#include <linux/in.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/log2.h> @@ -29,6 +30,7 @@ #include <net/ip.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) +#include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/ip6_checksum.h> @@ -938,7 +940,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, for (i = 0; i < num; i++) { len += sizeof(*grec); - if (!pskb_may_pull(skb, len)) + if (!ip_mc_may_pull(skb, len)) return -EINVAL; grec = (void *)(skb->data + len - sizeof(*grec)); @@ -946,7 +948,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, type = grec->grec_type; len += ntohs(grec->grec_nsrcs) * 4; - if (!pskb_may_pull(skb, len)) + if (!ip_mc_may_pull(skb, len)) return -EINVAL; /* We treat this as an IGMPv2 report for now. */ @@ -985,15 +987,17 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + unsigned int nsrcs_offset; const unsigned char *src; struct icmp6hdr *icmp6h; struct mld2_grec *grec; + unsigned int grec_len; int i; int len; int num; int err = 0; - if (!pskb_may_pull(skb, sizeof(*icmp6h))) + if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h))) return -EINVAL; icmp6h = icmp6_hdr(skb); @@ -1003,21 +1007,25 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; - nsrcs = skb_header_pointer(skb, - len + offsetof(struct mld2_grec, - grec_nsrcs), + nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); + + if (skb_transport_offset(skb) + ipv6_transport_len(skb) < + nsrcs_offset + sizeof(_nsrcs)) + return -EINVAL; + + nsrcs = skb_header_pointer(skb, nsrcs_offset, sizeof(_nsrcs), &_nsrcs); if (!nsrcs) return -EINVAL; - if (!pskb_may_pull(skb, - len + sizeof(*grec) + - sizeof(struct in6_addr) * ntohs(*nsrcs))) + grec_len = sizeof(*grec) + + sizeof(struct in6_addr) * ntohs(*nsrcs); + + if (!ipv6_mc_may_pull(skb, len + grec_len)) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); - len += sizeof(*grec) + - sizeof(struct in6_addr) * ntohs(*nsrcs); + len += grec_len; /* We treat these as MLDv1 reports for now. */ switch (grec->grec_type) { @@ -1219,6 +1227,7 @@ static void br_ip4_multicast_query(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + unsigned int transport_len = ip_transport_len(skb); const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); struct net_bridge_mdb_entry *mp; @@ -1228,7 +1237,6 @@ static void br_ip4_multicast_query(struct net_bridge *br, struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; - unsigned int offset = skb_transport_offset(skb); __be32 group; spin_lock(&br->multicast_lock); @@ -1238,14 +1246,14 @@ static void br_ip4_multicast_query(struct net_bridge *br, group = ih->group; - if (skb->len == offset + sizeof(*ih)) { + if (transport_len == sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } - } else if (skb->len >= offset + sizeof(*ih3)) { + } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; @@ -1296,6 +1304,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + unsigned int transport_len = ipv6_transport_len(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; @@ -1315,7 +1324,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - if (skb->len == offset + sizeof(*mld)) { + if (transport_len == sizeof(*mld)) { if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; @@ -1576,17 +1585,29 @@ static void br_multicast_pim(struct net_bridge *br, br_multicast_mark_router(br, port); } +static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + if (ip_hdr(skb)->protocol != IPPROTO_IGMP || + igmp_hdr(skb)->type != IGMP_MRDISC_ADV) + return -ENOMSG; + + br_multicast_mark_router(br, port); + + return 0; +} + static int br_multicast_ipv4_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb_trimmed = NULL; const unsigned char *src; struct igmphdr *ih; int err; - err = ip_mc_check_igmp(skb, &skb_trimmed); + err = ip_mc_check_igmp(skb); if (err == -ENOMSG) { if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) { @@ -1594,7 +1615,15 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) br_multicast_pim(br, port, skb); + } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { + err = br_ip4_multicast_mrd_rcv(br, port, skb); + + if (err < 0 && err != -ENOMSG) { + br_multicast_err_count(br, port, skb->protocol); + return err; + } } + return 0; } else if (err < 0) { br_multicast_err_count(br, port, skb->protocol); @@ -1612,19 +1641,16 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid, src); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - br_ip4_multicast_query(br, port, skb_trimmed, vid); + br_ip4_multicast_query(br, port, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid, src); break; } - if (skb_trimmed && skb_trimmed != skb) - kfree_skb(skb_trimmed); - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); @@ -1632,21 +1658,51 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } #if IS_ENABLED(CONFIG_IPV6) +static int br_ip6_multicast_mrd_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + int ret; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + ret = ipv6_mc_check_icmpv6(skb); + if (ret < 0) + return ret; + + if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) + return -ENOMSG; + + br_multicast_mark_router(br, port); + + return 0; +} + static int br_multicast_ipv6_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb_trimmed = NULL; const unsigned char *src; struct mld_msg *mld; int err; - err = ipv6_mc_check_mld(skb, &skb_trimmed); + err = ipv6_mc_check_mld(skb); if (err == -ENOMSG) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; + + if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) { + err = br_ip6_multicast_mrd_rcv(br, port, skb); + + if (err < 0 && err != -ENOMSG) { + br_multicast_err_count(br, port, skb->protocol); + return err; + } + } + return 0; } else if (err < 0) { br_multicast_err_count(br, port, skb->protocol); @@ -1664,10 +1720,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, src); break; case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); + err = br_ip6_multicast_mld2_report(br, port, skb, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb_trimmed, vid); + err = br_ip6_multicast_query(br, port, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; @@ -1675,9 +1731,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, break; } - if (skb_trimmed && skb_trimmed != skb) - kfree_skb(skb_trimmed); - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); @@ -1781,6 +1834,68 @@ void br_multicast_init(struct net_bridge *br) INIT_HLIST_HEAD(&br->mdb_list); } +static void br_ip4_multicast_join_snoopers(struct net_bridge *br) +{ + struct in_device *in_dev = in_dev_get(br->dev); + + if (!in_dev) + return; + + ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP)); + in_dev_put(in_dev); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_join_snoopers(struct net_bridge *br) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); + ipv6_dev_mc_inc(br->dev, &addr); +} +#else +static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) +{ +} +#endif + +static void br_multicast_join_snoopers(struct net_bridge *br) +{ + br_ip4_multicast_join_snoopers(br); + br_ip6_multicast_join_snoopers(br); +} + +static void br_ip4_multicast_leave_snoopers(struct net_bridge *br) +{ + struct in_device *in_dev = in_dev_get(br->dev); + + if (WARN_ON(!in_dev)) + return; + + ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP)); + in_dev_put(in_dev); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_leave_snoopers(struct net_bridge *br) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); + ipv6_dev_mc_dec(br->dev, &addr); +} +#else +static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) +{ +} +#endif + +static void br_multicast_leave_snoopers(struct net_bridge *br) +{ + br_ip4_multicast_leave_snoopers(br); + br_ip6_multicast_leave_snoopers(br); +} + static void __br_multicast_open(struct net_bridge *br, struct bridge_mcast_own_query *query) { @@ -1794,6 +1909,9 @@ static void __br_multicast_open(struct net_bridge *br, void br_multicast_open(struct net_bridge *br) { + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + __br_multicast_open(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open(br, &br->ip6_own_query); @@ -1809,6 +1927,9 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif + + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_leave_snoopers(br); } void br_multicast_dev_del(struct net_bridge *br) @@ -1944,8 +2065,10 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) br_mc_disabled_update(br->dev, val); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { + br_multicast_leave_snoopers(br); goto unlock; + } if (!netif_running(br->dev)) goto unlock; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c93c35bb73dd..40d058378b52 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -881,11 +881,6 @@ static const struct nf_br_ops br_ops = { .br_dev_xmit_hook = br_nf_dev_xmit, }; -void br_netfilter_enable(void) -{ -} -EXPORT_SYMBOL_GPL(br_netfilter_enable); - /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static const struct nf_hook_ops br_nf_ops[] = { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index eabf8bf28a3f..00deef7fc1f3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -573,7 +573,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, u16 nlh_flags); + const unsigned char *addr, u16 vid, u16 nlh_flags, + struct netlink_ext_ack *extack); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 035ff59d9cbd..4d2b9eb7604a 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -113,7 +113,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, info.added_by_user = added_by_user; info.offloaded = offloaded; notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; - call_switchdev_notifiers(notifier_type, dev, &info.info); + call_switchdev_notifiers(notifier_type, dev, &info.info, NULL); } void diff --git a/net/core/dst.c b/net/core/dst.c index 81ccf20e2826..a263309df115 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -98,8 +98,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, struct dst_entry *dst; if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) + if (ops->gc(ops)) { + printk_ratelimited(KERN_NOTICE "Route cache is full: " + "consider increasing sysctl " + "net.ipv[4|6].route.max_size.\n"); return NULL; + } } dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); diff --git a/net/core/filter.c b/net/core/filter.c index 7559d6835ecb..41984ad4b9b4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6708,6 +6708,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct __sk_buff, gso_segs): + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_segs, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); @@ -7698,6 +7719,7 @@ const struct bpf_verifier_ops flow_dissector_verifier_ops = { }; const struct bpf_prog_ops flow_dissector_prog_ops = { + .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 9f2840510e63..bb1a54747d64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -683,6 +683,46 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys) +{ + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(*cb)); + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->nhoff = skb_network_offset(skb); + flow_keys->thoff = flow_keys->nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(prog, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, skb->len); + + return result == BPF_OK; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -714,7 +754,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; - struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -754,53 +793,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - rcu_read_lock(); if (skb) { + struct bpf_flow_keys flow_keys; + struct bpf_prog *attached = NULL; + + rcu_read_lock(); + if (skb->dev) attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); else if (skb->sk) attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); else WARN_ON_ONCE(1); - } - if (attached) { - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - struct bpf_flow_keys flow_keys = {}; - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; - u32 result; - - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(cb_saved)); - /* Pass parameters to the BPF program */ - cb->qdisc_cb.flow_keys = &flow_keys; - flow_keys.nhoff = nhoff; - flow_keys.thoff = nhoff; - - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(attached, skb); - - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); - flow_keys.thoff = clamp_t(u16, flow_keys.thoff, - flow_keys.nhoff, skb->len); - - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); + if (attached) { + ret = __skb_flow_bpf_dissect(attached, skb, + flow_dissector, + &flow_keys); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return ret; + } rcu_read_unlock(); - return result == BPF_OK; } - rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b02fb19df2cc..17f36317363d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -778,6 +778,41 @@ nla_put_failure: return -EMSGSIZE; } +static int rtnl_net_valid_getid_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETNSA_PID: + case NETNSA_FD: + case NETNSA_NSID: + case NETNSA_TARGET_NSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -793,8 +828,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int err; - err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy, extack); + err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); if (err < 0) return err; if (tb[NETNSA_PID]) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ea1bed08ede..f5a98082ac7a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3242,6 +3242,53 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } +static int rtnl_valid_getlink_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for get link"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, + extack); + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, + extack); + if (err) + return err; + + for (i = 0; i <= IFLA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_IFNAME: + case IFLA_EXT_MASK: + case IFLA_TARGET_NETNSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -3256,7 +3303,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u32 ext_filter_mask = 0; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; @@ -3639,7 +3686,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, const struct net_device_ops *ops = br_dev->netdev_ops; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags); + nlh->nlmsg_flags, extack); if (err) goto out; else @@ -3651,7 +3698,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags); + nlh->nlmsg_flags, + extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); @@ -4901,6 +4949,40 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, return size; } +static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, + bool is_dump, struct netlink_ext_ack *extack) +{ + struct if_stats_msg *ifsm; + + if (nlh->nlmsg_len < sizeof(*ifsm)) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); + return -EINVAL; + } + + if (!strict_check) + return 0; + + ifsm = nlmsg_data(nlh); + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { + NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); + return -EINVAL; + } + if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { + NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); + return -EINVAL; + } + + return 0; +} + static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -4912,8 +4994,10 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, u32 filter_mask; int err; - if (nlmsg_len(nlh) < sizeof(*ifsm)) - return -EINVAL; + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) @@ -4965,27 +5049,11 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; - if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) { - NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); - return -EINVAL; - } + err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); + if (err) + return err; ifsm = nlmsg_data(cb->nlh); - - /* only requests using strict checks can pass data to influence - * the dump. The legacy exception is filter_mask. - */ - if (cb->strict_check) { - if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) { - NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); - return -EINVAL; - } - if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) { - NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); - return -EINVAL; - } - } - filter_mask = ifsm->filter_mask; if (!filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d6d5c20d7044..e76ed8df9f13 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -78,11 +78,9 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, { int i = src->sg.start; struct scatterlist *sge = sk_msg_elem(src, i); + struct scatterlist *sgd = NULL; u32 sge_len, sge_off; - if (sk_msg_full(dst)) - return -ENOSPC; - while (off) { if (sge->length > off) break; @@ -94,16 +92,27 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, } while (len) { - if (sk_msg_full(dst)) - return -ENOSPC; - sge_len = sge->length - off; - sge_off = sge->offset + off; if (sge_len > len) sge_len = len; + + if (dst->sg.end) + sgd = sk_msg_elem(dst, dst->sg.end - 1); + + if (sgd && + (sg_page(sge) == sg_page(sgd)) && + (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { + sgd->length += sge_len; + dst->sg.size += sge_len; + } else if (!sk_msg_full(dst)) { + sge_off = sge->offset + off; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + } else { + return -ENOSPC; + } + off = 0; len -= sge_len; - sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); sk_mem_charge(sk, sge_len); sk_msg_iter_var_next(i); if (i == src->sg.end && len) diff --git a/net/core/sock.c b/net/core/sock.c index 6aa2e7e0b4fb..900e8a9435f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -520,14 +520,11 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, - int optlen) +static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); - char devname[IFNAMSIZ]; - int index; /* Sorry... */ ret = -EPERM; @@ -535,6 +532,32 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; ret = -EINVAL; + if (ifindex < 0) + goto out; + + sk->sk_bound_dev_if = ifindex; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + sk_dst_reset(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +static int sock_setbindtodevice(struct sock *sk, char __user *optval, + int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + ret = -EINVAL; if (optlen < 0) goto out; @@ -566,14 +589,9 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, } lock_sock(sk); - sk->sk_bound_dev_if = index; - if (sk->sk_prot->rehash) - sk->sk_prot->rehash(sk); - sk_dst_reset(sk); + ret = sock_setbindtodevice_locked(sk, index); release_sock(sk); - ret = 0; - out: #endif @@ -1055,6 +1073,10 @@ set_rcvbuf: } break; + case SO_BINDTOIFINDEX: + ret = sock_setbindtodevice_locked(sk, val); + break; + default: ret = -ENOPROTOOPT; break; @@ -1399,6 +1421,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, SOF_TXTIME_REPORT_ERRORS : 0; break; + case SO_BINDTOIFINDEX: + v.val = sk->sk_bound_dev_if; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1726,7 +1752,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); - atomic64_set(&newsk->sk_cookie, 0); if (likely(newsk->sk_net_refcnt)) sock_inuse_add(sock_net(newsk), 1); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d67ec17f2cc8..84bf2861f45f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -36,6 +36,15 @@ static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); +/* 0 - Keep current behavior: + * IPv4: inherit all current settings from init_net + * IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -544,6 +553,15 @@ static struct ctl_table net_core_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "devconf_inherit_init_net", + .data = &sysctl_devconf_inherit_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { } }; diff --git a/net/dccp/input.c b/net/dccp/input.c index 85d6c879383d..8d03707abdac 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -480,7 +480,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } - if (sk->sk_write_pending || icsk->icsk_ack.pingpong || + if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || icsk->icsk_accept_queue.rskq_defer_accept) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1501a20a94ca..74e138495d67 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -199,7 +199,7 @@ static void dccp_delack_timer(struct timer_list *t) icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { - if (!icsk->icsk_ack.pingpong) { + if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); @@ -207,7 +207,7 @@ static void dccp_delack_timer(struct timer_list *t) /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index f78fe58eafc8..6cd3737593a6 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -282,7 +282,7 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0) goto err_inval; - fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL); + fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); err = -ENOBUFS; if (fi == NULL) goto failure; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 026a05774bf7..1f4972dab9f2 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -103,7 +103,8 @@ static inline void dsa_legacy_unregister(void) { } int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags); + u16 flags, + struct netlink_ext_ack *extack); int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); diff --git a/net/dsa/master.c b/net/dsa/master.c index 71bb15f491c8..79e97d2f2d9b 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -126,6 +126,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, } } +static int dsa_master_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + + if (snprintf(name, len, "p%d", cpu_dp->index) >= len) + return -EINVAL; + + return 0; +} + static int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -158,6 +169,38 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) cpu_dp->orig_ethtool_ops = NULL; } +static int dsa_master_ndo_setup(struct net_device *dev) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; + struct net_device_ops *ops; + + if (dev->netdev_ops->ndo_get_phys_port_name) + return 0; + + ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + cpu_dp->orig_ndo_ops = dev->netdev_ops; + if (cpu_dp->orig_ndo_ops) + memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops)); + + ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name; + + dev->netdev_ops = ops; + + return 0; +} + +static void dsa_master_ndo_teardown(struct net_device *dev) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + + dev->netdev_ops = cpu_dp->orig_ndo_ops; + cpu_dp->orig_ndo_ops = NULL; +} + static ssize_t tagging_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -223,16 +266,27 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) if (ret) return ret; + ret = dsa_master_ndo_setup(dev); + if (ret) + goto out_err_ethtool_teardown; + ret = sysfs_create_group(&dev->dev.kobj, &dsa_group); if (ret) - dsa_master_ethtool_teardown(dev); + goto out_err_ndo_teardown; + + return ret; +out_err_ndo_teardown: + dsa_master_ndo_teardown(dev); +out_err_ethtool_teardown: + dsa_master_ethtool_teardown(dev); return ret; } void dsa_master_teardown(struct net_device *dev) { sysfs_remove_group(&dev->dev.kobj, &dsa_group); + dsa_master_ndo_teardown(dev); dsa_master_ethtool_teardown(dev); dsa_master_reset_mtu(dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a3fcc1d01615..91de3a663226 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1009,7 +1009,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags) + u16 flags, + struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1450,7 +1451,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) } fdb_info->offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev, - &fdb_info->info); + &fdb_info->info, NULL); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e258a00b4a3d..cd9033245b98 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2063,13 +2063,49 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; +static int inet_netconf_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETCONFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETCONFA_IFINDEX: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; - struct netconfmsg *ncm; struct sk_buff *skb; struct ipv4_devconf *devconf; struct in_device *in_dev; @@ -2077,9 +2113,8 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, int ifindex; int err; - err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv4_policy, extack); - if (err < 0) + err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); + if (err) goto errout; err = -EINVAL; @@ -2556,32 +2591,32 @@ static __net_init int devinet_init_net(struct net *net) int err; struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table *tbl; struct ctl_table_header *forw_hdr; #endif err = -ENOMEM; - all = &ipv4_devconf; - dflt = &ipv4_devconf_dflt; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); + if (!all) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); - if (!all) - goto err_alloc_all; - - dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); - if (!dflt) - goto err_alloc_dflt; + dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; #ifdef CONFIG_SYSCTL - tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); - if (!tbl) - goto err_alloc_ctl; + tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); + if (!tbl) + goto err_alloc_ctl; - tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; - tbl[0].extra1 = all; - tbl[0].extra2 = net; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; #endif + + if (sysctl_devconf_inherit_init_net != 2 && !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); } #ifdef CONFIG_SYSCTL @@ -2611,15 +2646,12 @@ err_reg_ctl: err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: - if (tbl != ctl_forward_entry) - kfree(tbl); + kfree(tbl); err_alloc_ctl: #endif - if (dflt != &ipv4_devconf_dflt) - kfree(dflt); + kfree(dflt); err_alloc_dflt: - if (all != &ipv4_devconf) - kfree(all); + kfree(all); err_alloc_all: return err; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 765b2b32c4a4..a40e48ded10d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1493,22 +1493,22 @@ static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) len += sizeof(struct igmpv3_report); - return pskb_may_pull(skb, len) ? 0 : -EINVAL; + return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { - unsigned int len = skb_transport_offset(skb); - - len += sizeof(struct igmphdr); - if (skb->len < len) - return -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + unsigned int len; /* IGMPv{1,2}? */ - if (skb->len != len) { + if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ - len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); - if (skb->len < len || !pskb_may_pull(skb, len)) + if (transport_len < sizeof(struct igmpv3_query)) + return -EINVAL; + + len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); + if (!ip_mc_may_pull(skb, len)) return -EINVAL; } @@ -1544,47 +1544,29 @@ static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_simple_validate(skb); } -static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) - +static int ip_mc_check_igmp_csum(struct sk_buff *skb) { - struct sk_buff *skb_chk; - unsigned int transport_len; unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); - int ret = -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + struct sk_buff *skb_chk; - transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + if (!ip_mc_may_pull(skb, len)) + return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) - goto err; - - if (!pskb_may_pull(skb_chk, len)) - goto err; - - ret = ip_mc_check_igmp_msg(skb_chk); - if (ret) - goto err; - - if (skb_trimmed) - *skb_trimmed = skb_chk; - /* free now unneeded clone */ - else if (skb_chk != skb) - kfree_skb(skb_chk); - - ret = 0; + return -EINVAL; -err: - if (ret && skb_chk && skb_chk != skb) + if (skb_chk != skb) kfree_skb(skb_chk); - return ret; + return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate - * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. @@ -1594,18 +1576,10 @@ err: * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * - * Optionally, an skb pointer might be provided via skb_trimmed (or set it - * to NULL): After parsing an IGMP packet successfully it will point to - * an skb which has its tail aligned to the IP packet end. This might - * either be the originally provided skb or a trimmed, cloned version if - * the skb frame had data beyond the IP packet. A cloned skb allows us - * to leave the original skb and its full frame unchanged (which might be - * desirable for layer 2 frame jugglers). - * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ -int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); @@ -1615,7 +1589,11 @@ int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; - return __ip_mc_check_igmp(skb, skb_trimmed); + ret = ip_mc_check_igmp_csum(skb); + if (ret < 0) + return ret; + + return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 760a9e52e02b..9f69411251d0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,62 @@ #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. @@ -123,6 +179,28 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -224,3 +302,218 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return fq; } EXPORT_SYMBOL(inet_frag_find); + +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) +{ + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). + */ + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (last->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < last->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == last->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = curr->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= curr->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= curr->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + } + + skb->ip_defrag_offset = offset; + + return IPFRAG_OK; +} +EXPORT_SYMBOL(inet_frag_queue_insert); + +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) +{ + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + struct sk_buff **nextp; + int delta; + + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) + return NULL; + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; + } + WARN_ON(head->ip_defrag_offset != 0); + + delta = -head->truesize; + + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + return NULL; + + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->net, delta); + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. + */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + return NULL; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; + } + + return nextp; +} +EXPORT_SYMBOL(inet_frag_reasm_prepare); + +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data) +{ + struct sk_buff **nextp = (struct sk_buff **)reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; + } + } + sub_frag_mem_limit(q->net, head->truesize); + + *nextp = NULL; + skb_mark_not_on_list(head); + head->prev = NULL; + head->tstamp = q->stamp; +} +EXPORT_SYMBOL(inet_frag_reasm_finish); + +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) +{ + struct sk_buff *head; + + if (q->fragments) { + head = q->fragments; + q->fragments = head->next; + } else { + struct sk_buff *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->net, head->truesize); + + return head; +} +EXPORT_SYMBOL(inet_frag_pull_head); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 867be8f7f1fa..486ecb0aeb87 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,57 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -/* Use skb->cb to track consecutive/adjacent fragments coming at - * the end of the queue. Nodes in the rb-tree queue will - * contain "runs" of one or more adjacent fragments. - * - * Invariants: - * - next_frag is NULL at the tail of a "run"; - * - the head of a "run" has the sum of all fragment lengths in frag_run_len. - */ -struct ipfrag_skb_cb { - struct inet_skb_parm h; - struct sk_buff *next_frag; - int frag_run_len; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - -static void ip4_frag_init_run(struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); - - FRAG_CB(skb)->next_frag = NULL; - FRAG_CB(skb)->frag_run_len = skb->len; -} - -/* Append skb to the last "run". */ -static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, - struct sk_buff *skb) -{ - RB_CLEAR_NODE(&skb->rbnode); - FRAG_CB(skb)->next_frag = NULL; - - FRAG_CB(q->last_run_head)->frag_run_len += skb->len; - FRAG_CB(q->fragments_tail)->next_frag = skb; - q->fragments_tail = skb; -} - -/* Create a new "run" with the skb. */ -static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) -{ - if (q->last_run_head) - rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, - &q->last_run_head->rbnode.rb_right); - else - rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); - rb_insert_color(&skb->rbnode, &q->rb_fragments); - - ip4_frag_init_run(skb); - q->fragments_tail = skb; - q->last_run_head = skb; -} - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t) * pull the head out of the tree in order to be able to * deal with head->dev. */ - if (qp->q.fragments) { - head = qp->q.fragments; - qp->q.fragments = head->next; - } else { - head = skb_rb_first(&qp->q.rb_fragments); - if (!head) - goto out; - if (FRAG_CB(head)->next_frag) - rb_replace_node(&head->rbnode, - &FRAG_CB(head)->next_frag->rbnode, - &qp->q.rb_fragments); - else - rb_erase(&head->rbnode, &qp->q.rb_fragments); - memset(&head->rbnode, 0, sizeof(head->rbnode)); - barrier(); - } - if (head == qp->q.fragments_tail) - qp->q.fragments_tail = NULL; - - sub_frag_mem_limit(qp->q.net, head->truesize); - + head = inet_frag_pull_head(&qp->q); + if (!head) + goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -344,12 +275,10 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; - struct sk_buff *skb1, *prev_tail; - int ihl, end, skb1_run_end; + int ihl, end, flags, offset; + struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; - int flags, offset; int err = -ENOENT; u8 ecn; @@ -413,62 +342,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Makes sure compiler wont do silly aliasing games */ barrier(); - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * - * We do the same here for IPv4 (and increment an snmp counter) but - * we do not want to drop the whole queue in response to a duplicate - * fragment. - */ - - err = -EINVAL; - /* Find out where to put this fragment. */ prev_tail = qp->q.fragments_tail; - if (!prev_tail) - ip4_frag_create_run(&qp->q, skb); /* First fragment. */ - else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { - /* This is the common case: skb goes to the end. */ - /* Detect and discard overlaps. */ - if (offset < prev_tail->ip_defrag_offset + prev_tail->len) - goto overlap; - if (offset == prev_tail->ip_defrag_offset + prev_tail->len) - ip4_frag_append_to_last_run(&qp->q, skb); - else - ip4_frag_create_run(&qp->q, skb); - } else { - /* Binary search. Note that skb can become the first fragment, - * but not the last (covered above). - */ - rbn = &qp->q.rb_fragments.rb_node; - do { - parent = *rbn; - skb1 = rb_to_skb(parent); - skb1_run_end = skb1->ip_defrag_offset + - FRAG_CB(skb1)->frag_run_len; - if (end <= skb1->ip_defrag_offset) - rbn = &parent->rb_left; - else if (offset >= skb1_run_end) - rbn = &parent->rb_right; - else if (offset >= skb1->ip_defrag_offset && - end <= skb1_run_end) - goto err; /* No new data, potential duplicate */ - else - goto overlap; /* Found an overlap */ - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. - */ - ip4_frag_init_run(skb); - rb_link_node(&skb->rbnode, parent, rbn); - rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - } + err = inet_frag_queue_insert(&qp->q, skb, offset, end); + if (err) + goto insert_error; if (dev) qp->iif = dev->ifindex; - skb->ip_defrag_offset = offset; qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; @@ -501,10 +381,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) skb_dst_drop(skb); return -EINPROGRESS; -overlap: +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb(skb); return err; @@ -516,13 +402,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); - struct sk_buff **nextp; /* To build frag_list. */ - struct rb_node *rbn; - int len; - int ihlen; - int delta; - int err; + void *reasm_data; + int len, err; u8 ecn; ipq_kill(qp); @@ -532,117 +413,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, err = -EINVAL; goto out_fail; } - /* Make the one we just received the head. */ - if (head != skb) { - fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - goto out_nomem; - FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; - if (RB_EMPTY_NODE(&skb->rbnode)) - FRAG_CB(prev_tail)->next_frag = fp; - else - rb_replace_node(&skb->rbnode, &fp->rbnode, - &qp->q.rb_fragments); - if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; - skb_morph(skb, head); - FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; - rb_replace_node(&head->rbnode, &skb->rbnode, - &qp->q.rb_fragments); - consume_skb(head); - head = skb; - } - WARN_ON(head->ip_defrag_offset != 0); - - /* Allocate a new buffer for the datagram. */ - ihlen = ip_hdrlen(head); - len = ihlen + qp->q.len; + /* Make the one we just received the head. */ + reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); + if (!reasm_data) + goto out_nomem; + len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_nomem; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(qp->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); - skb_shinfo(head)->frag_list = clone; - nextp = &clone->next; - } else { - nextp = &skb_shinfo(head)->frag_list; - } - - skb_push(head, head->data - skb_network_header(head)); + inet_frag_reasm_finish(&qp->q, skb, reasm_data); - /* Traverse the tree in order, to build frag_list. */ - fp = FRAG_CB(head)->next_frag; - rbn = rb_next(&head->rbnode); - rb_erase(&head->rbnode, &qp->q.rb_fragments); - while (rbn || fp) { - /* fp points to the next sk_buff in the current run; - * rbn points to the next run. - */ - /* Go through the current run. */ - while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; - } - /* Move to the next run. */ - if (rbn) { - struct rb_node *rbnext = rb_next(rbn); - - fp = rb_to_skb(rbn); - rb_erase(rbn, &qp->q.rb_fragments); - rbn = rbnext; - } - } - sub_frag_mem_limit(qp->q.net, head->truesize); - - *nextp = NULL; - skb_mark_not_on_list(head); - head->prev = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); + skb->dev = dev; + IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - iph = ip_hdr(head); + iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; @@ -655,7 +442,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { - IPCB(head)->flags |= IPSKB_FRAG_PMTU; + IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; @@ -753,28 +540,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); -unsigned int inet_frag_rbtree_purge(struct rb_root *root) -{ - struct rb_node *p = rb_first(root); - unsigned int sum = 0; - - while (p) { - struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - - p = rb_next(p); - rb_erase(&skb->rbnode, root); - while (skb) { - struct sk_buff *next = FRAG_CB(skb)->next_frag; - - sum += skb->truesize; - kfree_skb(skb); - skb = next; - } - } - return sum; -} -EXPORT_SYMBOL(inet_frag_rbtree_purge); - #ifdef CONFIG_SYSCTL static int dist_min; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 20a64fe6254b..d1cef66820d3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -449,81 +449,14 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } -static struct rtable *gre_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key) -{ - struct net *net = dev_net(dev); - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_GRE; - - return ip_route_output_key(net, fl); -} - -static struct rtable *prepare_fb_xmit(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - int tunnel_hlen) -{ - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - int min_headroom; - bool use_cache; - int err; - - tun_info = skb_tunnel_info(skb); - key = &tun_info->key; - use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); - - if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); - if (!rt) { - rt = gre_get_rt(skb, dev, fl, key); - if (IS_ERR(rt)) - goto err_free_skb; - if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl->saddr); - } - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - return rt; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NULL; -} - static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - struct flowi4 fl; int tunnel_hlen; - __be16 df, flags; + __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -533,13 +466,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) - goto err_free_rt; + goto err_free_skb; flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); @@ -547,14 +479,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; @@ -566,10 +494,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; - struct rtable *rt = NULL; bool truncate = false; - __be16 df, proto; - struct flowi4 fl; + __be16 proto; int tunnel_hlen; int version; int nhoff; @@ -582,21 +508,20 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) - goto err_free_rt; + goto err_free_skb; md = ip_tunnel_info_opts(tun_info); if (!md) - goto err_free_rt; + goto err_free_skb; /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; if (gre_handle_offloads(skb, false)) - goto err_free_rt; + goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { pskb_trim(skb, dev->mtu + dev->hard_header_len); @@ -625,20 +550,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) truncate, true); proto = htons(ETH_P_ERSPAN2); } else { - goto err_free_rt; + goto err_free_skb; } gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(tunnel->o_seqno++)); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; @@ -647,13 +568,18 @@ err_free_skb: static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; - rt = gre_get_rt(skb, dev, &fl4, &info->key); + key = &info->key; + ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), key->tos, 0, + skb->mark); + rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 51d8efba6de2..cd6b5694f99e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -429,7 +429,6 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -521,6 +520,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 054d01c16dc6..893f013d5369 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -501,15 +501,19 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, - const struct iphdr *inner_iph) + const struct iphdr *inner_iph, + int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); - int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; + int pkt_size; int mtu; + tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; + pkt_size = skb->len - tunnel_hlen - dev->hard_header_len; + if (df) mtu = dst_mtu(&rt->dst) - dev->hard_header_len - - sizeof(struct iphdr) - tunnel->hlen; + - sizeof(struct iphdr) - tunnel_hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -527,11 +531,13 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + __be32 daddr; + + daddr = md ? dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); @@ -548,17 +554,19 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } -void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; - struct rtable *rt; + struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; + bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -574,20 +582,39 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), RT_TOS(tos), + 0, skb->mark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; - rt = ip_route_output_key(tunnel->net, &fl4); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } + + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, + key->u.ipv4.dst, true)) { + ip_rt_put(rt); + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { @@ -598,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else ttl = ip4_dst_hoplimit(&rt->dst); } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - else if (skb->protocol == htons(ETH_P_IP)) + + if (!df && skb->protocol == htons(ETH_P_IP)) df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (headroom > dev->needed_headroom) dev->needed_headroom = headroom; @@ -737,7 +764,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, + 0, 0, false)) { ip_rt_put(rt); goto tx_error; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 57c5dd283a2c..fe10b9a2efc8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -302,7 +302,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) - ip_md_tunnel_xmit(skb, dev, ipproto); + ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ddbf8c9a1abb..fb99002c3d4e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2467,6 +2467,61 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); } +static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_SRC: + case RTA_DST: + case RTA_TABLE: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request"); + return -EINVAL; + } + } + + return 0; +} + static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2475,18 +2530,14 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; - struct rtmsg *rtm; __be32 src, grp; u32 tableid; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; - rtm = nlmsg_data(nlh); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 8d2e5dc9a827..a058213b77a7 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,24 +80,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) -{ - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); - } - return 0; -} -EXPORT_SYMBOL_GPL(nf_ip_reroute); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 2687db015b6f..e26165af45cb 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -214,7 +214,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, } /* Change outer to look like the reply to an incoming packet */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce92f73cf104..99be68b15da0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2763,6 +2763,75 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, return skb; } +static int inet_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, + "ipv4: Invalid header for route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); + return -EINVAL; + } + + if (rtm->rtm_flags & ~(RTM_F_NOTIFY | + RTM_F_LOOKUP_TABLE | + RTM_F_FIB_MATCH)) { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_IIF: + case RTA_OIF: + case RTA_SRC: + case RTA_DST: + case RTA_IP_PROTO: + case RTA_SPORT: + case RTA_DPORT: + case RTA_MARK: + case RTA_UID: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2783,8 +2852,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err; int mark; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, - extack); + err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2079145a3b7c..6f8d292ad501 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1127,7 +1127,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, - int *copied, size_t size) + int *copied, size_t size, + struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -1147,6 +1148,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; + tp->fastopen_req->uarg = uarg; if (inet->defer_connect) { err = tcp_connect(sk); @@ -1186,11 +1188,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { - if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { - err = -EINVAL; - goto out_err; - } - skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { @@ -1205,7 +1202,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && !tp->repair) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) @@ -1554,7 +1551,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && - !icsk->icsk_ack.pingpong)) && + !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } @@ -2572,14 +2569,16 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; - tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tp->delivered_ce = 0; @@ -2603,6 +2602,23 @@ int tcp_disconnect(struct sock *sk, int flags) tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; + tp->retrans_out = 0; + tp->sacked_out = 0; + tp->tlp_high_seq = 0; + tp->last_oow_ack_time = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + tp->rack.mstamp = 0; + tp->rack.advanced = 0; + tp->rack.reo_wnd_steps = 1; + tp->rack.last_delivered = 0; + tp->rack.reo_wnd_persist = 0; + tp->rack.dsack_seen = 0; + tp->syn_data_acked = 0; + tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2968,16 +2984,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_QUICKACK: if (!val) { - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } else { - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } } break; @@ -3391,7 +3407,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; } case TCP_QUICKACK: - val = !icsk->icsk_ack.pingpong; + val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0f497fc49c3f..56be7d27f208 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -115,6 +115,14 @@ struct bbr { unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + unused_c:6; }; #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ @@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48; +/* Gain factor for adding extra_acked to target cwnd: */ +static const int bbr_extra_acked_gain = BBR_UNIT; +/* Window length of extra_acked window. */ +static const u32 bbr_extra_acked_win_rtts = 5; +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; +/* Time period for clamping cwnd increment due to ack aggregation */ +static const u32 bbr_extra_acked_max_us = 100 * 1000; + static void bbr_check_probe_rtt_done(struct sock *sk); /* Do we estimate that STARTUP filled the pipe? */ @@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk) return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); } +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + /* Return rate in bytes per second, optionally with a gain. * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. @@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) if (event == CA_EVENT_TX_START && tp->app_limited) { bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; /* Avoid pointless buffer overflows: pace at est. bw if we don't * need more speed (we're restarting from idle and app-limited). */ @@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } } -/* Find target cwnd. Right-size the cwnd based on min RTT and the - * estimated bottleneck bandwidth: +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * cwnd = bw * min_rtt * gain = BDP * gain + * bdp = bw * min_rtt * gain * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT * measurements (e.g., delayed ACKs or other ACK compression effects). This * noise may cause BBR to under-estimate the rate. - * - * To achieve full performance in high-speed paths, we budget enough cwnd to - * fit full-sized skbs in-flight on both end hosts to fully utilize the path: - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) { struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd; + u32 bdp; u64 w; /* If we've never had a valid RTT sample, cap cwnd at the initial @@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; /* Apply a gain to the given value, then remove the BW_SCALE shift. */ - cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); /* Allow enough full-sized skbs in flight to utilize end systems. */ cwnd += 3 * bbr_tso_segs_goal(sk); @@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight, gain); + + return inflight; +} + /* With pacing at lower layers, there's often less data "in the network" than * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), * we often have several skbs queued in the pacing layer with a pre-scheduled @@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) return inflight_at_edt - interval_delivered; } +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done; + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ - target_cwnd = bbr_target_cwnd(sk, bw, gain); if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain > BBR_UNIT) return is_full_length && (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); /* A pacing_gain < 1.0 tries to drain extra queue we added if bw * probing didn't find more bw. If inflight falls to match BDP then we * estimate queue is drained; persisting would underutilize the pipe. */ return is_full_length || - inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); + inflight <= bbr_inflight(sk, bw, BBR_UNIT); } static void bbr_advance_cycle_phase(struct sock *sk) @@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) } } +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. */ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + /* Estimate when the pipe is full, using the change in delivery rate: BBR * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited @@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ tcp_sk(sk)->snd_ssthresh = - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk) static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); + bbr_update_ack_aggregation(sk, rs); bbr_update_cycle_phase(sk, rs); bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); @@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk) bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 76858b14ebe9..7a027dec649b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -221,7 +221,7 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } EXPORT_SYMBOL(tcp_enter_quickack_mode); @@ -236,7 +236,7 @@ static bool tcp_in_quickack_mode(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); return (dst && dst_metric(dst, RTAX_QUICKACK)) || - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -4094,7 +4094,7 @@ void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); break; case TCP_CLOSE_WAIT: @@ -5889,7 +5889,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || - icsk->icsk_ack.pingpong) { + inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index efc6fef692ff..662b034f1795 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2437,7 +2437,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), tp->snd_cwnd, state == TCP_LISTEN ? fastopenq->max_qlen : diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d9..182595e2d40f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -479,43 +479,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); @@ -556,13 +529,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 730bc44dbad9..96bdb8eae9bb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -165,13 +165,16 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - tp->lsndtime = now; - - /* If it is a reply for ato after last received - * packet, enter pingpong mode. + /* If this is the first data packet sent in response to the + * previous received data, + * and it is a reply for ato after last received packet, + * increase pingpong count. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && + (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_inc_pingpong_cnt(sk); + + tp->lsndtime = now; } /* Account for an ACK we sent. */ @@ -980,7 +983,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = sk->sk_pacing_rate; @@ -1028,7 +1030,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); - + prior_wstamp = tp->tcp_wstamp_ns; + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; @@ -1045,11 +1049,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, return -ENOBUFS; } - prior_wstamp = tp->tcp_wstamp_ns; - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; - inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); @@ -2937,12 +2936,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); @@ -2963,13 +2966,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); - - /* Save stamp of the first retransmit. */ - if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); - } + /* Save stamp of the first (attempted) retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); @@ -3456,6 +3458,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) skb_trim(syn_data, copied); space = copied; } + skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) @@ -3569,7 +3572,7 @@ void tcp_send_delayed_ack(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || + if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; @@ -3750,7 +3753,7 @@ void tcp_send_probe0(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - unsigned long probe_max; + unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); @@ -3762,26 +3765,18 @@ void tcp_send_probe0(struct sock *sk) return; } + icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; - icsk->icsk_probes_out++; - probe_max = TCP_RTO_MAX; + timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember icsk_probes_out. - * Let local senders to fight for local resources. - * - * Use accumulated backoff yet. + * Let senders fight for local resources conservatively. */ - if (!icsk->icsk_probes_out) - icsk->icsk_probes_out = 1; - probe_max = TCP_RESOURCE_PROBE_INTERVAL; - } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_when(sk, probe_max), - TCP_RTO_MAX, - NULL); + timeout = TCP_RESOURCE_PROBE_INTERVAL; + } + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 71a29e9c0620..f0c86398e6a7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,28 +22,14 @@ #include <linux/gfp.h> #include <net/tcp.h> -static u32 tcp_retransmit_stamp(const struct sock *sk) -{ - u32 start_ts = tcp_sk(sk)->retrans_stamp; - - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return 0; - start_ts = tcp_skb_timestamp(head); - } - return start_ts; -} - static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; s32 remaining; - start_ts = tcp_retransmit_stamp(sk); - if (!icsk->icsk_user_timeout || !start_ts) + start_ts = tcp_sk(sk)->retrans_stamp; + if (!icsk->icsk_user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; remaining = icsk->icsk_user_timeout - elapsed; @@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } - +static unsigned int tcp_model_timeout(struct sock *sk, + unsigned int boundary, + unsigned int rto_base) +{ + unsigned int linear_backoff_thresh, timeout; + + linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + return jiffies_to_msecs(timeout); +} /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket @@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - const unsigned int rto_base = TCP_RTO_MIN; - unsigned int linear_backoff_thresh, start_ts; + unsigned int start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_retransmit_stamp(sk); - if (!start_ts) - return false; - - if (likely(timeout == 0)) { - linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + start_ts = tcp_sk(sk)->retrans_stamp; + if (likely(timeout == 0)) + timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * rto_base; - else - timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; - timeout = jiffies_to_msecs(timeout); - } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } @@ -289,14 +277,14 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { - if (!icsk->icsk_ack.pingpong) { + if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; - u32 start_ts; if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; @@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(skb); - if (!start_ts) - skb->skb_mstamp_ns = tp->tcp_clock_cache; - else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) - goto abort; + if (icsk->icsk_user_timeout) { + u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, + tcp_probe0_base(sk)); + + if (elapsed >= icsk->icsk_user_timeout) + goto abort; + } max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { @@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk) inet_rtx_syn_ack(sk, req); req->num_timeout++; icsk->icsk_retransmits++; + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } @@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk) */ return; } - if (!tp->packets_out) - goto out; - - WARN_ON(tcp_rtx_queue_empty(sk)); + if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk))) + return; tp->tlp_high_seq = 0; @@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); + icsk->icsk_retransmits++; if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, - * do not backoff. + * Let senders fight for local resources conservatively. */ - if (!icsk->icsk_retransmits) - icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RESOURCE_PROBE_INTERVAL, TCP_RTO_MAX); goto out; } @@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk) * the 120 second clamps though! */ icsk->icsk_backoff++; - icsk->icsk_retransmits++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index be8b5b2157d8..e93cc0379201 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -21,18 +21,9 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - struct net_device *dev; - - dev = dev_get_by_index(net, cfg->bind_ifindex); - if (!dev) { - err = -ENODEV; - goto error; - } - - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, - dev->name, strlen(dev->name) + 1); - dev_put(dev); - + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, + (void *)&cfg->bind_ifindex, + sizeof(cfg->bind_ifindex)); if (err < 0) goto error; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 84c358804355..dcb1d434f7da 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -597,6 +597,43 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; +static int inet6_netconf_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv6_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv6_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETCONFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETCONFA_IFINDEX: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet6_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) @@ -605,14 +642,12 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, struct nlattr *tb[NETCONFA_MAX+1]; struct inet6_dev *in6_dev = NULL; struct net_device *dev = NULL; - struct netconfmsg *ncm; struct sk_buff *skb; struct ipv6_devconf *devconf; int ifindex; int err; - err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv6_policy, extack); + err = inet6_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) return err; @@ -5181,6 +5216,52 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) return inet6_dump_addr(skb, cb, type); } +static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifaddrmsg *ifm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); + if (err) + return err; + + for (i = 0; i <= IFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case IFA_TARGET_NETNSID: + case IFA_ADDRESS: + case IFA_LOCAL: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get address request"); + return -EINVAL; + } + } + + return 0; +} + static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -5201,8 +5282,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, - extack); + err = inet6_rtm_valid_getaddr_req(in_skb, nlh, tb, extack); if (err < 0) return err; @@ -6824,6 +6904,11 @@ static int __net_init addrconf_init_net(struct net *net) if (!dflt) goto err_alloc_dflt; + if (sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + } + /* these will be inherited by all namespaces */ dflt->autoconf = ipv6_defaults.autoconf; dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 0d1ee82ee55b..d43d076c98f5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -523,6 +523,50 @@ static inline int ip6addrlbl_msgsize(void) + nla_total_size(4); /* IFAL_LABEL */ } +static int ip6addrlbl_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifaddrlblmsg *ifal; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, + ifal_policy, extack); + + ifal = nlmsg_data(nlh); + if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifal), tb, IFAL_MAX, + ifal_policy, extack); + if (err) + return err; + + for (i = 0; i <= IFAL_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case IFAL_ADDRESS: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in addrlabel get request"); + return -EINVAL; + } + } + + return 0; +} + static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -535,8 +579,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct ip6addrlbl_entry *p; struct sk_buff *skb; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, - extack); + err = ip6addrlbl_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) return err; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4416368dbd49..e081e69d534e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -524,7 +524,7 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; } -static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, +static int ip6erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi) { struct erspan_base_hdr *ershdr; @@ -607,7 +607,7 @@ static int gre_rcv(struct sk_buff *skb) if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { - if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + if (ip6erspan_rcv(skb, &tpi) == PACKET_RCVD) return 0; goto out; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index ad1a9ccd4b44..25430c991cea 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -32,18 +32,9 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - struct net_device *dev; - - dev = dev_get_by_index(net, cfg->bind_ifindex); - if (!dev) { - err = -ENODEV; - goto error; - } - - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, - dev->name, strlen(dev->name) + 1); - dev_put(dev); - + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, + (void *)&cfg->bind_ifindex, + sizeof(cfg->bind_ifindex)); if (err < 0) goto error; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 21f6deb2aec9..42f3f5cd349f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -940,6 +940,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } +EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del @@ -987,6 +988,7 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) return err; } +EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c index 9405b04eecc6..55e2ac179f28 100644 --- a/net/ipv6/mcast_snoop.c +++ b/net/ipv6/mcast_snoop.c @@ -41,6 +41,8 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) if (skb->len < len || len <= offset) return -EINVAL; + skb_set_transport_header(skb, offset); + return 0; } @@ -77,27 +79,27 @@ static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) len += sizeof(struct mld2_report); - return pskb_may_pull(skb, len) ? 0 : -EINVAL; + return ipv6_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ipv6_mc_check_mld_query(struct sk_buff *skb) { + unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; - unsigned int len = skb_transport_offset(skb); + unsigned int len; /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) return -EINVAL; - len += sizeof(struct mld_msg); - if (skb->len < len) - return -EINVAL; - /* MLDv1? */ - if (skb->len != len) { + if (transport_len != sizeof(struct mld_msg)) { /* or MLDv2? */ - len += sizeof(struct mld2_query) - sizeof(struct mld_msg); - if (skb->len < len || !pskb_may_pull(skb, len)) + if (transport_len < sizeof(struct mld2_query)) + return -EINVAL; + + len = skb_transport_offset(skb) + sizeof(struct mld2_query); + if (!ipv6_mc_may_pull(skb, len)) return -EINVAL; } @@ -115,7 +117,13 @@ static int ipv6_mc_check_mld_query(struct sk_buff *skb) static int ipv6_mc_check_mld_msg(struct sk_buff *skb) { - struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + struct mld_msg *mld; + + if (!ipv6_mc_may_pull(skb, len)) + return -EINVAL; + + mld = (struct mld_msg *)skb_transport_header(skb); switch (mld->mld_type) { case ICMPV6_MGM_REDUCTION: @@ -136,49 +144,30 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } -static int __ipv6_mc_check_mld(struct sk_buff *skb, - struct sk_buff **skb_trimmed) - +int ipv6_mc_check_icmpv6(struct sk_buff *skb) { - struct sk_buff *skb_chk = NULL; - unsigned int transport_len; - unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); - int ret = -EINVAL; + unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr); + unsigned int transport_len = ipv6_transport_len(skb); + struct sk_buff *skb_chk; - transport_len = ntohs(ipv6_hdr(skb)->payload_len); - transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + if (!ipv6_mc_may_pull(skb, len)) + return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ipv6_mc_validate_checksum); if (!skb_chk) - goto err; - - if (!pskb_may_pull(skb_chk, len)) - goto err; - - ret = ipv6_mc_check_mld_msg(skb_chk); - if (ret) - goto err; - - if (skb_trimmed) - *skb_trimmed = skb_chk; - /* free now unneeded clone */ - else if (skb_chk != skb) - kfree_skb(skb_chk); - - ret = 0; + return -EINVAL; -err: - if (ret && skb_chk && skb_chk != skb) + if (skb_chk != skb) kfree_skb(skb_chk); - return ret; + return 0; } +EXPORT_SYMBOL(ipv6_mc_check_icmpv6); /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet * @skb: the skb to validate - * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) * * Checks whether an IPv6 packet is a valid MLD packet. If so sets * skb transport header accordingly and returns zero. @@ -188,18 +177,10 @@ err: * -ENOMSG: IP header validation succeeded but it is not an MLD packet. * -ENOMEM: A memory allocation failure happened. * - * Optionally, an skb pointer might be provided via skb_trimmed (or set it - * to NULL): After parsing an MLD packet successfully it will point to - * an skb which has its tail aligned to the IP packet end. This might - * either be the originally provided skb or a trimmed, cloned version if - * the skb frame had data beyond the IP packet. A cloned skb allows us - * to leave the original skb and its full frame unchanged (which might be - * desirable for layer 2 frame jugglers). - * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ -int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +int ipv6_mc_check_mld(struct sk_buff *skb) { int ret; @@ -211,6 +192,10 @@ int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) if (ret < 0) return ret; - return __ipv6_mc_check_mld(skb, skb_trimmed); + ret = ipv6_mc_check_icmpv6(skb); + if (ret < 0) + return ret; + + return ipv6_mc_check_mld_msg(skb); } EXPORT_SYMBOL(ipv6_mc_check_mld); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 181da2c40f9a..cb1b4772dac0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -136,6 +136,9 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) } #endif +static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); + static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -177,9 +180,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { - struct sk_buff *prev, *next; unsigned int payload_len; - int offset, end; + struct net_device *dev; + struct sk_buff *prev; + int offset, end, err; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) { @@ -254,55 +258,18 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, goto err; } - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = fq->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = fq->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } - -found: - /* RFC5722, Section 4: - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments, including those not yet received) MUST be silently - * discarded. - */ - - /* Check for overlap with preceding fragment. */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_fq; - - /* Look for overlap with succeeding segment. */ - if (next && next->ip_defrag_offset < end) - goto discard_fq; - - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - if (skb->dev) - fq->iif = skb->dev->ifindex; + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); - skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - fq->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - fq->q.fragments = skb; + prev = fq->q.fragments_tail; + err = inet_frag_queue_insert(&fq->q, skb, offset, end); + if (err) + goto insert_error; + + if (dev) + fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; @@ -319,11 +286,25 @@ found: fq->q.flags |= INET_FRAG_FIRST_IN; } - return 0; + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + err = nf_ct_frag6_reasm(fq, skb, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } + + skb_dst_drop(skb); + return -EINPROGRESS; -discard_fq: +insert_error: + if (err == IPFRAG_DUP) + goto err; inet_frag_kill(&fq->q); err: + skb_dst_drop(skb); return -EINVAL; } @@ -333,147 +314,67 @@ err: * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. - * - * returns true if *prev skb has been transformed into the reassembled - * skb, false otherwise. */ -static bool -nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) +static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev) { - struct sk_buff *fp, *head = fq->q.fragments; - int payload_len, delta; + void *reasm_data; + int payload_len; u8 ecn; inet_frag_kill(&fq->q); - WARN_ON(head == NULL); - WARN_ON(head->ip_defrag_offset != 0); - ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) - return false; + goto err; + + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); + if (!reasm_data) + goto err; - /* Unfragmented part is taken from the first segment. */ - payload_len = ((head->data - skb_network_header(head)) - + payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); - return false; - } - - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - return false; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(fq->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (clone == NULL) - return false; - - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - - add_frag_mem_limit(fq->q.net, clone->truesize); - } - - /* morph head into last received skb: prev. - * - * This allows callers of ipv6 conntrack defrag to continue - * to use the last skb(frag) passed into the reasm engine. - * The last skb frag 'silently' turns into the full reassembled skb. - * - * Since prev is also part of q->fragments we have to clone it first. - */ - if (head != prev) { - struct sk_buff *iter; - - fp = skb_clone(prev, GFP_ATOMIC); - if (!fp) - return false; - - fp->next = prev->next; - - iter = head; - while (iter) { - if (iter->next == prev) { - iter->next = fp; - break; - } - iter = iter->next; - } - - skb_morph(prev, head); - prev->next = head->next; - consume_skb(head); - head = prev; + goto err; } /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ - skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; - memmove(head->head + sizeof(struct frag_hdr), head->head, - (head->data - head->head) - sizeof(struct frag_hdr)); - head->mac_header += sizeof(struct frag_hdr); - head->network_header += sizeof(struct frag_hdr); - - skb_shinfo(head)->frag_list = head->next; - skb_reset_transport_header(head); - skb_push(head, head->data - skb_network_header(head)); - - for (fp = head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp->sk = NULL; - } - sub_frag_mem_limit(fq->q.net, head->truesize); + skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; + memmove(skb->head + sizeof(struct frag_hdr), skb->head, + (skb->data - skb->head) - sizeof(struct frag_hdr)); + skb->mac_header += sizeof(struct frag_hdr); + skb->network_header += sizeof(struct frag_hdr); + + skb_reset_transport_header(skb); + + inet_frag_reasm_finish(&fq->q, skb, reasm_data); - head->ignore_df = 1; - skb_mark_not_on_list(head); - head->dev = dev; - head->tstamp = fq->q.stamp; - ipv6_hdr(head)->payload_len = htons(payload_len); - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); - IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + skb->ignore_df = 1; + skb->dev = dev; + ipv6_hdr(skb)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); + IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_partial(skb_network_header(skb), + skb_network_header_len(skb), + skb->csum); fq->q.fragments = NULL; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; + fq->q.last_run_head = NULL; - return true; + return 0; + +err: + inet_frag_kill(&fq->q); + return -EINVAL; } /* @@ -542,7 +443,6 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; - struct net_device *dev = skb->dev; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; @@ -565,10 +465,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - return -EINVAL; - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); @@ -580,31 +476,17 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); - if (ret < 0) { - if (ret == -EPROTO) { - skb->transport_header = savethdr; - ret = 0; - } - goto out_unlock; + if (ret == -EPROTO) { + skb->transport_header = savethdr; + ret = 0; } /* after queue has assumed skb ownership, only 0 or -EINPROGRESS * must be returned. */ - ret = -EINPROGRESS; - if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len) { - unsigned long orefdst = skb->_skb_refdst; - - skb->_skb_refdst = 0UL; - if (nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - skb->_skb_refdst = orefdst; - } else { - skb_dst_drop(skb); - } + if (ret) + ret = -EINPROGRESS; -out_unlock: spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q); return ret; diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 23022447eb49..9c914db44bec 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -225,7 +225,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, skb->len - hdrlen, 0)); } - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 36a3d8dc61f5..24264d0a4b85 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -69,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, - struct net_device *dev); +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); static void ip6_frag_expire(struct timer_list *t) { @@ -111,21 +111,26 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset) { - struct sk_buff *prev, *next; - struct net_device *dev; - int offset, end, fragsize; struct net *net = dev_net(skb_dst(skb)->dev); + int offset, end, fragsize; + struct sk_buff *prev_tail; + struct net_device *dev; + int err = -ENOENT; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) goto err; + err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); + /* note that if prob_offset is set, the skb is freed elsewhere, + * we do not free it here. + */ return -1; } @@ -170,62 +175,27 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (end == offset) goto discard_fq; + err = -ENOMEM; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) goto discard_fq; - if (pskb_trim_rcsum(skb, end - offset)) + err = pskb_trim_rcsum(skb, end - offset); + if (err) goto discard_fq; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = fq->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = fq->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } - -found: - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - */ - - /* Check for overlap with preceding fragment. */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_fq; - - /* Look for overlap with succeeding segment. */ - if (next && next->ip_defrag_offset < end) - goto discard_fq; - - /* Note : skb->ip_defrag_offset and skb->sk share the same location */ + /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; - if (dev) - fq->iif = dev->ifindex; /* Makes sure compiler wont do silly aliasing games */ barrier(); - skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - fq->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - fq->q.fragments = skb; + prev_tail = fq->q.fragments_tail; + err = inet_frag_queue_insert(&fq->q, skb, offset, end); + if (err) + goto insert_error; + + if (dev) + fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; @@ -246,44 +216,48 @@ found: if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { - int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = ip6_frag_reasm(fq, prev, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev); skb->_skb_refdst = orefdst; - return res; + return err; } skb_dst_drop(skb); - return -1; + return -EINPROGRESS; +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASM_OVERLAPS); discard_fq: inet_frag_kill(&fq->q); -err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); +err: kfree_skb(skb); - return -1; + return err; } /* * Check if this packet is complete. - * Returns NULL on failure by any reason, and pointer - * to current nexthdr field in reassembled frame. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, - struct net_device *dev) +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); - struct sk_buff *fp, *head = fq->q.fragments; - int payload_len, delta; unsigned int nhoff; - int sum_truesize; + void *reasm_data; + int payload_len; u8 ecn; inet_frag_kill(&fq->q); @@ -292,121 +266,40 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (unlikely(ecn == 0xff)) goto out_fail; - /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); - - if (!fp) - goto out_oom; - - fp->next = head->next; - if (!fp->next) - fq->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, fq->q.fragments); - head->next = fq->q.fragments->next; - - consume_skb(fq->q.fragments); - fq->q.fragments = head; - } - - WARN_ON(head == NULL); - WARN_ON(head->ip_defrag_offset != 0); + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); + if (!reasm_data) + goto out_oom; - /* Unfragmented part is taken from the first segment. */ - payload_len = ((head->data - skb_network_header(head)) - + payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) goto out_oversize; - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_oom; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(fq->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_oom; - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(fq->q.net, clone->truesize); - } - /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; - skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; - memmove(head->head + sizeof(struct frag_hdr), head->head, - (head->data - head->head) - sizeof(struct frag_hdr)); - if (skb_mac_header_was_set(head)) - head->mac_header += sizeof(struct frag_hdr); - head->network_header += sizeof(struct frag_hdr); - - skb_reset_transport_header(head); - skb_push(head, head->data - skb_network_header(head)); - - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - fp->sk = NULL; - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; - } - sub_frag_mem_limit(fq->q.net, sum_truesize); + skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; + memmove(skb->head + sizeof(struct frag_hdr), skb->head, + (skb->data - skb->head) - sizeof(struct frag_hdr)); + if (skb_mac_header_was_set(skb)) + skb->mac_header += sizeof(struct frag_hdr); + skb->network_header += sizeof(struct frag_hdr); + + skb_reset_transport_header(skb); + + inet_frag_reasm_finish(&fq->q, skb, reasm_data); - skb_mark_not_on_list(head); - head->dev = dev; - head->tstamp = fq->q.stamp; - ipv6_hdr(head)->payload_len = htons(payload_len); - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); - IP6CB(head)->nhoff = nhoff; - IP6CB(head)->flags |= IP6SKB_FRAGMENTED; - IP6CB(head)->frag_max_size = fq->q.max_size; + skb->dev = dev; + ipv6_hdr(skb)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); + IP6CB(skb)->nhoff = nhoff; + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; + IP6CB(skb)->frag_max_size = fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ - skb_postpush_rcsum(head, skb_network_header(head), - skb_network_header_len(head)); + skb_postpush_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); @@ -414,6 +307,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, fq->q.fragments = NULL; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; + fq->q.last_run_head = NULL; return 1; out_oversize: @@ -464,10 +358,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - goto fail_hdr; - iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { @@ -485,6 +375,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); + /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); } return ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 964491cf3672..dc066fdf7e46 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4812,6 +4812,73 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg) arg->cb->nlh->nlmsg_seq, flags); } +static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid header for get route request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || + rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || + rtm->rtm_type) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); + return -EINVAL; + } + if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid flags for get route request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_SRC: + case RTA_DST: + case RTA_IIF: + case RTA_OIF: + case RTA_MARK: + case RTA_UID: + case RTA_SPORT: + case RTA_DPORT: + case RTA_IP_PROTO: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); + return -EINVAL; + } + } + + return 0; +} + static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -4826,8 +4893,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct flowi6 fl6 = {}; bool fibmatch; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, - extack); + err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b81eb7cb815e..e51cda79f0cc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1864,7 +1864,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tp->snd_cwnd, state == TCP_LISTEN ? fastopenq->max_qlen : diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7d55d4c04088..2662a23c658e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1209,21 +1209,57 @@ static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, }; +static int mpls_netconf_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid header for netconf get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_mpls_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_mpls_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETCONFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETCONFA_IFINDEX: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request"); + return -EINVAL; + } + } + + return 0; +} + static int mpls_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; - struct netconfmsg *ncm; struct net_device *dev; struct mpls_dev *mdev; struct sk_buff *skb; int ifindex; int err; - err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_mpls_policy, extack); + err = mpls_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; @@ -2236,6 +2272,64 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); } +static int mpls_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid header for get route request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || + rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || + rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); + return -EINVAL; + } + if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { + NL_SET_ERR_MSG_MOD(extack, + "Invalid flags for get route request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); + if (err) + return err; + + if ((tb[RTA_DST] || tb[RTA_NEWDST]) && !rtm->rtm_dst_len) { + NL_SET_ERR_MSG_MOD(extack, "rtm_dst_len must be 20 for MPLS"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_DST: + case RTA_NEWDST: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); + return -EINVAL; + } + } + + return 0; +} + static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct netlink_ext_ack *extack) { @@ -2255,8 +2349,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, u8 n_labels; int err; - err = nlmsg_parse(in_nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_mpls_policy, extack); + err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index beb3a69ce1d4..fefd63a243f2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -174,7 +174,7 @@ config NF_CT_PROTO_DCCP If unsure, say Y. config NF_CT_PROTO_GRE - tristate + bool config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1ae65a314d7a..e66067befa42 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -13,6 +13,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o +nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -25,8 +26,6 @@ obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o -obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o - # netlink interface for nf_conntrack obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index fe9abf3cc10a..e969dad66991 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -53,6 +53,7 @@ #endif #include <net/ip_vs.h> +#include <linux/indirect_call_wrapper.h> EXPORT_SYMBOL(register_ip_vs_scheduler); @@ -70,6 +71,29 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); +#ifdef CONFIG_IP_VS_PROTO_TCP +INDIRECT_CALLABLE_DECLARE(int + tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP +INDIRECT_CALLABLE_DECLARE(int + udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)); +#endif + +#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) +#define SNAT_CALL(f, ...) \ + INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) +#elif defined(CONFIG_IP_VS_PROTO_TCP) +#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) +#elif defined(CONFIG_IP_VS_PROTO_UDP) +#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) +#else +#define SNAT_CALL(f, ...) f(__VA_ARGS__) +#endif + static unsigned int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); @@ -478,7 +502,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { iph->hdr_flags ^= IP_VS_HDR_INVERSE; - cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph); + cp = INDIRECT_CALL_1(pp->conn_in_get, + ip_vs_conn_in_get_proto, svc->ipvs, + svc->af, skb, iph); iph->hdr_flags ^= IP_VS_HDR_INVERSE; if (cp) { @@ -972,7 +998,8 @@ static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph); + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, AF_INET, skb, &ciph); if (!cp) return NF_ACCEPT; @@ -1028,7 +1055,8 @@ static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, return NF_ACCEPT; /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph); + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, AF_INET6, skb, &ciph); if (!cp) return NF_ACCEPT; @@ -1263,7 +1291,8 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, goto drop; /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph)) + if (pp->snat_handler && + !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) goto drop; #ifdef CONFIG_IP_VS_IPV6 @@ -1389,7 +1418,8 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(ipvs, af, skb, &iph); + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, af, skb, &iph); if (likely(cp)) { if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) @@ -1644,7 +1674,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, /* The embedded headers contain source and dest in reverse order. * For IPIP this is error for request, not for reply. */ - cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph); + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, AF_INET, skb, &ciph); if (!cp) { int v; @@ -1796,7 +1827,8 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, /* The embedded headers contain source and dest in reverse order * if not from localhost */ - cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph); + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, AF_INET6, skb, &ciph); if (!cp) { int v; @@ -1925,7 +1957,8 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(ipvs, af, skb, &iph); + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, af, skb, &iph); conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 5320d39976e1..480598cb0f05 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -129,7 +129,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, - .csum_check = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, @@ -152,7 +151,6 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, - .csum_check = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index b0cd7d08f2a7..bc3d1625ecc8 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -10,6 +10,9 @@ #include <net/ip_vs.h> static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, @@ -105,7 +108,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ @@ -152,7 +155,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ @@ -587,7 +590,6 @@ struct ip_vs_protocol ip_vs_protocol_sctp = { .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, - .csum_check = sctp_csum_check, .state_name = sctp_state_name, .state_transition = sctp_state_transition, .app_conn_bind = sctp_app_conn_bind, diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 1770fc6ce960..479419759983 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -28,10 +28,14 @@ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> +#include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, @@ -143,7 +147,7 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, } -static int +INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { @@ -166,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ @@ -192,7 +196,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = (cp->app && pp->csum_check) ? + skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ @@ -244,7 +248,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* @@ -275,7 +279,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = (cp->app && pp->csum_check) ? + skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ @@ -736,7 +740,6 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 0f53c49025f8..646c384910fb 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -23,12 +23,16 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/udp.h> +#include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> #include <net/ip.h> #include <net/ip6_checksum.h> static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, @@ -133,7 +137,7 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, } -static int +INDIRECT_CALLABLE_SCOPE int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { @@ -156,7 +160,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp)) return 0; /* @@ -186,7 +190,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = (cp->app && pp->csum_check) ? + skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ @@ -239,7 +243,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp)) return 0; /* @@ -270,7 +274,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = (cp->app && pp->csum_check) ? + skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ @@ -494,7 +498,6 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, .state_transition = udp_state_transition, .state_name = udp_state_name, .register_app = udp_register_app, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 741b533148ba..815956ac5a76 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -222,6 +222,24 @@ static u32 hash_conntrack(const struct net *net, return scale_hash(hash_conntrack_raw(tuple, net)); } +static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ struct { + __be16 sport; + __be16 dport; + } _inet_hdr, *inet_hdr; + + /* Actually only need first 4 bytes to get ports. */ + inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); + if (!inet_hdr) + return false; + + tuple->src.u.udp.port = inet_hdr->sport; + tuple->dst.u.udp.port = inet_hdr->dport; + return true; +} + static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, @@ -229,16 +247,11 @@ nf_ct_get_tuple(const struct sk_buff *skb, u_int16_t l3num, u_int8_t protonum, struct net *net, - struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l4proto *l4proto) + struct nf_conntrack_tuple *tuple) { unsigned int size; const __be32 *ap; __be32 _addrs[8]; - struct { - __be16 sport; - __be16 dport; - } _inet_hdr, *inet_hdr; memset(tuple, 0, sizeof(*tuple)); @@ -274,16 +287,36 @@ nf_ct_get_tuple(const struct sk_buff *skb, tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; - if (unlikely(l4proto->pkt_to_tuple)) - return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); - - /* Actually only need first 4 bytes to get ports. */ - inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); - if (!inet_hdr) - return false; + switch (protonum) { +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); +#endif + case IPPROTO_ICMP: + return icmp_pkt_to_tuple(skb, dataoff, net, tuple); +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: + return gre_pkt_to_tuple(skb, dataoff, net, tuple); +#endif + case IPPROTO_TCP: + case IPPROTO_UDP: /* fallthrough */ + return nf_ct_get_tuple_ports(skb, dataoff, tuple); +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + case IPPROTO_UDPLITE: + return nf_ct_get_tuple_ports(skb, dataoff, tuple); +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + case IPPROTO_SCTP: + return nf_ct_get_tuple_ports(skb, dataoff, tuple); +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP + case IPPROTO_DCCP: + return nf_ct_get_tuple_ports(skb, dataoff, tuple); +#endif + default: + break; + } - tuple->src.u.udp.port = inet_hdr->sport; - tuple->dst.u.udp.port = inet_hdr->dport; return true; } @@ -366,33 +399,20 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l4proto *l4proto; u8 protonum; int protoff; - int ret; - - rcu_read_lock(); protoff = get_l4proto(skb, nhoff, l3num, &protonum); - if (protoff <= 0) { - rcu_read_unlock(); + if (protoff <= 0) return false; - } - - l4proto = __nf_ct_l4proto_find(protonum); - ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, - l4proto); - - rcu_read_unlock(); - return ret; + return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_tuple *orig) { memset(inverse, 0, sizeof(*inverse)); @@ -415,8 +435,14 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, inverse->dst.protonum = orig->dst.protonum; - if (unlikely(l4proto->invert_tuple)) - return l4proto->invert_tuple(inverse, orig); + switch (orig->dst.protonum) { + case IPPROTO_ICMP: + return nf_conntrack_invert_icmp_tuple(inverse, orig); +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + return nf_conntrack_invert_icmpv6_tuple(inverse, orig); +#endif + } inverse->src.u.all = orig->dst.u.all; inverse->dst.u.all = orig->src.u.all; @@ -526,11 +552,20 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl) } EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); +static void destroy_gre_conntrack(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CT_PROTO_GRE + struct nf_conn *master = ct->master; + + if (master) + nf_ct_gre_keymap_destroy(master); +#endif +} + static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - const struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); WARN_ON(atomic_read(&nfct->use) != 0); @@ -539,9 +574,9 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_tmpl_free(ct); return; } - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); - if (l4proto->destroy) - l4proto->destroy(ct); + + if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) + destroy_gre_conntrack(ct); local_bh_disable(); /* Expectations will have been removed in clean_from_lists, @@ -840,7 +875,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, enum ip_conntrack_info oldinfo; struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->allow_clash && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { @@ -1112,7 +1147,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1342,7 +1377,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) { @@ -1355,7 +1389,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; - if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { + if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { pr_debug("Can't invert tuple.\n"); return NULL; } @@ -1437,7 +1471,6 @@ resolve_normal_ct(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t protonum, - const struct nf_conntrack_l4proto *l4proto, const struct nf_hook_state *state) { const struct nf_conntrack_zone *zone; @@ -1450,7 +1483,7 @@ resolve_normal_ct(struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, - &tuple, l4proto)) { + &tuple)) { pr_debug("Can't get tuple\n"); return 0; } @@ -1460,7 +1493,7 @@ resolve_normal_ct(struct nf_conn *tmpl, hash = hash_conntrack_raw(&tuple, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); if (!h) { - h = init_conntrack(state->net, tmpl, &tuple, l4proto, + h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); if (!h) return 0; @@ -1522,10 +1555,66 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl, return ret; } +static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + const unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; + + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + return NF_ACCEPT; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int nf_conntrack_handle_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) +{ + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + return nf_conntrack_tcp_packet(ct, skb, dataoff, + ctinfo, state); + case IPPROTO_UDP: + return nf_conntrack_udp_packet(ct, skb, dataoff, + ctinfo, state); + case IPPROTO_ICMP: + return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + case IPPROTO_UDPLITE: + return nf_conntrack_udplite_packet(ct, skb, dataoff, + ctinfo, state); +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + case IPPROTO_SCTP: + return nf_conntrack_sctp_packet(ct, skb, dataoff, + ctinfo, state); +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP + case IPPROTO_DCCP: + return nf_conntrack_dccp_packet(ct, skb, dataoff, + ctinfo, state); +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: + return nf_conntrack_gre_packet(ct, skb, dataoff, + ctinfo, state); +#endif + } + + return generic_packet(ct, skb, ctinfo); +} + unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { - const struct nf_conntrack_l4proto *l4proto; enum ip_conntrack_info ctinfo; struct nf_conn *ct, *tmpl; u_int8_t protonum; @@ -1552,8 +1641,6 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) goto out; } - l4proto = __nf_ct_l4proto_find(protonum); - if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, protonum, state); @@ -1567,7 +1654,7 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) } repeat: ret = resolve_normal_ct(tmpl, skb, dataoff, - protonum, l4proto, state); + protonum, state); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(state->net, drop); @@ -1583,7 +1670,7 @@ repeat: goto out; } - ret = l4proto->packet(ct, skb, dataoff, ctinfo, state); + ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ @@ -1614,19 +1701,6 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_in); -bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig) -{ - bool ret; - - rcu_read_lock(); - ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l4proto_find(orig->dst.protonum)); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr); - /* Alter reply tuple (maybe alter helper). This is for NAT, and is implicitly racy: see __nf_conntrack_confirm */ void nf_conntrack_alter_reply(struct nf_conn *ct, @@ -1757,7 +1831,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { - const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; @@ -1778,10 +1851,8 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) if (dataoff <= 0) return -1; - l4proto = nf_ct_l4proto_find_get(l4num); - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple, l4proto)) + l4num, net, &tuple)) return -1; if (ct->status & IPS_SRC_NAT) { @@ -2413,15 +2484,10 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); nf_conntrack_helper_pernet_init(net); + nf_conntrack_proto_pernet_init(net); - ret = nf_conntrack_proto_pernet_init(net); - if (ret < 0) - goto err_proto; return 0; -err_proto: - nf_conntrack_ecache_pernet_fini(net); - nf_conntrack_expect_pernet_fini(net); err_expect: free_percpu(net->ct.stat); err_pcpu_lists: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 3034038bfdf0..334d6e5b7762 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,7 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l4proto_find(expect->tuple.dst.protonum)); + nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1213beb5a714..8071bb04a849 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -134,7 +134,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb, ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); @@ -182,7 +182,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) struct nlattr *nest_proto; int ret; - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; @@ -590,7 +590,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); @@ -1059,7 +1059,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_PROTO_MAX, @@ -1722,11 +1722,9 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, if (err < 0) return err; - rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); - rcu_read_unlock(); return err; } @@ -2676,7 +2674,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); } rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 11562f2a08bb..976f1dcb97f0 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -121,7 +121,7 @@ static void pptp_expectfn(struct nf_conn *ct, struct nf_conntrack_expect *exp_other; /* obviously this tuple inversion only works until you do NAT */ - nf_ct_invert_tuplepr(&inv_t, &exp->tuple); + nf_ct_invert_tuple(&inv_t, &exp->tuple); pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple(&inv_t); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 859f5d07a915..b9403a266a2e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -43,40 +43,9 @@ extern unsigned int nf_conntrack_net_id; -static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly; - static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL -static int -nf_ct_register_sysctl(struct net *net, - struct ctl_table_header **header, - const char *path, - struct ctl_table *table) -{ - if (*header == NULL) { - *header = register_net_sysctl(net, path, table); - if (*header == NULL) - return -ENOMEM; - } - - return 0; -} - -static void -nf_ct_unregister_sysctl(struct ctl_table_header **header, - struct ctl_table **table, - unsigned int users) -{ - if (users > 0) - return; - - unregister_net_sysctl_table(*header); - kfree(*table); - *header = NULL; - *table = NULL; -} - __printf(5, 6) void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, @@ -124,295 +93,82 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif -const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto) -{ - if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos))) - return &nf_conntrack_l4proto_generic; - - return rcu_dereference(nf_ct_protos[l4proto]); -} -EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); - -const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num) -{ - const struct nf_conntrack_l4proto *p; - - rcu_read_lock(); - p = __nf_ct_l4proto_find(l4num); - if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); - -void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) -{ - module_put(p->me); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); - -static int kill_l4proto(struct nf_conn *i, void *data) -{ - const struct nf_conntrack_l4proto *l4proto; - l4proto = data; - return nf_ct_protonum(i) == l4proto->l4proto; -} - -static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, - const struct nf_conntrack_l4proto *l4proto) -{ - if (l4proto->get_net_proto) { - /* statically built-in protocols use static per-net */ - return l4proto->get_net_proto(net); - } else if (l4proto->net_id) { - /* ... and loadable protocols use dynamic per-net */ - return net_generic(net, *l4proto->net_id); - } - return NULL; -} - -static -int nf_ct_l4proto_register_sysctl(struct net *net, - struct nf_proto_net *pn) -{ - int err = 0; - -#ifdef CONFIG_SYSCTL - if (pn->ctl_table != NULL) { - err = nf_ct_register_sysctl(net, - &pn->ctl_table_header, - "net/netfilter", - pn->ctl_table); - if (err < 0) { - if (!pn->users) { - kfree(pn->ctl_table); - pn->ctl_table = NULL; - } - } - } -#endif /* CONFIG_SYSCTL */ - return err; -} - -static -void nf_ct_l4proto_unregister_sysctl(struct nf_proto_net *pn) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table_header != NULL) - nf_ct_unregister_sysctl(&pn->ctl_table_header, - &pn->ctl_table, - pn->users); -#endif /* CONFIG_SYSCTL */ -} - -/* FIXME: Allow NULL functions and sub in pointers to generic for - them. --RR */ -int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto) -{ - int ret = 0; - - if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) || - (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) - return -EINVAL; - - mutex_lock(&nf_ct_proto_mutex); - if (rcu_dereference_protected( - nf_ct_protos[l4proto->l4proto], - lockdep_is_held(&nf_ct_proto_mutex) - ) != &nf_conntrack_l4proto_generic) { - ret = -EBUSY; - goto out_unlock; - } - - rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto); -out_unlock: - mutex_unlock(&nf_ct_proto_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one); - -int nf_ct_l4proto_pernet_register_one(struct net *net, - const struct nf_conntrack_l4proto *l4proto) -{ - int ret = 0; - struct nf_proto_net *pn = NULL; - - if (l4proto->init_net) { - ret = l4proto->init_net(net); - if (ret < 0) - goto out; - } - - pn = nf_ct_l4proto_net(net, l4proto); - if (pn == NULL) - goto out; - - ret = nf_ct_l4proto_register_sysctl(net, pn); - if (ret < 0) - goto out; - - pn->users++; -out: - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); - -static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) - -{ - BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos)); - - BUG_ON(rcu_dereference_protected( - nf_ct_protos[l4proto->l4proto], - lockdep_is_held(&nf_ct_proto_mutex) - ) != l4proto); - rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], - &nf_conntrack_l4proto_generic); -} - -void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) -{ - mutex_lock(&nf_ct_proto_mutex); - __nf_ct_l4proto_unregister_one(l4proto); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_net(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); - -void nf_ct_l4proto_pernet_unregister_one(struct net *net, - const struct nf_conntrack_l4proto *l4proto) -{ - struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto); - - if (pn == NULL) - return; - - pn->users--; - nf_ct_l4proto_unregister_sysctl(pn); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); - -static void -nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) -{ - int i; - - mutex_lock(&nf_ct_proto_mutex); - for (i = 0; i < num_proto; i++) - __nf_ct_l4proto_unregister_one(l4proto[i]); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_net(); - - for (i = 0; i < num_proto; i++) - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]); -} - -static int -nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) -{ - int ret = -EINVAL; - unsigned int i; - - for (i = 0; i < num_proto; i++) { - ret = nf_ct_l4proto_register_one(l4proto[i]); - if (ret < 0) - break; - } - if (i != num_proto) { - pr_err("nf_conntrack: can't register l4 %d proto.\n", - l4proto[i]->l4proto); - nf_ct_l4proto_unregister(l4proto, i); - } - return ret; -} - -int nf_ct_l4proto_pernet_register(struct net *net, - const struct nf_conntrack_l4proto *const l4proto[], - unsigned int num_proto) +const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) { - int ret = -EINVAL; - unsigned int i; - - for (i = 0; i < num_proto; i++) { - ret = nf_ct_l4proto_pernet_register_one(net, l4proto[i]); - if (ret < 0) - break; - } - if (i != num_proto) { - pr_err("nf_conntrack %d: pernet registration failed\n", - l4proto[i]->l4proto); - nf_ct_l4proto_pernet_unregister(net, l4proto, i); + switch (l4proto) { + case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; + case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; + case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; +#ifdef CONFIG_NF_CT_PROTO_DCCP + case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + case IPPROTO_UDPLITE: return &nf_conntrack_l4proto_udplite; +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: return &nf_conntrack_l4proto_gre; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6; +#endif /* CONFIG_IPV6 */ } - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -void nf_ct_l4proto_pernet_unregister(struct net *net, - const struct nf_conntrack_l4proto *const l4proto[], - unsigned int num_proto) -{ - while (num_proto-- != 0) - nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); + return &nf_conntrack_l4proto_generic; +}; +EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) +static unsigned int nf_confirm(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; help = nfct_help(ct); - if (!help) - return NF_ACCEPT; + if (help) { + const struct nf_conntrack_helper *helper; + int ret; + + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (helper) { + ret = helper->help(skb, + protoff, + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + } - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); } static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - struct nf_conn *ct; enum ip_conntrack_info ctinfo; + struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; + return nf_conntrack_confirm(skb); - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); + return nf_confirm(skb, + skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); } static unsigned int ipv4_conntrack_in(void *priv, @@ -461,24 +217,12 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, @@ -623,31 +367,21 @@ static unsigned int ipv6_confirm(void *priv, struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned char pnum = ipv6_hdr(skb)->nexthdr; - int protoff; __be16 frag_off; + int protoff; ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; + return nf_conntrack_confirm(skb); protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); - goto out; + return nf_conntrack_confirm(skb); } - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); + return nf_confirm(skb, protoff, ct, ctinfo); } static unsigned int ipv6_conntrack_in(void *priv, @@ -664,42 +398,6 @@ static unsigned int ipv6_conntrack_local(void *priv, return nf_conntrack_in(skb, state); } -static unsigned int ipv6_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - enum ip_conntrack_info ctinfo; - __be16 frag_off; - int protoff; - u8 nexthdr; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - - return helper->help(skb, protoff, ct, ctinfo); -} - static const struct nf_hook_ops ipv6_conntrack_ops[] = { { .hook = ipv6_conntrack_in, @@ -714,24 +412,12 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { .hook = ipv6_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { .hook = ipv6_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, @@ -874,27 +560,9 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) } EXPORT_SYMBOL_GPL(nf_ct_netns_put); -static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { - &nf_conntrack_l4proto_tcp, - &nf_conntrack_l4proto_udp, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite, -#endif -#if IS_ENABLED(CONFIG_IPV6) - &nf_conntrack_l4proto_icmpv6, -#endif /* CONFIG_IPV6 */ -}; - int nf_conntrack_proto_init(void) { - int ret = 0, i; + int ret; ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) @@ -906,18 +574,8 @@ int nf_conntrack_proto_init(void) goto cleanup_sockopt; #endif - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - RCU_INIT_POINTER(nf_ct_protos[i], - &nf_conntrack_l4proto_generic); - - ret = nf_ct_l4proto_register(builtin_l4proto, - ARRAY_SIZE(builtin_l4proto)); - if (ret < 0) - goto cleanup_sockopt2; - return ret; -cleanup_sockopt2: - nf_unregister_sockopt(&so_getorigdst); + #if IS_ENABLED(CONFIG_IPV6) cleanup_sockopt: nf_unregister_sockopt(&so_getorigdst6); @@ -933,43 +591,33 @@ void nf_conntrack_proto_fini(void) #endif } -int nf_conntrack_proto_pernet_init(struct net *net) +void nf_conntrack_proto_pernet_init(struct net *net) { - int err; - struct nf_proto_net *pn = nf_ct_l4proto_net(net, - &nf_conntrack_l4proto_generic); - - err = nf_conntrack_l4proto_generic.init_net(net); - if (err < 0) - return err; - err = nf_ct_l4proto_register_sysctl(net, - pn); - if (err < 0) - return err; - - err = nf_ct_l4proto_pernet_register(net, builtin_l4proto, - ARRAY_SIZE(builtin_l4proto)); - if (err < 0) { - nf_ct_l4proto_unregister_sysctl(pn); - return err; - } - - pn->users++; - return 0; + nf_conntrack_generic_init_net(net); + nf_conntrack_udp_init_net(net); + nf_conntrack_tcp_init_net(net); + nf_conntrack_icmp_init_net(net); +#if IS_ENABLED(CONFIG_IPV6) + nf_conntrack_icmpv6_init_net(net); +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP + nf_conntrack_dccp_init_net(net); +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + nf_conntrack_sctp_init_net(net); +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE + nf_conntrack_gre_init_net(net); +#endif } void nf_conntrack_proto_pernet_fini(struct net *net) { - struct nf_proto_net *pn = nf_ct_l4proto_net(net, - &nf_conntrack_l4proto_generic); - - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto, - ARRAY_SIZE(builtin_l4proto)); - pn->users--; - nf_ct_l4proto_unregister_sysctl(pn); +#ifdef CONFIG_NF_CT_PROTO_GRE + nf_ct_gre_keymap_flush(net); +#endif } - module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 023c1445bc39..6fca80587505 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -472,9 +472,10 @@ out_invalid: return true; } -static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; @@ -723,123 +724,28 @@ dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -/* template, data assigned later */ -static struct ctl_table dccp_sysctl_table[] = { - { - .procname = "nf_conntrack_dccp_timeout_request", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_respond", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_partopen", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_open", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_closereq", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_closing", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_timeout_timewait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, - struct nf_dccp_net *dn) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table) - return 0; - - pn->ctl_table = kmemdup(dccp_sysctl_table, - sizeof(dccp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; - pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; - pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; - pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; - pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; - pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; - pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; - pn->ctl_table[7].data = &dn->dccp_loose; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - pn->ctl_table[0].procname = NULL; -#endif - return 0; -} - -static int dccp_init_net(struct net *net) +void nf_conntrack_dccp_init_net(struct net *net) { struct nf_dccp_net *dn = nf_dccp_pernet(net); - struct nf_proto_net *pn = &dn->pn; - - if (!pn->users) { - /* default values */ - dn->dccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. - */ - dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; - } - return dccp_kmemdup_sysctl_table(net, pn, dn); -} - -static struct nf_proto_net *dccp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp.pn; + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + + /* timeouts[0] is unused, make it same as SYN_SENT so + * ->timeouts[0] contains 'new' timeout, like udp or icmp. + */ + dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .l4proto = IPPROTO_DCCP, - .packet = dccp_packet, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, @@ -862,6 +768,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .nla_policy = dccp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = dccp_init_net, - .get_net_proto = dccp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 5da19d5fbc76..0f526fafecae 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -15,50 +15,6 @@ static const unsigned int nf_ct_generic_timeout = 600*HZ; -static bool nf_generic_should_process(u8 proto) -{ - switch (proto) { -#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE - case IPPROTO_GRE: - return false; -#endif - default: - return true; - } -} - -static bool generic_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return true; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int generic_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - const unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!nf_generic_should_process(nf_ct_protonum(ct))) { - pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", - nf_ct_protonum(ct)); - return -NF_ACCEPT; - } - - if (!timeout) - timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; - - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - return NF_ACCEPT; -} - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> @@ -104,53 +60,16 @@ generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table generic_sysctl_table[] = { - { - .procname = "nf_conntrack_generic_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_generic_net *gn) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(generic_sysctl_table, - sizeof(generic_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &gn->timeout; -#endif - return 0; -} - -static int generic_init_net(struct net *net) +void nf_conntrack_generic_init_net(struct net *net) { struct nf_generic_net *gn = nf_generic_pernet(net); - struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; - - return generic_kmemdup_sysctl_table(pn, gn); -} - -static struct nf_proto_net *generic_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.generic.pn; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l4proto = 255, - .pkt_to_tuple = generic_pkt_to_tuple, - .packet = generic_packet, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, @@ -160,6 +79,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .nla_policy = generic_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = generic_init_net, - .get_net_proto = generic_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 8899b51aad44..ee9ab10a32e4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -48,24 +48,25 @@ static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_REPLIED] = 180*HZ, }; -static unsigned int proto_gre_net_id __read_mostly; +/* used when expectation is added */ +static DEFINE_SPINLOCK(keymap_lock); -static inline struct netns_proto_gre *gre_pernet(struct net *net) +static inline struct nf_gre_net *gre_pernet(struct net *net) { - return net_generic(net, proto_gre_net_id); + return &net->ct.nf_ct_proto.gre; } -static void nf_ct_gre_keymap_flush(struct net *net) +void nf_ct_gre_keymap_flush(struct net *net) { - struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; - write_lock_bh(&net_gre->keymap_lock); + spin_lock_bh(&keymap_lock); list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { - list_del(&km->list); - kfree(km); + list_del_rcu(&km->list); + kfree_rcu(km, rcu); } - write_unlock_bh(&net_gre->keymap_lock); + spin_unlock_bh(&keymap_lock); } static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, @@ -81,18 +82,16 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, /* look up the source key for a given tuple */ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { - struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km; __be16 key = 0; - read_lock_bh(&net_gre->keymap_lock); - list_for_each_entry(km, &net_gre->keymap_list, list) { + list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t)) { key = km->tuple.src.u.gre.key; break; } } - read_unlock_bh(&net_gre->keymap_lock); pr_debug("lookup src key 0x%x for ", key); nf_ct_dump_tuple(t); @@ -105,21 +104,17 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { struct net *net = nf_ct_net(ct); - struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_ct_gre_keymap **kmp, *km; kmp = &ct_pptp_info->keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ - read_lock_bh(&net_gre->keymap_lock); - list_for_each_entry(km, &net_gre->keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) { - read_unlock_bh(&net_gre->keymap_lock); + list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t) && km == *kmp) return 0; - } } - read_unlock_bh(&net_gre->keymap_lock); pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; @@ -134,9 +129,9 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, pr_debug("adding new entry %p: ", km); nf_ct_dump_tuple(&km->tuple); - write_lock_bh(&net_gre->keymap_lock); + spin_lock_bh(&keymap_lock); list_add_tail(&km->list, &net_gre->keymap_list); - write_unlock_bh(&net_gre->keymap_lock); + spin_unlock_bh(&keymap_lock); return 0; } @@ -145,32 +140,30 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); /* destroy the keymap entries associated with specified master ct */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); enum ip_conntrack_dir dir; pr_debug("entering for ct %p\n", ct); - write_lock_bh(&net_gre->keymap_lock); + spin_lock_bh(&keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { if (ct_pptp_info->keymap[dir]) { pr_debug("removing %p from list\n", ct_pptp_info->keymap[dir]); - list_del(&ct_pptp_info->keymap[dir]->list); - kfree(ct_pptp_info->keymap[dir]); + list_del_rcu(&ct_pptp_info->keymap[dir]->list); + kfree_rcu(ct_pptp_info->keymap[dir], rcu); ct_pptp_info->keymap[dir] = NULL; } } - write_unlock_bh(&net_gre->keymap_lock); + spin_unlock_bh(&keymap_lock); } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ /* gre hdr info to tuple */ -static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) +bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct pptp_gre_header *pgrehdr; struct pptp_gre_header _pgrehdr; @@ -216,15 +209,15 @@ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) static unsigned int *gre_get_timeouts(struct net *net) { - return gre_pernet(net)->gre_timeouts; + return gre_pernet(net)->timeouts; } /* Returns verdict for packet, and may modify conntrack */ -static int gre_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_gre_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { if (state->pf != NFPROTO_IPV4) return -NF_ACCEPT; @@ -256,19 +249,6 @@ static int gre_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a conntrack entry has already been removed from the hashes - * and is about to be deleted from memory */ -static void gre_destroy(struct nf_conn *ct) -{ - struct nf_conn *master = ct->master; - pr_debug(" entering\n"); - - if (!master) - pr_debug("no master !?!\n"); - else - nf_ct_gre_keymap_destroy(master); -} - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> @@ -278,13 +258,13 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct netns_proto_gre *net_gre = gre_pernet(net); + struct nf_gre_net *net_gre = gre_pernet(net); if (!timeouts) timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. */ - timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; - timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; + timeouts[GRE_CT_UNREPLIED] = net_gre->timeouts[GRE_CT_UNREPLIED]; + timeouts[GRE_CT_REPLIED] = net_gre->timeouts[GRE_CT_REPLIED]; if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { timeouts[GRE_CT_UNREPLIED] = @@ -320,69 +300,22 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table gre_sysctl_table[] = { - { - .procname = "nf_conntrack_gre_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_gre_timeout_stream", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - {} -}; -#endif - -static int gre_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *nf, - struct netns_proto_gre *net_gre) -{ -#ifdef CONFIG_SYSCTL - int i; - - if (nf->ctl_table) - return 0; - - nf->ctl_table = kmemdup(gre_sysctl_table, - sizeof(gre_sysctl_table), - GFP_KERNEL); - if (!nf->ctl_table) - return -ENOMEM; - - for (i = 0; i < GRE_CT_MAX; i++) - nf->ctl_table[i].data = &net_gre->gre_timeouts[i]; -#endif - return 0; -} - -static int gre_init_net(struct net *net) +void nf_conntrack_gre_init_net(struct net *net) { - struct netns_proto_gre *net_gre = gre_pernet(net); - struct nf_proto_net *nf = &net_gre->nf; + struct nf_gre_net *net_gre = gre_pernet(net); int i; - rwlock_init(&net_gre->keymap_lock); INIT_LIST_HEAD(&net_gre->keymap_list); for (i = 0; i < GRE_CT_MAX; i++) - net_gre->gre_timeouts[i] = gre_timeouts[i]; - - return gre_kmemdup_sysctl_table(net, nf, net_gre); + net_gre->timeouts[i] = gre_timeouts[i]; } /* protocol helper struct */ -static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { +const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, - .pkt_to_tuple = gre_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif - .packet = gre_packet, - .destroy = gre_destroy, - .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, @@ -398,61 +331,4 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .nla_policy = gre_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .net_id = &proto_gre_net_id, - .init_net = gre_init_net, }; - -static int proto_gre_net_init(struct net *net) -{ - int ret = 0; - - ret = nf_ct_l4proto_pernet_register_one(net, - &nf_conntrack_l4proto_gre4); - if (ret < 0) - pr_err("nf_conntrack_gre4: pernet registration failed.\n"); - return ret; -} - -static void proto_gre_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister_one(net, &nf_conntrack_l4proto_gre4); - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations proto_gre_net_ops = { - .init = proto_gre_net_init, - .exit = proto_gre_net_exit, - .id = &proto_gre_net_id, - .size = sizeof(struct netns_proto_gre), -}; - -static int __init nf_ct_proto_gre_init(void) -{ - int ret; - - BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0); - - ret = register_pernet_subsys(&proto_gre_net_ops); - if (ret < 0) - goto out_pernet; - ret = nf_ct_l4proto_register_one(&nf_conntrack_l4proto_gre4); - if (ret < 0) - goto out_gre4; - - return 0; -out_gre4: - unregister_pernet_subsys(&proto_gre_net_ops); -out_pernet: - return ret; -} - -static void __exit nf_ct_proto_gre_fini(void) -{ - nf_ct_l4proto_unregister_one(&nf_conntrack_l4proto_gre4); - unregister_pernet_subsys(&proto_gre_net_ops); -} - -module_init(nf_ct_proto_gre_init); -module_exit(nf_ct_proto_gre_fini); - -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index de64d8a5fdfd..7df477996b16 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -25,8 +25,8 @@ static const unsigned int nf_ct_icmp_timeout = 30*HZ; -static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) +bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; @@ -54,8 +54,8 @@ static const u_int8_t invmap[] = { [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; -static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) @@ -68,11 +68,10 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, } /* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_icmp_packet(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic @@ -110,7 +109,6 @@ icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conntrack_tuple innertuple, origtuple; - const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; @@ -128,12 +126,9 @@ icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum); - /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + if (!nf_ct_invert_tuple(&innertuple, &origtuple)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } @@ -303,56 +298,16 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table icmp_sysctl_table[] = { - { - .procname = "nf_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmp_sysctl_table, - sizeof(icmp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmp_init_net(struct net *net) +void nf_conntrack_icmp_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmp_pernet(net); - struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; - - return icmp_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp.pn; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l4proto = IPPROTO_ICMP, - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .packet = icmp_packet, - .destroy = NULL, - .me = NULL, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, @@ -368,6 +323,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = icmp_init_net, - .get_net_proto = icmp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index a15eefb8e317..bec4a3211658 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -30,10 +30,10 @@ static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; -static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) +bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct net *net, + struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; @@ -67,8 +67,8 @@ static const u_int8_t noct_valid_new[] = { [ICMPV6_MLD2_REPORT - 130] = 1 }; -static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) @@ -86,11 +86,10 @@ static unsigned int *icmpv6_get_timeouts(struct net *net) } /* Returns verdict for packet, or -1 for invalid. */ -static int icmpv6_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_icmpv6_packet(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); static const u8 valid_new[] = { @@ -131,7 +130,6 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, { struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_l4proto *inproto; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; @@ -147,12 +145,9 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_thresh */ - inproto = __nf_ct_l4proto_find(origtuple.dst.protonum); - /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { + if (!nf_ct_invert_tuple(&intuple, &origtuple)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } @@ -314,54 +309,16 @@ icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table icmpv6_sysctl_table[] = { - { - .procname = "nf_conntrack_icmpv6_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmpv6_sysctl_table, - sizeof(icmpv6_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmpv6_init_net(struct net *net) +void nf_conntrack_icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = nf_icmpv6_pernet(net); - struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmpv6_timeout; - - return icmpv6_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6.pn; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, - .pkt_to_tuple = icmpv6_pkt_to_tuple, - .invert_tuple = icmpv6_invert_tuple, - .packet = icmpv6_packet, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, @@ -377,6 +334,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .nla_policy = icmpv6_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = icmpv6_init_net, - .get_net_proto = icmpv6_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index d53e3e78f605..a7818101ad80 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -357,11 +357,11 @@ out_invalid: } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ -static int sctp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_sctp_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -642,116 +642,18 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table sctp_sysctl_table[] = { - { - .procname = "nf_conntrack_sctp_timeout_closed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_cookie_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_cookie_echoed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_shutdown_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_shutdown_recd", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif - -static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_sctp_net *sn) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table) - return 0; - - pn->ctl_table = kmemdup(sctp_sysctl_table, - sizeof(sctp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; - pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; - pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; - pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; - pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; - pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; - pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; - pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; - pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; -#endif - return 0; -} - -static int sctp_init_net(struct net *net) +void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); - struct nf_proto_net *pn = &sn->pn; - - if (!pn->users) { - int i; - - for (i = 0; i < SCTP_CONNTRACK_MAX; i++) - sn->timeouts[i] = sctp_timeouts[i]; - - /* timeouts[0] is unused, init it so ->timeouts[0] contains - * 'new' timeout, like udp or icmp. - */ - sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; - } + int i; - return sctp_kmemdup_sysctl_table(pn, sn); -} + for (i = 0; i < SCTP_CONNTRACK_MAX; i++) + sn->timeouts[i] = sctp_timeouts[i]; -static struct nf_proto_net *sctp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.sctp.pn; + /* timeouts[0] is unused, init it so ->timeouts[0] contains + * 'new' timeout, like udp or icmp. + */ + sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { @@ -759,9 +661,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif - .packet = sctp_packet, .can_early_drop = sctp_can_early_drop, - .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, @@ -780,6 +680,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = sctp_init_net, - .get_net_proto = sctp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 4dcbd51a8e97..01c748fa8913 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -829,11 +829,11 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, } /* Returns verdict for packet, or -1 for invalid. */ -static int tcp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_tcp_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); @@ -1387,146 +1387,21 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table tcp_sysctl_table[] = { - { - .procname = "nf_conntrack_tcp_timeout_syn_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_syn_recv", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_fin_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_close_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_last_ack", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_time_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_close", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_timeout_unacknowledged", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_tcp_loose", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nf_conntrack_tcp_be_liberal", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nf_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_tcp_net *tn) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table) - return 0; - - pn->ctl_table = kmemdup(tcp_sysctl_table, - sizeof(tcp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; - pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; - pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; - pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; - pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; - pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; - pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; - pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; - pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; - pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK]; - pn->ctl_table[10].data = &tn->tcp_loose; - pn->ctl_table[11].data = &tn->tcp_be_liberal; - pn->ctl_table[12].data = &tn->tcp_max_retrans; -#endif - return 0; -} - -static int tcp_init_net(struct net *net) +void nf_conntrack_tcp_init_net(struct net *net) { struct nf_tcp_net *tn = nf_tcp_pernet(net); - struct nf_proto_net *pn = &tn->pn; - - if (!pn->users) { - int i; - - for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) - tn->timeouts[i] = tcp_timeouts[i]; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. - */ - tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; - tn->tcp_loose = nf_ct_tcp_loose; - tn->tcp_be_liberal = nf_ct_tcp_be_liberal; - tn->tcp_max_retrans = nf_ct_tcp_max_retrans; - } + int i; - return tcp_kmemdup_sysctl_table(pn, tn); -} + for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) + tn->timeouts[i] = tcp_timeouts[i]; -static struct nf_proto_net *tcp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.tcp.pn; + /* timeouts[0] is unused, make it same as SYN_SENT so + * ->timeouts[0] contains 'new' timeout, like udp or icmp. + */ + tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; + tn->tcp_loose = nf_ct_tcp_loose; + tn->tcp_be_liberal = nf_ct_tcp_be_liberal; + tn->tcp_max_retrans = nf_ct_tcp_max_retrans; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = @@ -1535,7 +1410,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif - .packet = tcp_packet, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, @@ -1556,6 +1430,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = tcp_init_net, - .get_net_proto = tcp_get_net_proto, }; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index b4f5d5e82031..951366dfbec3 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -85,11 +85,11 @@ static bool udp_error(struct sk_buff *skb, } /* Returns verdict for packet, and may modify conntracktype */ -static int udp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_udp_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeouts; @@ -177,11 +177,11 @@ static bool udplite_error(struct sk_buff *skb, } /* Returns verdict for packet, and may modify conntracktype */ -static int udplite_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) +int nf_conntrack_udplite_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeouts; @@ -260,66 +260,19 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -#ifdef CONFIG_SYSCTL -static struct ctl_table udp_sysctl_table[] = { - { - .procname = "nf_conntrack_udp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_udp_timeout_stream", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_udp_net *un) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table) - return 0; - pn->ctl_table = kmemdup(udp_sysctl_table, - sizeof(udp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; - pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED]; -#endif - return 0; -} - -static int udp_init_net(struct net *net) +void nf_conntrack_udp_init_net(struct net *net) { struct nf_udp_net *un = nf_udp_pernet(net); - struct nf_proto_net *pn = &un->pn; + int i; - if (!pn->users) { - int i; - - for (i = 0; i < UDP_CT_MAX; i++) - un->timeouts[i] = udp_timeouts[i]; - } - - return udp_kmemdup_sysctl_table(pn, un); -} - -static struct nf_proto_net *udp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.udp.pn; + for (i = 0; i < UDP_CT_MAX; i++) + un->timeouts[i] = udp_timeouts[i]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { .l4proto = IPPROTO_UDP, .allow_clash = true, - .packet = udp_packet, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -335,8 +288,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, }; #ifdef CONFIG_NF_CT_PROTO_UDPLITE @@ -344,7 +295,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .packet = udplite_packet, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -360,7 +310,5 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, }; #endif diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b6177fd73304..c2ae14c720b4 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -24,6 +24,10 @@ #include <net/netfilter/nf_conntrack_timestamp.h> #include <linux/rculist_nulls.h> +static bool enable_hooks __read_mostly; +MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks"); +module_param(enable_hooks, bool, 0000); + unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS @@ -310,8 +314,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); - WARN_ON(!l4proto); + l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", @@ -547,8 +550,55 @@ enum nf_ct_sysctl_index { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_SYSCTL_CT_TIMESTAMP, #endif + NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, + NF_SYSCTL_CT_PROTO_TCP_LOOSE, + NF_SYSCTL_CT_PROTO_TCP_LIBERAL, + NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, + NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, + NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, +#ifdef CONFIG_NF_CT_PROTO_SCTP + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, + NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, + NF_SYSCTL_CT_PROTO_DCCP_LOOSE, +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE + NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, + NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, +#endif + + __NF_SYSCTL_CT_LAST_SYSCTL, }; +#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) + static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", @@ -626,7 +676,235 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, #endif - { } + [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { + .procname = "nf_conntrack_generic_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = { + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = { + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = { + .procname = "nf_conntrack_tcp_timeout_established", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = { + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = { + .procname = "nf_conntrack_tcp_timeout_close_wait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = { + .procname = "nf_conntrack_tcp_timeout_last_ack", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = { + .procname = "nf_conntrack_tcp_timeout_time_wait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = { + .procname = "nf_conntrack_tcp_timeout_close", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = { + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { + .procname = "nf_conntrack_tcp_loose", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { + .procname = "nf_conntrack_tcp_be_liberal", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { + .procname = "nf_conntrack_tcp_max_retrans", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { + .procname = "nf_conntrack_udp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = { + .procname = "nf_conntrack_udp_timeout_stream", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { + .procname = "nf_conntrack_icmp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = { + .procname = "nf_conntrack_icmpv6_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#ifdef CONFIG_NF_CT_PROTO_SCTP + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = { + .procname = "nf_conntrack_sctp_timeout_closed", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = { + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = { + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = { + .procname = "nf_conntrack_sctp_timeout_established", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = { + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = { + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = { + .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { + .procname = "nf_conntrack_dccp_timeout_request", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { + .procname = "nf_conntrack_dccp_timeout_respond", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { + .procname = "nf_conntrack_dccp_timeout_partopen", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { + .procname = "nf_conntrack_dccp_timeout_open", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { + .procname = "nf_conntrack_dccp_timeout_closereq", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { + .procname = "nf_conntrack_dccp_timeout_closing", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { + .procname = "nf_conntrack_dccp_timeout_timewait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { + .procname = "nf_conntrack_dccp_loose", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE + [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { + .procname = "nf_conntrack_gre_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = { + .procname = "nf_conntrack_gre_timeout_stream", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif + {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -640,14 +918,103 @@ static struct ctl_table nf_ct_netfilter_table[] = { { } }; +static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, + struct ctl_table *table) +{ + struct nf_tcp_net *tn = nf_tcp_pernet(net); + +#define XASSIGN(XNAME, tn) \ + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \ + &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME] + + XASSIGN(SYN_SENT, tn); + XASSIGN(SYN_RECV, tn); + XASSIGN(ESTABLISHED, tn); + XASSIGN(FIN_WAIT, tn); + XASSIGN(CLOSE_WAIT, tn); + XASSIGN(LAST_ACK, tn); + XASSIGN(TIME_WAIT, tn); + XASSIGN(CLOSE, tn); + XASSIGN(RETRANS, tn); + XASSIGN(UNACK, tn); +#undef XASSIGN +#define XASSIGN(XNAME, rval) \ + table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval) + + XASSIGN(LOOSE, &tn->tcp_loose); + XASSIGN(LIBERAL, &tn->tcp_be_liberal); + XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); +#undef XASSIGN +} + +static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, + struct ctl_table *table) +{ +#ifdef CONFIG_NF_CT_PROTO_SCTP + struct nf_sctp_net *sn = nf_sctp_pernet(net); + +#define XASSIGN(XNAME, sn) \ + table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \ + &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME] + + XASSIGN(CLOSED, sn); + XASSIGN(COOKIE_WAIT, sn); + XASSIGN(COOKIE_ECHOED, sn); + XASSIGN(ESTABLISHED, sn); + XASSIGN(SHUTDOWN_SENT, sn); + XASSIGN(SHUTDOWN_RECD, sn); + XASSIGN(SHUTDOWN_ACK_SENT, sn); + XASSIGN(HEARTBEAT_SENT, sn); + XASSIGN(HEARTBEAT_ACKED, sn); +#undef XASSIGN +#endif +} + +static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, + struct ctl_table *table) +{ +#ifdef CONFIG_NF_CT_PROTO_DCCP + struct nf_dccp_net *dn = nf_dccp_pernet(net); + +#define XASSIGN(XNAME, dn) \ + table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ + &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] + + XASSIGN(REQUEST, dn); + XASSIGN(RESPOND, dn); + XASSIGN(PARTOPEN, dn); + XASSIGN(OPEN, dn); + XASSIGN(CLOSEREQ, dn); + XASSIGN(CLOSING, dn); + XASSIGN(TIMEWAIT, dn); +#undef XASSIGN + + table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; +#endif +} + +static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, + struct ctl_table *table) +{ +#ifdef CONFIG_NF_CT_PROTO_GRE + struct nf_gre_net *gn = nf_gre_pernet(net); + + table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED]; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED]; +#endif +} + static int nf_conntrack_standalone_init_sysctl(struct net *net) { + struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; + BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL); + table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) - goto out_kmemdup; + return -ENOMEM; table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; @@ -655,6 +1022,16 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif + table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; + + nf_conntrack_standalone_init_tcp_sysctl(net, table); + nf_conntrack_standalone_init_sctp_sysctl(net, table); + nf_conntrack_standalone_init_dccp_sysctl(net, table); + nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { @@ -680,7 +1057,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) out_unregister_netfilter: kfree(table); -out_kmemdup: return -ENOMEM; } @@ -703,31 +1079,47 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net) } #endif /* CONFIG_SYSCTL */ +static void nf_conntrack_fini_net(struct net *net) +{ + if (enable_hooks) + nf_ct_netns_put(net, NFPROTO_INET); + + nf_conntrack_standalone_fini_proc(net); + nf_conntrack_standalone_fini_sysctl(net); +} + static int nf_conntrack_pernet_init(struct net *net) { int ret; - ret = nf_conntrack_init_net(net); + net->ct.sysctl_checksum = 1; + + ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) - goto out_init; + return ret; ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; - net->ct.sysctl_checksum = 1; - net->ct.sysctl_log_invalid = 0; - ret = nf_conntrack_standalone_init_sysctl(net); + ret = nf_conntrack_init_net(net); if (ret < 0) - goto out_sysctl; + goto out_init_net; + + if (enable_hooks) { + ret = nf_ct_netns_get(net, NFPROTO_INET); + if (ret < 0) + goto out_hooks; + } return 0; -out_sysctl: +out_hooks: + nf_conntrack_cleanup_net(net); +out_init_net: nf_conntrack_standalone_fini_proc(net); out_proc: - nf_conntrack_cleanup_net(net); -out_init: + nf_conntrack_standalone_fini_sysctl(net); return ret; } @@ -735,10 +1127,9 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { struct net *net; - list_for_each_entry(net, net_exit_list, exit_list) { - nf_conntrack_standalone_fini_sysctl(net); - nf_conntrack_standalone_fini_proc(net); - } + list_for_each_entry(net, net_exit_list, exit_list) + nf_conntrack_fini_net(net); + nf_conntrack_cleanup_net_list(net_exit_list); } diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index c0c72ae9df42..7aabfd4b1e50 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -121,7 +121,7 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) if (l4num == IPPROTO_TCP) flow_offload_fixup_tcp(&ct->proto.tcp); - l4proto = __nf_ct_l4proto_find(l4num); + l4proto = nf_ct_l4proto_find(l4num); if (!l4proto) return; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index d159e9e7835b..35e61038ae96 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -146,7 +146,7 @@ hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) } /* Is this tuple already taken? (not by us) */ -int +static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { @@ -158,10 +158,9 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, */ struct nf_conntrack_tuple reply; - nf_ct_invert_tuplepr(&reply, tuple); + nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } -EXPORT_SYMBOL(nf_nat_used_tuple); static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) @@ -253,7 +252,7 @@ find_appropriate_src(struct net *net, net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ - nf_ct_invert_tuplepr(result, + nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; @@ -560,8 +559,8 @@ nf_nat_setup_info(struct nf_conn *ct, * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ - nf_ct_invert_tuplepr(&curr_tuple, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + nf_ct_invert_tuple(&curr_tuple, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); @@ -569,7 +568,7 @@ nf_nat_setup_info(struct nf_conn *ct, struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ - nf_ct_invert_tuplepr(&reply, &new_tuple); + nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ @@ -640,7 +639,7 @@ static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); l3proto = __nf_nat_l3proto_find(target.src.l3num); if (!l3proto->manip_pkt(skb, 0, &target, mtype)) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fb07f6cfc719..e92bedd09cde 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -37,10 +37,16 @@ enum { NFT_VALIDATE_DO, }; +static struct rhltable nft_objname_ht; + static u32 nft_chain_hash(const void *data, u32 len, u32 seed); static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed); static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *); +static u32 nft_objname_hash(const void *data, u32 len, u32 seed); +static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed); +static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *); + static const struct rhashtable_params nft_chain_ht_params = { .head_offset = offsetof(struct nft_chain, rhlhead), .key_offset = offsetof(struct nft_chain, name), @@ -51,6 +57,15 @@ static const struct rhashtable_params nft_chain_ht_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params nft_objname_ht_params = { + .head_offset = offsetof(struct nft_object, rhlhead), + .key_offset = offsetof(struct nft_object, key), + .hashfn = nft_objname_hash, + .obj_hashfn = nft_objname_hash_obj, + .obj_cmpfn = nft_objname_hash_cmp, + .automatic_shrinking = true, +}; + static void nft_validate_state_update(struct net *net, u8 new_validate_state) { switch (net->nft.validate_state) { @@ -814,6 +829,34 @@ static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(chain->name, name); } +static u32 nft_objname_hash(const void *data, u32 len, u32 seed) +{ + const struct nft_object_hash_key *k = data; + + seed ^= hash_ptr(k->table, 32); + + return jhash(k->name, strlen(k->name), seed); +} + +static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct nft_object *obj = data; + + return nft_objname_hash(&obj->key, 0, seed); +} + +static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct nft_object_hash_key *k = arg->key; + const struct nft_object *obj = ptr; + + if (obj->key.table != k->table) + return -1; + + return strcmp(obj->key.name, k->name); +} + static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -1070,7 +1113,7 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) return ERR_PTR(-ENOENT); } -static bool lockdep_commit_lock_is_held(struct net *net) +static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING return lockdep_is_held(&net->nft.commit_mutex); @@ -2565,6 +2608,9 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nlattr *nla); + #define NFT_RULE_MAXEXPRS 128 static int nf_tables_newrule(struct net *net, struct sock *nlsk, @@ -2634,6 +2680,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); } + } else if (nla[NFTA_RULE_POSITION_ID]) { + old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); + return PTR_ERR(old_rule); + } } } @@ -3851,7 +3903,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && nla_put_string(skb, NFTA_SET_ELEM_OBJREF, - (*nft_set_ext_obj(ext))->name) < 0) + (*nft_set_ext_obj(ext))->key.name) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && @@ -4384,7 +4436,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EINVAL; goto err2; } - obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + obj = nft_obj_lookup(ctx->net, ctx->table, + nla[NFTA_SET_ELEM_OBJREF], set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); @@ -4819,18 +4872,36 @@ void nft_unregister_obj(struct nft_object_type *obj_type) } EXPORT_SYMBOL_GPL(nft_unregister_obj); -struct nft_object *nft_obj_lookup(const struct nft_table *table, +struct nft_object *nft_obj_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask) { + struct nft_object_hash_key k = { .table = table }; + char search[NFT_OBJ_MAXNAMELEN]; + struct rhlist_head *tmp, *list; struct nft_object *obj; - list_for_each_entry_rcu(obj, &table->objects, list) { - if (!nla_strcmp(nla, obj->name) && - objtype == obj->ops->type->type && - nft_active_genmask(obj, genmask)) + nla_strlcpy(search, nla, sizeof(search)); + k.name = search; + + WARN_ON_ONCE(!rcu_read_lock_held() && + !lockdep_commit_lock_is_held(net)); + + rcu_read_lock(); + list = rhltable_lookup(&nft_objname_ht, &k, nft_objname_ht_params); + if (!list) + goto out; + + rhl_for_each_entry_rcu(obj, tmp, list, rhlhead) { + if (objtype == obj->ops->type->type && + nft_active_genmask(obj, genmask)) { + rcu_read_unlock(); return obj; + } } +out: + rcu_read_unlock(); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(nft_obj_lookup); @@ -4988,7 +5059,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); if (err != -ENOENT) { @@ -5014,11 +5085,11 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, err = PTR_ERR(obj); goto err1; } - obj->table = table; + obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); - obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); - if (!obj->name) { + obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); + if (!obj->key.name) { err = -ENOMEM; goto err2; } @@ -5027,11 +5098,20 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, if (err < 0) goto err3; + err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, + nft_objname_ht_params); + if (err < 0) + goto err4; + list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; +err4: + /* queued in transaction log */ + INIT_LIST_HEAD(&obj->list); + return err; err3: - kfree(obj->name); + kfree(obj->key.name); err2: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); @@ -5060,7 +5140,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || - nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || + nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) || @@ -5215,7 +5295,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return PTR_ERR(obj); @@ -5246,7 +5326,7 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) obj->ops->destroy(ctx, obj); module_put(obj->ops->type->owner); - kfree(obj->name); + kfree(obj->key.name); kfree(obj); } @@ -5280,7 +5360,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask); } else { attr = nla[NFTA_OBJ_NAME]; - obj = nft_obj_lookup(table, attr, objtype, genmask); + obj = nft_obj_lookup(net, table, attr, objtype, genmask); } if (IS_ERR(obj)) { @@ -5297,7 +5377,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return nft_delobj(&ctx, obj); } -void nft_obj_notify(struct net *net, struct nft_table *table, +void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, int family, int report, gfp_t gfp) { @@ -6404,6 +6484,12 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) nf_tables_commit_chain_free_rules_old(g0); } +static void nft_obj_del(struct nft_object *obj) +{ + rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); + list_del_rcu(&obj->list); +} + static void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -6580,7 +6666,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELOBJ: - list_del_rcu(&nft_trans_obj(trans)->list); + nft_obj_del(nft_trans_obj(trans)); nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), NFT_MSG_DELOBJ); break; @@ -6716,7 +6802,7 @@ static int __nf_tables_abort(struct net *net) break; case NFT_MSG_NEWOBJ: trans->ctx.table->use--; - list_del_rcu(&nft_trans_obj(trans)->list); + nft_obj_del(nft_trans_obj(trans)); break; case NFT_MSG_DELOBJ: trans->ctx.table->use++; @@ -7330,7 +7416,7 @@ static void __nft_release_tables(struct net *net) nft_set_destroy(set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { - list_del(&obj->list); + nft_obj_del(obj); table->use--; nft_obj_destroy(&ctx, obj); } @@ -7392,12 +7478,18 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err3; + err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params); + if (err < 0) + goto err4; + /* must be last */ err = nfnetlink_subsys_register(&nf_tables_subsys); if (err < 0) - goto err4; + goto err5; return err; +err5: + rhltable_destroy(&nft_objname_ht); err4: unregister_netdevice_notifier(&nf_tables_flowtable_notifier); err3: @@ -7417,6 +7509,7 @@ static void __exit nf_tables_module_exit(void) unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_destroy_work); rcu_barrier(); + rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a50500232b0a..2a00aef7b6d4 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -124,14 +124,25 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { +#ifdef CONFIG_RETPOLINE unsigned long e = (unsigned long)expr->ops->eval; - - if (e == (unsigned long)nft_meta_get_eval) - nft_meta_get_eval(expr, regs, pkt); - else if (e == (unsigned long)nft_lookup_eval) - nft_lookup_eval(expr, regs, pkt); - else - expr->ops->eval(expr, regs, pkt); +#define X(e, fun) \ + do { if ((e) == (unsigned long)(fun)) \ + return fun(expr, regs, pkt); } while (0) + + X(e, nft_payload_eval); + X(e, nft_cmp_eval); + X(e, nft_meta_get_eval); + X(e, nft_lookup_eval); + X(e, nft_range_eval); + X(e, nft_immediate_eval); + X(e, nft_byteorder_eval); + X(e, nft_dynset_eval); + X(e, nft_rt_get_eval); + X(e, nft_bitwise_eval); +#undef X +#endif /* CONFIG_RETPOLINE */ + expr->ops->eval(expr, regs, pkt); } unsigned int diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 109b0d27345a..c69b11ca5aad 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -122,7 +122,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return -EBUSY; } - l4proto = nf_ct_l4proto_find_get(l4num); + l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { @@ -152,7 +152,6 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, err: kfree(timeout); err_proto_put: - nf_ct_l4proto_put(l4proto); return ret; } @@ -302,7 +301,6 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); - nf_ct_l4proto_put(timeout->timeout.l4proto); nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { @@ -359,7 +357,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, return -EINVAL; l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l4num); + l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { @@ -372,10 +370,8 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, if (ret < 0) goto err; - nf_ct_l4proto_put(l4proto); return 0; err: - nf_ct_l4proto_put(l4proto); return ret; } @@ -442,7 +438,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l4num); + l4proto = nf_ct_l4proto_find(l4num); err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) @@ -474,12 +470,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE - if (l4proto->net_id) { - struct netns_proto_gre *net_gre; - - net_gre = net_generic(net, *l4proto->net_id); - timeouts = net_gre->gre_timeouts; - } + timeouts = nf_gre_pernet(net)->timeouts; #endif break; case 255: @@ -516,7 +507,6 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, /* this avoids a loop in nfnetlink. */ return ret == -EAGAIN ? -ENOBUFS : ret; err: - nf_ct_l4proto_put(l4proto); return err; } @@ -597,7 +587,6 @@ static void __net_exit cttimeout_net_exit(struct net *net) list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); - nf_ct_l4proto_put(cur->timeout.l4proto); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index fff8073e2a56..2c75b9e0474e 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -25,9 +25,8 @@ struct nft_bitwise { struct nft_data xor; }; -static void nft_bitwise_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_bitwise_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); const u32 *src = ®s->data[priv->sreg]; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 13d4e421a6b3..19dbc34cc75e 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -26,9 +26,9 @@ struct nft_byteorder { u8 size; }; -static void nft_byteorder_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_byteorder_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = ®s->data[priv->sreg]; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 79d48c1d06f4..f9f1fa66a16e 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -24,9 +24,9 @@ struct nft_cmp_expr { enum nft_cmp_ops op:8; }; -static void nft_cmp_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_cmp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); int d; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index a61d7edfc290..1a6b06ce6b5b 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -104,7 +104,7 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static void nft_counter_reset(struct nft_counter_percpu_priv __percpu *priv, +static void nft_counter_reset(struct nft_counter_percpu_priv *priv, struct nft_counter *total) { struct nft_counter *this_cpu; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 586627c361df..7b717fad6cdc 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -870,7 +870,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; - l4proto = nf_ct_l4proto_find_get(l4num); + l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; @@ -902,7 +902,6 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, err_free_timeout: kfree(timeout); err_proto_put: - nf_ct_l4proto_put(l4proto); return ret; } @@ -913,7 +912,6 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nf_ct_timeout *timeout = priv->timeout; nf_ct_untimeout(ctx->net, timeout); - nf_ct_l4proto_put(timeout->l4proto); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 07d4efd3d851..9658493d37d4 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -62,9 +62,8 @@ err1: return NULL; } -static void nft_dynset_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_dynset_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set *set = priv->set; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c2d237144f74..ea658e6c53e3 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -25,7 +25,6 @@ struct nft_jhash { u32 modulus; u32 seed; u32 offset; - struct nft_set *map; }; static void nft_jhash_eval(const struct nft_expr *expr, @@ -42,33 +41,10 @@ static void nft_jhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } -static void nft_jhash_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - const void *data = ®s->data[priv->sreg]; - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = reciprocal_scale(jhash(data, priv->len, priv->seed), - priv->modulus) + priv->offset; - - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - struct nft_symhash { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static void nft_symhash_eval(const struct nft_expr *expr, @@ -84,28 +60,6 @@ static void nft_symhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } -static void nft_symhash_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_symhash *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = reciprocal_scale(__skb_get_hash_symmetric(skb), - priv->modulus) + priv->offset; - - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, @@ -114,9 +68,6 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, [NFTA_HASH_TYPE] = { .type = NLA_U32 }, - [NFTA_HASH_SET_NAME] = { .type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_HASH_SET_ID] = { .type = NLA_U32 }, }; static int nft_jhash_init(const struct nft_ctx *ctx, @@ -166,20 +117,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_jhash_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_jhash_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_HASH_SET_NAME], - tb[NFTA_HASH_SET_ID], genmask); - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_symhash_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -206,20 +143,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_symhash_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_symhash_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_HASH_SET_NAME], - tb[NFTA_HASH_SET_ID], genmask); - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_jhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -247,18 +170,6 @@ nla_put_failure: return -1; } -static int nft_jhash_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_jhash *priv = nft_expr_priv(expr); - - if (nft_jhash_dump(skb, expr) || - nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) - return -1; - - return 0; -} - static int nft_symhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -279,18 +190,6 @@ nla_put_failure: return -1; } -static int nft_symhash_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_symhash *priv = nft_expr_priv(expr); - - if (nft_symhash_dump(skb, expr) || - nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) - return -1; - - return 0; -} - static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, @@ -300,14 +199,6 @@ static const struct nft_expr_ops nft_jhash_ops = { .dump = nft_jhash_dump, }; -static const struct nft_expr_ops nft_jhash_map_ops = { - .type = &nft_hash_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)), - .eval = nft_jhash_map_eval, - .init = nft_jhash_map_init, - .dump = nft_jhash_map_dump, -}; - static const struct nft_expr_ops nft_symhash_ops = { .type = &nft_hash_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), @@ -316,14 +207,6 @@ static const struct nft_expr_ops nft_symhash_ops = { .dump = nft_symhash_dump, }; -static const struct nft_expr_ops nft_symhash_map_ops = { - .type = &nft_hash_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), - .eval = nft_symhash_map_eval, - .init = nft_symhash_map_init, - .dump = nft_symhash_map_dump, -}; - static const struct nft_expr_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -336,12 +219,8 @@ nft_hash_select_ops(const struct nft_ctx *ctx, type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); switch (type) { case NFT_HASH_SYM: - if (tb[NFTA_HASH_SET_NAME]) - return &nft_symhash_map_ops; return &nft_symhash_ops; case NFT_HASH_JENKINS: - if (tb[NFTA_HASH_SET_NAME]) - return &nft_jhash_map_ops; return &nft_jhash_ops; default: break; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 0777a93211e2..3e5ed787b1d4 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -17,9 +17,9 @@ #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> -static void nft_immediate_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_immediate_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6df486c5ebd3..987d2d6ce624 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -244,6 +244,16 @@ void nft_meta_get_eval(const struct nft_expr *expr, strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); return; #endif + case NFT_META_IIFKIND: + if (in == NULL || in->rtnl_link_ops == NULL) + goto err; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (out == NULL || out->rtnl_link_ops == NULL) + goto err; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; default: WARN_ON(1); goto err; @@ -340,6 +350,8 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, break; case NFT_META_IIFNAME: case NFT_META_OIFNAME: + case NFT_META_IIFKIND: + case NFT_META_OIFKIND: len = IFNAMSIZ; break; case NFT_META_PRANDOM: diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index a3185ca2a3a9..c1f2adf198a0 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -38,7 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); - obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + obj = nft_obj_lookup(ctx->net, ctx->table, + tb[NFTA_OBJREF_IMM_NAME], objtype, genmask); if (IS_ERR(obj)) return -ENOENT; @@ -53,7 +54,7 @@ static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_object *obj = nft_objref_priv(expr); - if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) || + if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->ops->type->type))) goto nla_put_failure; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index e110b0ebbf58..54e15de4b79a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -70,9 +70,9 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } -static void nft_payload_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_payload_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0ed124a93fcf..354cde67bca9 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -61,7 +61,7 @@ static void nft_quota_obj_eval(struct nft_object *obj, if (overquota && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) - nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0, + nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); } diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index cedb96c3619f..529ac8acb19d 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -23,9 +23,8 @@ struct nft_range_expr { enum nft_range_ops op:8; }; -static void nft_range_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_range_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_range_expr *priv = nft_expr_priv(expr); int d1, d2; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index f35fa33913ae..c48daed5c46b 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -53,9 +53,9 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb return mtu - minlen; } -static void nft_rt_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_rt_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_rt *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index e8da9a9bba73..55af9f247993 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -180,6 +180,25 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, } EXPORT_SYMBOL_GPL(nf_route); +static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +{ +#ifdef CONFIG_INET + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) + return ip_route_me_harder(entry->state.net, skb, + RTN_UNSPEC); + } +#endif + return 0; +} + int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) { const struct nf_ipv6_ops *v6ops; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 2c7a4b80206f..0fa863f57575 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, /* Make sure the timeout policy matches any existing protocol tracker, * otherwise default to generic. */ - l4proto = __nf_ct_l4proto_find(proto); + l4proto = nf_ct_l4proto_find(proto); if (timeout->l4proto->l4proto != l4proto->l4proto) { ret = -EINVAL; pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 4034d70bff39..b2e39cb6a590 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -96,8 +96,7 @@ match_outdev: static int physdev_mt_check(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; - - br_netfilter_enable(); + static bool brnf_probed __read_mostly; if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) @@ -111,6 +110,12 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return -EINVAL; } + + if (!brnf_probed) { + brnf_probed = true; + request_module("br_netfilter"); + } + return 0; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3c023d6120f6..8fa35df94c07 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1371,6 +1371,14 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) } EXPORT_SYMBOL_GPL(netlink_has_listeners); +bool netlink_strict_get_check(struct sk_buff *skb) +{ + const struct netlink_sock *nlk = nlk_sk(NETLINK_CB(skb).sk); + + return nlk->flags & NETLINK_F_STRICT_CHK; +} +EXPORT_SYMBOL_GPL(netlink_strict_get_check); + static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cd94f925495a..35884f836260 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -622,7 +622,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, if (natted) { struct nf_conntrack_tuple inverse; - if (!nf_ct_invert_tuplepr(&inverse, &tuple)) { + if (!nf_ct_invert_tuple(&inverse, &tuple)) { pr_debug("ovs_ct_find_existing: Inversion failed!\n"); return NULL; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index c038e021a591..43849d752a1e 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -206,8 +206,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ - meter = kzalloc(n_bands * sizeof(struct dp_meter_band) + - sizeof(*meter), GFP_KERNEL); + meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL); if (!meter) return ERR_PTR(-ENOMEM); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 6a5dce8baf19..4a57fec6f306 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -18,6 +18,7 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/idr.h> +#include <linux/percpu.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -35,6 +36,7 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; + struct tc_basic_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -46,8 +48,10 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct basic_filter *f; list_for_each_entry_rcu(f, &head->flist, link) { + __this_cpu_inc(f->pf->rcnt); if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; + __this_cpu_inc(f->pf->rhit); *res = f->res; r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) @@ -89,6 +93,7 @@ static void __basic_delete_filter(struct basic_filter *f) tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); + free_percpu(f->pf); kfree(f); } @@ -208,6 +213,11 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; fnew->handle = handle; + fnew->pf = alloc_percpu(struct tc_basic_pcnt); + if (!fnew->pf) { + err = -ENOMEM; + goto errout; + } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, extack); @@ -231,6 +241,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + free_percpu(fnew->pf); tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; @@ -265,8 +276,10 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl) static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { + struct tc_basic_pcnt gpf = {}; struct basic_filter *f = fh; struct nlattr *nest; + int cpu; if (f == NULL) return skb->len; @@ -281,6 +294,18 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu); + + gpf.rcnt += pf->rcnt; + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_BASIC_PCNT, + sizeof(struct tc_basic_pcnt), + &gpf, TCA_BASIC_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 0e408ee9dcec..a1b803fd372e 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/percpu.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> @@ -22,6 +23,7 @@ struct cls_mall_head { u32 handle; u32 flags; unsigned int in_hw_count; + struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -34,6 +36,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; *res = head->res; + __this_cpu_inc(head->pf->rhit); return tcf_exts_exec(skb, &head->exts, res); } @@ -46,6 +49,7 @@ static void __mall_destroy(struct cls_mall_head *head) { tcf_exts_destroy(&head->exts); tcf_exts_put_net(&head->exts); + free_percpu(head->pf); kfree(head); } @@ -192,6 +196,11 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, handle = 1; new->handle = handle; new->flags = flags; + new->pf = alloc_percpu(struct tc_matchall_pcnt); + if (!new->pf) { + err = -ENOMEM; + goto err_alloc_percpu; + } err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, extack); @@ -214,6 +223,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, err_replace_hw_filter: err_set_parms: + free_percpu(new->pf); +err_alloc_percpu: tcf_exts_destroy(&new->exts); err_exts_init: kfree(new); @@ -270,8 +281,10 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { + struct tc_matchall_pcnt gpf = {}; struct cls_mall_head *head = fh; struct nlattr *nest; + int cpu; if (!head) return skb->len; @@ -289,6 +302,17 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu); + + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_MATCHALL_PCNT, + sizeof(struct tc_matchall_pcnt), + &gpf, TCA_MATCHALL_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &head->exts)) goto nla_put_failure; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 7e4d1ccf4c87..03e26e8d0ec9 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -758,8 +758,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev) return 0; } -void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, - unsigned int len) +void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 5df9d1138ac9..cd78253de31d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -556,10 +556,11 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); * Call all network notifier blocks. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, - struct switchdev_notifier_info *info) + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack) { info->dev = dev; - info->extack = NULL; + info->extack = extack; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); diff --git a/net/tipc/link.c b/net/tipc/link.c index 2792a3cae682..ac306d17f8ad 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1126,7 +1126,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, skb_queue_tail(mc_inputq, skb); return true; } - /* else: fall through */ + /* fall through */ case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 1217c90a363b..8fc5acd4820d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -735,7 +735,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= EPOLLOUT; - /* fall thru' */ + /* fall through */ case TIPC_LISTEN: if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; @@ -2416,7 +2416,7 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, * case is EINPROGRESS, rather than EALREADY. */ res = -EINPROGRESS; - /* fall thru' */ + /* fall through */ case TIPC_CONNECTING: if (!timeout) { if (previous == TIPC_CONNECTING) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index a457c0fbbef1..4a708a4e8583 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -60,7 +60,6 @@ * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue - * @max_rcvbuf_size: maximum permitted receive message length * @listener: topsrv listener socket * @name: server name */ @@ -72,7 +71,6 @@ struct tipc_topsrv { struct work_struct awork; struct workqueue_struct *rcv_wq; struct workqueue_struct *send_wq; - int max_rcvbuf_size; struct socket *listener; char name[TIPC_SERVER_NAME_LEN]; }; @@ -648,7 +646,6 @@ int tipc_topsrv_start(struct net *net) return -ENOMEM; srv->net = net; - srv->max_rcvbuf_size = sizeof(struct tipc_subscr); INIT_WORK(&srv->awork, tipc_topsrv_accept); strscpy(srv->name, name, sizeof(srv->name)); diff --git a/net/tipc/trace.c b/net/tipc/trace.c index 964823841efe..265f6a26aa3d 100644 --- a/net/tipc/trace.c +++ b/net/tipc/trace.c @@ -111,7 +111,7 @@ int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf) break; default: break; - }; + } i += scnprintf(buf + i, sz - i, " | %u", msg_src_droppable(hdr)); i += scnprintf(buf + i, sz - i, " %u", @@ -122,7 +122,7 @@ int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf) default: /* need more? */ break; - }; + } i += scnprintf(buf + i, sz - i, "\n"); if (!more) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 78cb4a584080..d36d095cbcf0 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -61,6 +61,8 @@ static LIST_HEAD(device_list); static DEFINE_SPINLOCK(device_spinlock); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; +static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], + struct proto *base); static void update_sk_prot(struct sock *sk, struct tls_context *ctx) { @@ -264,8 +266,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) || - (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) { + if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) + goto skip_tx_cleanup; + + if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) { free_ctx = true; goto skip_tx_cleanup; } @@ -551,6 +555,43 @@ static struct tls_context *create_ctx(struct sock *sk) return ctx; } +static void tls_build_proto(struct sock *sk) +{ + int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; + + /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ + if (ip_ver == TLSV6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + mutex_lock(&tcpv6_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(tls_prots[TLSV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + mutex_unlock(&tcpv6_prot_mutex); + } + + if (ip_ver == TLSV4 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) { + mutex_lock(&tcpv4_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv4_prot)) { + build_protos(tls_prots[TLSV4], sk->sk_prot); + smp_store_release(&saved_tcpv4_prot, sk->sk_prot); + } + mutex_unlock(&tcpv4_prot_mutex); + } +} + +static void tls_hw_sk_destruct(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + ctx->sk_destruct(sk); + /* Free ctx */ + kfree(ctx); + icsk->icsk_ulp_data = NULL; +} + static int tls_hw_prot(struct sock *sk) { struct tls_context *ctx; @@ -564,12 +605,17 @@ static int tls_hw_prot(struct sock *sk) if (!ctx) goto out; + spin_unlock_bh(&device_spinlock); + tls_build_proto(sk); ctx->hash = sk->sk_prot->hash; ctx->unhash = sk->sk_prot->unhash; ctx->sk_proto_close = sk->sk_prot->close; + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_hw_sk_destruct; ctx->rx_conf = TLS_HW_RECORD; ctx->tx_conf = TLS_HW_RECORD; update_sk_prot(sk, ctx); + spin_lock_bh(&device_spinlock); rc = 1; break; } @@ -668,7 +714,6 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], static int tls_init(struct sock *sk) { - int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; struct tls_context *ctx; int rc = 0; @@ -691,27 +736,7 @@ static int tls_init(struct sock *sk) goto out; } - /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ - if (ip_ver == TLSV6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - mutex_lock(&tcpv6_prot_mutex); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(tls_prots[TLSV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - mutex_unlock(&tcpv6_prot_mutex); - } - - if (ip_ver == TLSV4 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) { - mutex_lock(&tcpv4_prot_mutex); - if (likely(sk->sk_prot != saved_tcpv4_prot)) { - build_protos(tls_prots[TLSV4], sk->sk_prot); - smp_store_release(&saved_tcpv4_prot, sk->sk_prot); - } - mutex_unlock(&tcpv4_prot_mutex); - } - + tls_build_proto(sk); ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bf5b54b513bc..3f2a6af27e62 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -124,6 +124,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) { struct aead_request *aead_req = (struct aead_request *)req; struct scatterlist *sgout = aead_req->dst; + struct scatterlist *sgin = aead_req->src; struct tls_sw_context_rx *ctx; struct tls_context *tls_ctx; struct scatterlist *sg; @@ -134,12 +135,16 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) skb = (struct sk_buff *)req->data; tls_ctx = tls_get_ctx(skb->sk); ctx = tls_sw_ctx_rx(tls_ctx); - pending = atomic_dec_return(&ctx->decrypt_pending); /* Propagate if there was an err */ if (err) { ctx->async_wait.err = err; tls_err_abort(skb->sk, err); + } else { + struct strp_msg *rxm = strp_msg(skb); + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; } /* After using skb->sk to propagate sk through crypto async callback @@ -147,18 +152,21 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) */ skb->sk = NULL; - /* Release the skb, pages and memory allocated for crypto req */ - kfree_skb(skb); - /* Skip the first S/G entry as it points to AAD */ - for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { - if (!sg) - break; - put_page(sg_page(sg)); + /* Free the destination pages if skb was not decrypted inplace */ + if (sgout != sgin) { + /* Skip the first S/G entry as it points to AAD */ + for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { + if (!sg) + break; + put_page(sg_page(sg)); + } } kfree(aead_req); + pending = atomic_dec_return(&ctx->decrypt_pending); + if (!pending && READ_ONCE(ctx->async_notify)) complete(&ctx->async_wait.completion); } @@ -1022,8 +1030,8 @@ send_end: return copied ? copied : ret; } -int tls_sw_do_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) +static int tls_sw_do_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) { long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -1145,16 +1153,6 @@ sendpage_end: return copied ? copied : ret; } -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) - return -ENOTSUPP; - - return tls_sw_do_sendpage(sk, page, offset, size, flags); -} - int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -1283,7 +1281,7 @@ out: static int decrypt_internal(struct sock *sk, struct sk_buff *skb, struct iov_iter *out_iov, struct scatterlist *out_sg, - int *chunk, bool *zc) + int *chunk, bool *zc, bool async) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1383,13 +1381,13 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, fallback_to_reg_recv: sgout = sgin; pages = 0; - *chunk = 0; + *chunk = data_len; *zc = false; } /* Prepare and submit AEAD request */ err = tls_do_decryption(sk, skb, sgin, sgout, iv, - data_len, aead_req, *zc); + data_len, aead_req, async); if (err == -EINPROGRESS) return err; @@ -1402,7 +1400,8 @@ fallback_to_reg_recv: } static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct iov_iter *dest, int *chunk, bool *zc) + struct iov_iter *dest, int *chunk, bool *zc, + bool async) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1415,7 +1414,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, return err; #endif if (!ctx->decrypted) { - err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); + err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async); if (err < 0) { if (err == -EINPROGRESS) tls_advance_record_sn(sk, &tls_ctx->rx); @@ -1441,7 +1440,7 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb, bool zc = true; int chunk; - return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc); + return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc, false); } static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, @@ -1468,6 +1467,77 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, return true; } +/* This function traverses the rx_list in tls receive context to copies the + * decrypted data records into the buffer provided by caller zero copy is not + * true. Further, the records are removed from the rx_list if it is not a peek + * case and the record has been consumed completely. + */ +static int process_rx_list(struct tls_sw_context_rx *ctx, + struct msghdr *msg, + size_t skip, + size_t len, + bool zc, + bool is_peek) +{ + struct sk_buff *skb = skb_peek(&ctx->rx_list); + ssize_t copied = 0; + + while (skip && skb) { + struct strp_msg *rxm = strp_msg(skb); + + if (skip < rxm->full_len) + break; + + skip = skip - rxm->full_len; + skb = skb_peek_next(skb, &ctx->rx_list); + } + + while (len && skb) { + struct sk_buff *next_skb; + struct strp_msg *rxm = strp_msg(skb); + int chunk = min_t(unsigned int, rxm->full_len - skip, len); + + if (!zc || (rxm->full_len - skip) > len) { + int err = skb_copy_datagram_msg(skb, rxm->offset + skip, + msg, chunk); + if (err < 0) + return err; + } + + len = len - chunk; + copied = copied + chunk; + + /* Consume the data from record if it is non-peek case*/ + if (!is_peek) { + rxm->offset = rxm->offset + chunk; + rxm->full_len = rxm->full_len - chunk; + + /* Return if there is unconsumed data in the record */ + if (rxm->full_len - skip) + break; + } + + /* The remaining skip-bytes must lie in 1st record in rx_list. + * So from the 2nd record, 'skip' should be 0. + */ + skip = 0; + + if (msg) + msg->msg_flags |= MSG_EOR; + + next_skb = skb_peek_next(skb, &ctx->rx_list); + + if (!is_peek) { + skb_unlink(skb, &ctx->rx_list); + kfree_skb(skb); + } + + skb = next_skb; + } + + return copied; +} + int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1478,7 +1548,8 @@ int tls_sw_recvmsg(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_psock *psock; - unsigned char control; + unsigned char control = 0; + ssize_t decrypted = 0; struct strp_msg *rxm; struct sk_buff *skb; ssize_t copied = 0; @@ -1486,6 +1557,7 @@ int tls_sw_recvmsg(struct sock *sk, int target, err = 0; long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); + bool is_peek = flags & MSG_PEEK; int num_async = 0; flags |= nonblock; @@ -1496,11 +1568,28 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); - target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + /* Process pending decrypted records. It must be non-zero-copy */ + err = process_rx_list(ctx, msg, 0, len, false, is_peek); + if (err < 0) { + tls_err_abort(sk, err); + goto end; + } else { + copied = err; + } + + len = len - copied; + if (len) { + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + } else { + goto recv_end; + } + do { - bool zc = false; + bool retain_skb = false; bool async = false; + bool zc = false; + int to_decrypt; int chunk = 0; skb = tls_wait_data(sk, psock, flags, timeo, &err); @@ -1510,7 +1599,7 @@ int tls_sw_recvmsg(struct sock *sk, msg, len, flags); if (ret > 0) { - copied += ret; + decrypted += ret; len -= ret; continue; } @@ -1537,70 +1626,70 @@ int tls_sw_recvmsg(struct sock *sk, goto recv_end; } - if (!ctx->decrypted) { - int to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + to_decrypt = rxm->full_len - tls_ctx->rx.overhead_size; - if (!is_kvec && to_copy <= len && - likely(!(flags & MSG_PEEK))) - zc = true; + if (to_decrypt <= len && !is_kvec && !is_peek) + zc = true; - err = decrypt_skb_update(sk, skb, &msg->msg_iter, - &chunk, &zc); - if (err < 0 && err != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } - - if (err == -EINPROGRESS) { - async = true; - num_async++; - goto pick_next_record; - } - - ctx->decrypted = true; + err = decrypt_skb_update(sk, skb, &msg->msg_iter, + &chunk, &zc, ctx->async_capable); + if (err < 0 && err != -EINPROGRESS) { + tls_err_abort(sk, EBADMSG); + goto recv_end; } - if (!zc) { - chunk = min_t(unsigned int, rxm->full_len, len); + if (err == -EINPROGRESS) { + async = true; + num_async++; + goto pick_next_record; + } else { + if (!zc) { + if (rxm->full_len > len) { + retain_skb = true; + chunk = len; + } else { + chunk = rxm->full_len; + } - err = skb_copy_datagram_msg(skb, rxm->offset, msg, - chunk); - if (err < 0) - goto recv_end; + err = skb_copy_datagram_msg(skb, rxm->offset, + msg, chunk); + if (err < 0) + goto recv_end; + + if (!is_peek) { + rxm->offset = rxm->offset + chunk; + rxm->full_len = rxm->full_len - chunk; + } + } } pick_next_record: - copied += chunk; + if (chunk > len) + chunk = len; + + decrypted += chunk; len -= chunk; - if (likely(!(flags & MSG_PEEK))) { - u8 control = ctx->control; - - /* For async, drop current skb reference */ - if (async) - skb = NULL; - - if (tls_sw_advance_skb(sk, skb, chunk)) { - /* Return full control message to - * userspace before trying to parse - * another message type - */ - msg->msg_flags |= MSG_EOR; - if (control != TLS_RECORD_TYPE_DATA) - goto recv_end; - } else { - break; - } - } else { - /* MSG_PEEK right now cannot look beyond current skb - * from strparser, meaning we cannot advance skb here - * and thus unpause strparser since we'd loose original - * one. + + /* For async or peek case, queue the current skb */ + if (async || is_peek || retain_skb) { + skb_queue_tail(&ctx->rx_list, skb); + skb = NULL; + } + + if (tls_sw_advance_skb(sk, skb, chunk)) { + /* Return full control message to + * userspace before trying to parse + * another message type */ + msg->msg_flags |= MSG_EOR; + if (ctx->control != TLS_RECORD_TYPE_DATA) + goto recv_end; + } else { break; } /* If we have a new message from strparser, continue now. */ - if (copied >= target && !ctx->recv_pkt) + if (decrypted >= target && !ctx->recv_pkt) break; } while (len); @@ -1614,13 +1703,33 @@ recv_end: /* one of async decrypt failed */ tls_err_abort(sk, err); copied = 0; + decrypted = 0; + goto end; } } else { reinit_completion(&ctx->async_wait.completion); } WRITE_ONCE(ctx->async_notify, false); + + /* Drain records from the rx_list & copy if required */ + if (is_peek || is_kvec) + err = process_rx_list(ctx, msg, copied, + decrypted, false, is_peek); + else + err = process_rx_list(ctx, msg, 0, + decrypted, true, is_peek); + if (err < 0) { + tls_err_abort(sk, err); + copied = 0; + goto end; + } + + WARN_ON(decrypted != err); } + copied += decrypted; + +end: release_sock(sk); if (psock) sk_psock_put(sk, psock); @@ -1657,7 +1766,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc); + err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc, false); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -1846,6 +1955,7 @@ void tls_sw_release_resources_rx(struct sock *sk) if (ctx->aead_recv) { kfree_skb(ctx->recv_pkt); ctx->recv_pkt = NULL; + skb_queue_purge(&ctx->rx_list); crypto_free_aead(ctx->aead_recv); strp_stop(&ctx->strp); write_lock_bh(&sk->sk_callback_lock); @@ -1895,6 +2005,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct crypto_aead **aead; struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; + struct crypto_tfm *tfm; char *iv, *rec_seq; int rc = 0; @@ -1941,6 +2052,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; + skb_queue_head_init(&sw_ctx_rx->rx_list); aead = &sw_ctx_rx->aead_recv; } @@ -2008,6 +2120,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_aead; if (sw_ctx_rx) { + tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); + sw_ctx_rx->async_capable = + tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + /* Set up strparser */ memset(&cb, 0, sizeof(cb)); cb.rcv_msg = tls_queue; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 43a1dec08825..a60df252d3cc 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -505,7 +505,7 @@ out: static int __vsock_bind_stream(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - static u32 port = 0; + static u32 port; struct sockaddr_vm new_addr; if (!port) diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 90e4a7152854..0255b33cff4b 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -5,3 +5,11 @@ config XDP_SOCKETS help XDP sockets allows a channel between XDP programs and userspace applications. + +config XDP_SOCKETS_DIAG + tristate "XDP sockets: monitoring interface" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets monitoring interface used by the ss tool. + If unsure, say Y. diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 04f073146256..59dbfdf93dca 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index d4de871e7d4d..5ab236c5c9a5 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -13,12 +13,15 @@ #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> +#include <linux/idr.h> #include "xdp_umem.h" #include "xsk_queue.h" #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +static DEFINE_IDA(umem_ida); + void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; @@ -67,6 +70,7 @@ struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, return NULL; } +EXPORT_SYMBOL(xdp_get_umem_from_qid); static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id) { @@ -193,6 +197,8 @@ static void xdp_umem_release(struct xdp_umem *umem) xdp_umem_clear_dev(umem); + ida_simple_remove(&umem_ida, umem->id); + if (umem->fq) { xskq_destroy(umem->fq); umem->fq = NULL; @@ -399,8 +405,16 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) if (!umem) return ERR_PTR(-ENOMEM); + err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL); + if (err < 0) { + kfree(umem); + return ERR_PTR(err); + } + umem->id = err; + err = xdp_umem_reg(umem, mr); if (err) { + ida_simple_remove(&umem_ida, umem->id); kfree(umem); return ERR_PTR(err); } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a03268454a27..949d3bbccb2f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -27,14 +27,10 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#include "xsk.h" #define TX_BATCH_SIZE 16 -static struct xdp_sock *xdp_sk(struct sock *sk) -{ - return (struct xdp_sock *)sk; -} - bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && @@ -350,6 +346,10 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + mutex_lock(&net->xdp.lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->xdp.lock); + local_bh_disable(); sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); @@ -746,6 +746,10 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); + mutex_lock(&net->xdp.lock); + sk_add_node_rcu(sk, &net->xdp.list); + mutex_unlock(&net->xdp.lock); + local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); local_bh_enable(); @@ -759,6 +763,23 @@ static const struct net_proto_family xsk_family_ops = { .owner = THIS_MODULE, }; +static int __net_init xsk_net_init(struct net *net) +{ + mutex_init(&net->xdp.lock); + INIT_HLIST_HEAD(&net->xdp.list); + return 0; +} + +static void __net_exit xsk_net_exit(struct net *net) +{ + WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); +} + +static struct pernet_operations xsk_net_ops = { + .init = xsk_net_init, + .exit = xsk_net_exit, +}; + static int __init xsk_init(void) { int err; @@ -771,8 +792,13 @@ static int __init xsk_init(void) if (err) goto out_proto; + err = register_pernet_subsys(&xsk_net_ops); + if (err) + goto out_sk; return 0; +out_sk: + sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h new file mode 100644 index 000000000000..ba8120610426 --- /dev/null +++ b/net/xdp/xsk.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. */ + +#ifndef XSK_H_ +#define XSK_H_ + +static inline struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +#endif /* XSK_H_ */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c new file mode 100644 index 000000000000..661d007c3b28 --- /dev/null +++ b/net/xdp/xsk_diag.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets monitoring support + * + * Copyright(c) 2019 Intel Corporation. + * + * Author: Björn Töpel <bjorn.topel@intel.com> + */ + +#include <linux/module.h> +#include <net/xdp_sock.h> +#include <linux/xdp_diag.h> +#include <linux/sock_diag.h> + +#include "xsk_queue.h" +#include "xsk.h" + +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_info di = {}; + + di.ifindex = xs->dev ? xs->dev->ifindex : 0; + di.queue_id = xs->queue_id; + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); +} + +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, + struct sk_buff *nlskb) +{ + struct xdp_diag_ring dr = {}; + + dr.entries = queue->nentries; + return nla_put(nlskb, nl_type, sizeof(dr), &dr); +} + +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, + struct sk_buff *nlskb) +{ + int err = 0; + + if (xs->rx) + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); + if (!err && xs->tx) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); + return err; +} + +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_umem *umem = xs->umem; + struct xdp_diag_umem du = {}; + int err; + + if (!umem) + return 0; + + du.id = umem->id; + du.size = umem->size; + du.num_pages = umem->npgs; + du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.headroom = umem->headroom; + du.ifindex = umem->dev ? umem->dev->ifindex : 0; + du.queue_id = umem->queue_id; + du.flags = 0; + if (umem->zc) + du.flags |= XDP_DU_F_ZEROCOPY; + du.refs = refcount_read(&umem->users); + + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); + + if (!err && umem->fq) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb); + if (!err && umem->cq) { + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING, + nlskb); + } + return err; +} + +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, + struct xdp_diag_req *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_diag_msg *msg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), + flags); + if (!nlh) + return -EMSGSIZE; + + msg = nlmsg_data(nlh); + memset(msg, 0, sizeof(*msg)); + msg->xdiag_family = AF_XDP; + msg->xdiag_type = sk->sk_type; + msg->xdiag_ino = sk_ino; + sock_diag_save_cookie(sk, msg->xdiag_cookie); + + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_INFO) && + nla_put_u32(nlskb, XDP_DIAG_UID, + from_kuid_munged(user_ns, sock_i_uid(sk)))) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && + xsk_diag_put_rings_cfg(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_UMEM) && + xsk_diag_put_umem(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + nlmsg_end(nlskb, nlh); + return 0; + +out_nlmsg_trim: + nlmsg_cancel(nlskb, nlh); + return -EMSGSIZE; +} + +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) +{ + struct xdp_diag_req *req = nlmsg_data(cb->nlh); + struct net *net = sock_net(nlskb->sk); + int num = 0, s_num = cb->args[0]; + struct sock *sk; + + mutex_lock(&net->xdp.lock); + + sk_for_each(sk, &net->xdp.list) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num++ < s_num) + continue; + + if (xsk_diag_fill(sk, nlskb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + num--; + break; + } + } + + mutex_unlock(&net->xdp.lock); + cb->args[0] = num; + return nlskb->len; +} + +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) +{ + struct netlink_dump_control c = { .dump = xsk_diag_dump }; + int hdrlen = sizeof(struct xdp_diag_req); + struct net *net = sock_net(nlskb->sk); + + if (nlmsg_len(hdr) < hdrlen) + return -EINVAL; + + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) + return -EOPNOTSUPP; + + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); +} + +static const struct sock_diag_handler xsk_diag_handler = { + .family = AF_XDP, + .dump = xsk_diag_handler_dump, +}; + +static int __init xsk_diag_init(void) +{ + return sock_diag_register(&xsk_diag_handler); +} + +static void __exit xsk_diag_exit(void) +{ + sock_diag_unregister(&xsk_diag_handler); +} + +module_init(xsk_diag_init); +module_exit(xsk_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); |