diff options
Diffstat (limited to 'net')
106 files changed, 3496 insertions, 1367 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 46c39f7da444..e6014e0e51f7 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -32,7 +32,7 @@ config NET_9P_XEN config NET_9P_RDMA - depends on INET && INFINIBAND_ADDR_TRANS + depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS tristate "9P RDMA Transport (Experimental)" help This builds support for an RDMA transport. diff --git a/net/Kconfig b/net/Kconfig index ba554cedb615..f738a6f27665 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -432,6 +432,19 @@ config MAY_USE_DEVLINK config PAGE_POOL bool +config FAILOVER + tristate "Generic failover module" + help + The failover module provides a generic interface for paravirtual + drivers to register a netdev and a set of ops with a failover + instance. The ops are used as event handlers that get called to + handle netdev register/unregister/link change/name change events + on slave pci ethernet devices with the same mac address as the + failover netdev. This enables paravirtual drivers to use a + VF as an accelerated low latency datapath. It also allows live + migration of VMs with direct attached VFs by failing over to the + paravirtual datapath when the VF is unplugged. + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/Makefile b/net/Makefile index bdaf53925acd..13ec0d5415c7 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,7 +20,11 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ +ifneq ($(CC_CAN_LINK),y) +$(warning CC cannot link executables. Skipping bpfilter.) +else obj-$(CONFIG_BPFILTER) += bpfilter/ +endif obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b0ee9edaae35..1dec33790198 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -76,19 +76,15 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, { struct hci_dev *hdev = file->private_data; struct sk_buff *skb; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; + int err; if (!test_bit(HCI_UP, &hdev->flags)) return -ENETDOWN; - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) - return -EINVAL; + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) return -EALREADY; @@ -135,17 +131,12 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; int err; - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) - return -EINVAL; + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; /* When the diagnostic flags are not persistent and the transport * is not active or in user channel operation, then there is no need diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 418b76e557b0..0d8ab5b3c177 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -47,19 +47,15 @@ static ssize_t __name ## _write(struct file *file, \ size_t count, loff_t *ppos) \ { \ struct hci_dev *hdev = file->private_data; \ - char buf[32]; \ - size_t buf_size = min(count, (sizeof(buf) - 1)); \ bool enable; \ + int err; \ \ if (test_bit(HCI_UP, &hdev->flags)) \ return -EBUSY; \ \ - if (copy_from_user(buf, user_buf, buf_size)) \ - return -EFAULT; \ - \ - buf[buf_size] = '\0'; \ - if (strtobool(buf, &enable)) \ - return -EINVAL; \ + err = kstrtobool_from_user(user_buf, count, &enable); \ + if (err) \ + return err; \ \ if (enable == test_bit(__quirk, &hdev->quirks)) \ return -EALREADY; \ @@ -658,19 +654,15 @@ static ssize_t force_static_address_write(struct file *file, size_t count, loff_t *ppos) { struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; + int err; if (test_bit(HCI_UP, &hdev->flags)) return -EBUSY; - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) - return -EINVAL; + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR)) return -EALREADY; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a2ddae2f37d7..ae91e2d40056 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3315,16 +3315,12 @@ static ssize_t force_bredr_smp_write(struct file *file, size_t count, loff_t *ppos) { struct hci_dev *hdev = file->private_data; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; + int err; - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) - return -EINVAL; + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 2af752c8ef5e..aafa72001fcd 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -5,7 +5,9 @@ hostprogs-y := bpfilter_umh bpfilter_umh-objs := main.o -HOSTCFLAGS += -I. -Itools/include/ +HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi +HOSTCC := $(CC) + ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be compiled with -static # since rootfs isn't mounted at the time of __init diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 7596314b61c7..b13d058f8c34 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -33,7 +33,8 @@ static void shutdown_umh(struct umh_info *info) static void __stop_umh(void) { - if (bpfilter_process_sockopt) { + if (IS_ENABLED(CONFIG_INET) && + bpfilter_process_sockopt) { bpfilter_process_sockopt = NULL; shutdown_umh(&info); } @@ -98,7 +99,9 @@ static int __init load_umh(void) stop_umh(); return -EFAULT; } - bpfilter_process_sockopt = &__bpfilter_process_sockopt; + if (IS_ENABLED(CONFIG_INET)) + bpfilter_process_sockopt = &__bpfilter_process_sockopt; + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 11520ed528b0..5216a524b537 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1139,6 +1139,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long mask); void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type); +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags); +int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { @@ -1168,6 +1170,17 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; } +static inline int br_switchdev_port_vlan_add(struct net_device *dev, + u16 vid, u16 flags) +{ + return -EOPNOTSUPP; +} + +static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) +{ + return -EOPNOTSUPP; +} + static inline void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 35474d49555d..d77f807420c4 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -136,3 +136,28 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) break; } } + +int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags) +{ + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; + + return switchdev_port_obj_add(dev, &v.obj); +} + +int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) +{ + struct switchdev_obj_port_vlan v = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + }; + + return switchdev_port_obj_del(dev, &v.obj); +} diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index dc832c0934c6..7df269092103 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -82,19 +82,12 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, u16 vid, u16 flags) { - struct switchdev_obj_port_vlan v = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = flags, - .vid_begin = vid, - .vid_end = vid, - }; int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = switchdev_port_obj_add(dev, &v.obj); + err = br_switchdev_port_vlan_add(dev, vid, flags); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, vid); return err; @@ -130,18 +123,12 @@ static void __vlan_del_list(struct net_bridge_vlan *v) static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, u16 vid) { - struct switchdev_obj_port_vlan v = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .vid_begin = vid, - .vid_end = vid, - }; int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ - err = switchdev_port_obj_del(dev, &v.obj); + err = br_switchdev_port_vlan_del(dev, vid); if (err == -EOPNOTSUPP) { vlan_vid_del(dev, br->vlan_proto, vid); return 0; @@ -259,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) goto out_filt; v->brvlan = masterv; v->stats = masterv->stats; + } else { + err = br_switchdev_port_vlan_add(dev, v->vid, flags); + if (err && err != -EOPNOTSUPP) + goto out; } /* Add the dev mac and count the vlan only if it's usable */ @@ -294,6 +285,8 @@ out_filt: br_vlan_put_master(masterv); v->brvlan = NULL; } + } else { + br_switchdev_port_vlan_del(dev, v->vid); } goto out; @@ -319,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v) err = __vlan_vid_del(p->dev, p->br, v->vid); if (err) goto out; + } else { + err = br_switchdev_port_vlan_del(v->br->dev, v->vid); + if (err && err != -EOPNOTSUPP) + goto out; + err = 0; } if (br_vlan_should_use(v)) { @@ -564,6 +562,48 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) return false; } +static int br_vlan_add_existing(struct net_bridge *br, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *vlan, + u16 flags, bool *changed) +{ + int err; + + err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags); + if (err && err != -EOPNOTSUPP) + return err; + + if (!br_vlan_is_brentry(vlan)) { + /* Trying to change flags of non-existent bridge vlan */ + if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) { + err = -EINVAL; + goto err_flags; + } + /* It was only kept for port vlans, now make it real */ + err = br_fdb_insert(br, NULL, br->dev->dev_addr, + vlan->vid); + if (err) { + br_err(br, "failed to insert local address into bridge forwarding table\n"); + goto err_fdb_insert; + } + + refcount_inc(&vlan->refcnt); + vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; + vg->num_vlans++; + *changed = true; + } + + if (__vlan_add_flags(vlan, flags)) + *changed = true; + + return 0; + +err_fdb_insert: +err_flags: + br_switchdev_port_vlan_del(br->dev, vlan->vid); + return err; +} + /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated @@ -579,28 +619,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed) *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); - if (vlan) { - if (!br_vlan_is_brentry(vlan)) { - /* Trying to change flags of non-existent bridge vlan */ - if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) - return -EINVAL; - /* It was only kept for port vlans, now make it real */ - ret = br_fdb_insert(br, NULL, br->dev->dev_addr, - vlan->vid); - if (ret) { - br_err(br, "failed insert local address into bridge forwarding table\n"); - return ret; - } - refcount_inc(&vlan->refcnt); - vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; - vg->num_vlans++; - *changed = true; - } - if (__vlan_add_flags(vlan, flags)) - *changed = true; - - return 0; - } + if (vlan) + return br_vlan_add_existing(br, vg, vlan, flags, changed); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) @@ -1053,13 +1073,6 @@ err_vlan_enabled: int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed) { - struct switchdev_obj_port_vlan v = { - .obj.orig_dev = port->dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = flags, - .vid_begin = vid, - .vid_end = vid, - }; struct net_bridge_vlan *vlan; int ret; @@ -1069,7 +1082,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { /* Pass the flags to the hardware bridge */ - ret = switchdev_port_obj_add(port->dev, &v.obj); + ret = br_switchdev_port_vlan_add(port->dev, vid, flags); if (ret && ret != -EOPNOTSUPP) return ret; *changed = __vlan_add_flags(vlan, flags); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index b286ed5596c3..c2138e7e8263 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1949,7 +1949,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, int off, pad = 0; unsigned int size_kern, match_size = mwt->match_size; - strlcpy(name, mwt->u.name, sizeof(name)); + if (strscpy(name, mwt->u.name, sizeof(name)) < 0) + return -EINVAL; if (state->buf_kern_start) dst = state->buf_kern_start + state->buf_kern_offset; diff --git a/net/core/Makefile b/net/core/Makefile index 7080417f8bc8..80175e6a2eb8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -31,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_FAILOVER) += failover.o diff --git a/net/core/devlink.c b/net/core/devlink.c index 475246b355f0..22099705cc41 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -702,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, return 0; } -static int devlink_port_split(struct devlink *devlink, - u32 port_index, u32 count) +static int devlink_port_split(struct devlink *devlink, u32 port_index, + u32 count, struct netlink_ext_ack *extack) { if (devlink->ops && devlink->ops->port_split) - return devlink->ops->port_split(devlink, port_index, count); + return devlink->ops->port_split(devlink, port_index, count, + extack); return -EOPNOTSUPP; } @@ -724,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - return devlink_port_split(devlink, port_index, count); + return devlink_port_split(devlink, port_index, count, info->extack); } -static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index, + struct netlink_ext_ack *extack) { if (devlink->ops && devlink->ops->port_unsplit) - return devlink->ops->port_unsplit(devlink, port_index); + return devlink->ops->port_unsplit(devlink, port_index, extack); return -EOPNOTSUPP; } @@ -745,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return -EINVAL; port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - return devlink_port_unsplit(devlink, port_index); + return devlink_port_unsplit(devlink, port_index, info->extack); } static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, @@ -1826,7 +1828,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2032,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) return 0; nla_put_failure: - genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr); nlmsg_free(dump_ctx->skb); return -EMSGSIZE; } @@ -2249,7 +2249,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_table_put: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2551,7 +2550,6 @@ nla_put_failure: err = -EMSGSIZE; err_resource_put: err_skb_send_alloc: - genlmsg_cancel(skb, hdr); nlmsg_free(skb); return err; } @@ -2603,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - return devlink->ops->reload(devlink); + return devlink->ops->reload(devlink, info->extack); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { diff --git a/net/core/failover.c b/net/core/failover.c new file mode 100644 index 000000000000..4a92a98ccce9 --- /dev/null +++ b/net/core/failover.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <uapi/linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <net/failover.h> + +static LIST_HEAD(failover_list); +static DEFINE_SPINLOCK(failover_lock); + +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) +{ + struct net_device *failover_dev; + struct failover *failover; + + spin_lock(&failover_lock); + list_for_each_entry(failover, &failover_list, list) { + failover_dev = rtnl_dereference(failover->failover_dev); + if (ether_addr_equal(failover_dev->perm_addr, mac)) { + *ops = rtnl_dereference(failover->ops); + spin_unlock(&failover_lock); + return failover_dev; + } + } + spin_unlock(&failover_lock); + return NULL; +} + +/** + * failover_slave_register - Register a slave netdev + * + * @slave_dev: slave netdev that is being registered + * + * Registers a slave device to a failover instance. Only ethernet devices + * are supported. + */ +static int failover_slave_register(struct net_device *slave_dev) +{ + struct netdev_lag_upper_info lag_upper_info; + struct net_device *failover_dev; + struct failover_ops *fops; + int err; + + if (slave_dev->type != ARPHRD_ETHER) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_register && + fops->slave_pre_register(slave_dev, failover_dev)) + goto done; + + err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, + failover_dev); + if (err) { + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", + err); + goto done; + } + + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; + err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, + &lag_upper_info, NULL); + if (err) { + netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", + failover_dev->name, err); + goto err_upper_link; + } + + slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + + if (fops && fops->slave_register && + !fops->slave_register(slave_dev, failover_dev)) + return NOTIFY_OK; + + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; +err_upper_link: + netdev_rx_handler_unregister(slave_dev); +done: + return NOTIFY_DONE; +} + +/** + * failover_slave_unregister - Unregister a slave netdev + * + * @slave_dev: slave netdev that is being unregistered + * + * Unregisters a slave device from a failover instance. + */ +int failover_slave_unregister(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_unregister && + fops->slave_pre_unregister(slave_dev, failover_dev)) + goto done; + + netdev_rx_handler_unregister(slave_dev); + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + + if (fops && fops->slave_unregister && + !fops->slave_unregister(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(failover_slave_unregister); + +static int failover_slave_link_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_link_change && + !fops->slave_link_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int failover_slave_name_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_name_change && + !fops->slave_name_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int +failover_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip parent events */ + if (netif_is_failover(event_dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return failover_slave_register(event_dev); + case NETDEV_UNREGISTER: + return failover_slave_unregister(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + return failover_slave_link_change(event_dev); + case NETDEV_CHANGENAME: + return failover_slave_name_change(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block failover_notifier = { + .notifier_call = failover_event, +}; + +static void +failover_existing_slave_register(struct net_device *failover_dev) +{ + struct net *net = dev_net(failover_dev); + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) { + if (netif_is_failover(dev)) + continue; + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + failover_slave_register(dev); + } + rtnl_unlock(); +} + +/** + * failover_register - Register a failover instance + * + * @dev: failover netdev + * @ops: failover ops + * + * Allocate and register a failover instance for a failover netdev. ops + * provides handlers for slave device register/unregister/link change/ + * name change events. + * + * Return: pointer to failover instance + */ +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops) +{ + struct failover *failover; + + if (dev->type != ARPHRD_ETHER) + return ERR_PTR(-EINVAL); + + failover = kzalloc(sizeof(*failover), GFP_KERNEL); + if (!failover) + return ERR_PTR(-ENOMEM); + + rcu_assign_pointer(failover->ops, ops); + dev_hold(dev); + dev->priv_flags |= IFF_FAILOVER; + rcu_assign_pointer(failover->failover_dev, dev); + + spin_lock(&failover_lock); + list_add_tail(&failover->list, &failover_list); + spin_unlock(&failover_lock); + + netdev_info(dev, "failover master:%s registered\n", dev->name); + + failover_existing_slave_register(dev); + + return failover; +} +EXPORT_SYMBOL_GPL(failover_register); + +/** + * failover_unregister - Unregister a failover instance + * + * @failover: pointer to failover instance + * + * Unregisters and frees a failover instance. + */ +void failover_unregister(struct failover *failover) +{ + struct net_device *failover_dev; + + failover_dev = rcu_dereference(failover->failover_dev); + + netdev_info(failover_dev, "failover master:%s unregistered\n", + failover_dev->name); + + failover_dev->priv_flags &= ~IFF_FAILOVER; + dev_put(failover_dev); + + spin_lock(&failover_lock); + list_del(&failover->list); + spin_unlock(&failover_lock); + + kfree(failover); +} +EXPORT_SYMBOL_GPL(failover_unregister); + +static __init int +failover_init(void) +{ + register_netdevice_notifier(&failover_notifier); + + return 0; +} +module_init(failover_init); + +static __exit +void failover_exit(void) +{ + unregister_netdevice_notifier(&failover_notifier); +} +module_exit(failover_exit); + +MODULE_DESCRIPTION("Generic failover infrastructure/interface"); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4fc1e84d77ec..53f96e4f7bf5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; - keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c476f0794132..bb7e80f4ced3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1214,9 +1214,6 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, cpumask_var_t mask; unsigned long index; - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - index = get_netdev_queue_index(queue); if (dev->num_tc) { @@ -1226,6 +1223,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -EINVAL; } + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 80802546c279..9a1ba2015ad8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,6 +59,9 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#define RTNL_MAX_TYPE 48 +#define RTNL_SLAVE_MAX_TYPE 36 + struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; @@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops) { int err; + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + rtnl_lock(); err = __rtnl_link_register(ops); rtnl_unlock(); @@ -2902,13 +2910,16 @@ replay: } if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; - struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; struct nlattr **data = NULL; struct nlattr **slave_data = NULL; struct net *dest_net, *link_net = NULL; if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], @@ -2925,6 +2936,9 @@ replay: } if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { err = nla_parse_nested(slave_attr, diff --git a/net/core/sock.c b/net/core/sock.c index 435a0ba85e52..feca4c98f8a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -728,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: - sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && + inet_sk(sk)->inet_num && + (sk->sk_reuse != val)) { + ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; + break; + } + sk->sk_reuse = val; break; case SO_REUSEPORT: + if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && + inet_sk(sk)->inet_num && + (sk->sk_reuseport != valbool)) { + ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; + break; + } sk->sk_reuseport = valbool; break; case SO_TYPE: diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 37ccbe62eb1a..ba6fc3c1186b 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -53,7 +53,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (timeo < rto) timeo = rto; - tw->tw_timeout = DCCP_TIMEWAIT_LEN; if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; diff --git a/net/dsa/port.c b/net/dsa/port.c index 2413beb995be..ed0595459df1 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp, .vlan = vlan, }; + if (netif_is_bridge_master(vlan->obj.orig_dev)) + return -EOPNOTSUPP; + if (br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info); @@ -267,6 +270,9 @@ int dsa_port_vlan_del(struct dsa_port *dp, .vlan = vlan, }; + if (netif_is_bridge_master(vlan->obj.orig_dev)) + return -EOPNOTSUPP; + if (br_vlan_enabled(dp->bridge_dev)) return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 42a96d2d8d05..5e04ed25bc0e 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -10,8 +10,9 @@ int (*bpfilter_process_sockopt)(struct sock *sk, int optname, unsigned int optlen, bool is_set); EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); -int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, - unsigned int optlen, bool is_set) +static int bpfilter_mbox_request(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set) { if (!bpfilter_process_sockopt) { int err = request_module("bpfilter"); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 40f001782c1b..d7585ab1a77a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, + [IFA_RT_PRIORITY] = { .type = NLA_U32 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (tb[IFA_RT_PRIORITY]) + ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { + u32 new_metric = ifa->ifa_rt_priority; + inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) return -EEXIST; ifa = ifa_existing; + + if (ifa->ifa_rt_priority != new_metric) { + fib_modify_prefix_metric(ifa, new_metric); + ifa->ifa_rt_priority = new_metric; + } + set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&check_lifetime_work); queue_delayed_work(system_power_efficient_wq, @@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } @@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || + (ifa->ifa_rt_priority && + nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, preferred, valid)) goto nla_put_failure; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b69e2824c761..63aa39b3af03 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -847,7 +847,8 @@ out_err: * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ -static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) +static void fib_magic(int cmd, int type, __be32 dst, int dst_len, + struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); @@ -857,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, + .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, @@ -902,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } - fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, + prim, 0); if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - prefix, ifa->ifa_prefixlen, prim); + prefix, ifa->ifa_prefixlen, prim, + ifa->ifa_rt_priority); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, + prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, - 32, prim); + 32, prim, 0); } } } +void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) +{ + __be32 prefix = ifa->ifa_address & ifa->ifa_mask; + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + + if (!(dev->flags & IFF_UP) || + ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || + ipv4_is_zeronet(prefix) || + prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32) + return; + + /* add the new */ + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, ifa, new_metric); + + /* delete the old */ + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); +} + /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. @@ -968,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); + any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } @@ -1052,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) no_promotions: if (!(ok & BRD_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, + prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, + prim, 0); if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, + prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; - fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6608db23f54b..f3c89ccf14c5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -717,6 +717,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) nla_strlcpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { + if (nla_len(nla) != sizeof(u32)) + return false; val = nla_get_u32(nla); } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 6b0e362cc99b..38d906baf1df 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -328,7 +328,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; - mtu = tdev->mtu; + mtu = min(tdev->mtu, IP_MAX_MTU); } dev->needed_headroom = t_hlen + hlen; @@ -362,7 +362,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; + dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; ip_tunnel_add(itn, nt); return nt; @@ -930,7 +930,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); - int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; + int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; if (new_mtu < ETH_MIN_MTU) return -EINVAL; @@ -1107,7 +1107,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { - unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen; + unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen; mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, (unsigned int)(max - sizeof(struct iphdr))); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 30221701614c..cafb0506c8c9 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id, struct net *net)) { struct mr_table *mrt; + int err; mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); if (!mrt) - return NULL; + return ERR_PTR(-ENOMEM); mrt->id = id; write_pnet(&mrt->net, net); mrt->ops = *ops; - if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) { + err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params); + if (err) { kfree(mrt); - return NULL; + return ERR_PTR(err); } INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 5121c6475e6b..04311f7067e2 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -32,6 +32,8 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, if (val == TCP_CA_UNSPEC) return -EINVAL; } else { + if (nla_len(nla) != sizeof(u32)) + return -EINVAL; val = nla_get_u32(nla); } if (type == RTAX_ADVMSS && val > 65535 - 40) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 280048e1e395..bbfc356cb1b5 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -29,7 +29,10 @@ config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help This option enables the IPv4 socket lookup infrastructure. This is - is required by the iptables socket match. + is required by the {ip,nf}tables socket match. + +config NF_TPROXY_IPV4 + tristate "IPv4 tproxy support" if NF_TABLES @@ -129,10 +132,7 @@ config NFT_CHAIN_NAT_IPV4 source and destination ports. config NF_NAT_MASQUERADE_IPV4 - tristate "IPv4 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection). + bool config NFT_MASQ_IPV4 tristate "IPv4 masquerading support for nf_tables" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 0e5edd0c7926..8394c17c269f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -10,12 +10,14 @@ nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o +obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o # logging obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o @@ -32,9 +34,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o - - # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index f538c5001547..ad3aeff152ed 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -7,7 +7,6 @@ */ #include <linux/types.h> -#include <linux/module.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/ip.h> @@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void) unregister_inetaddr_notifier(&masq_inet_notifier); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c new file mode 100644 index 000000000000..805e83ec3ad9 --- /dev/null +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <net/netfilter/nf_tproxy.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/inet_sock.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <linux/inetdevice.h> + +struct sock * +nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NF_TPROXY_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} +EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); + +__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) +{ + struct in_device *indev; + __be32 laddr; + + if (user_laddr) + return user_laddr; + + laddr = 0; + indev = __in_dev_get_rcu(skb->dev); + for_primary_ifa(indev) { + laddr = ifa->ifa_local; + break; + } endfor_ifa(indev); + + return laddr ? laddr : daddr; +} +EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); + +struct sock * +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + struct tcphdr *tcph; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NF_TPROXY_LOOKUP_LISTENER: + tcph = hp; + sk = inet_lookup_listener(net, &tcp_hashinfo, skb, + ip_hdrlen(skb) + + __tcp_hdrlen(tcph), + saddr, sport, + daddr, dport, + in->ifindex, 0); + + if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NF_TPROXY_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && + (!connected || wildcard)) || + (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} +EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d2eed3ddcb0a..d06247ba08b2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -30,6 +30,7 @@ static int zero; static int one = 1; +static int two = 2; static int four = 4; static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; @@ -845,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_tw_reuse, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "tcp_max_tw_buckets", diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1191cac72109..355d3dffd021 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -254,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -263,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode((struct sock *)tp, 1); + tcp_enter_quickack_mode(sk, 1); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn((struct sock *)tp)) - tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (tcp_ca_needs_ecn(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode((struct sock *)tp, 1); + tcp_enter_quickack_mode(sk, 1); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn((struct sock *)tp)) - tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); + if (tcp_ca_needs_ecn(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; } } -static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) { - if (tp->ecn_flags & TCP_ECN_OK) - __tcp_ecn_check_ce(tp, skb); + if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) + __tcp_ecn_check_ce(sk, skb); } static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) @@ -710,7 +712,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } icsk->icsk_ack.lrcvtime = now; - tcp_ecn_check_ce(tp, skb); + tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); @@ -4434,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) u32 seq, end_seq; bool fragstolen; - tcp_ecn_check_ce(tp, skb); + tcp_ecn_check_ce(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -5390,11 +5392,11 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { - unsigned int len = skb->len; + const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); + unsigned int len = skb->len; /* TCP congestion window tracking */ trace_tcp_probe(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index adbdb503db0c..633963e228bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { + const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); + int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse; + + if (reuse == 2) { + /* Still does not detect *everything* that goes through + * lo, since we require a loopback src or dst address + * or direct binding to 'lo' interface. + */ + bool loopback = false; + if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX) + loopback = true; +#if IS_ENABLED(CONFIG_IPV6) + if (tw->tw_family == AF_INET6) { + if (ipv6_addr_loopback(&tw->tw_v6_daddr) || + (ipv6_addr_v4mapped(&tw->tw_v6_daddr) && + (tw->tw_v6_daddr.s6_addr[12] == 127)) || + ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) || + (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) && + (tw->tw_v6_rcv_saddr.s6_addr[12] == 127))) + loopback = true; + } else +#endif + { + if (ipv4_is_loopback(tw->tw_daddr) || + ipv4_is_loopback(tw->tw_rcv_saddr)) + loopback = true; + } + if (!loopback) + reuse = 0; + } /* With PAWS, it is safe from the viewpoint of data integrity. Even without PAWS it is safe provided sequence @@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && - get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -1486,7 +1515,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); return 0; } @@ -2529,7 +2558,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_orphan_retries = 0; net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; - net->ipv4.sysctl_tcp_tw_reuse = 0; + net->ipv4.sysctl_tcp_tw_reuse = 2; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f867658b4b30..1dda1341a223 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -307,7 +307,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (timeo < rto) timeo = rto; - tw->tw_timeout = TCP_TIMEWAIT_LEN; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3c27d00b5730..9c41a45a9d39 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ -#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ - IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \ - IS_ENABLED(CONFIG_NF_SOCKET_IPV4) +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fbfd71a2d9c8..5596d87952d4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -986,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * -ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, - const struct in6_addr *peer_addr, int pfxlen, - int scope, u32 flags, u32 valid_lft, u32 prefered_lft, +ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, bool can_block, struct netlink_ext_ack *extack) { gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; + int addr_type = ipv6_addr_type(cfg->pfx); struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct fib6_info *f6i = NULL; int err = 0; - int addr_type = ipv6_addr_type(addr); if (addr_type == IPV6_ADDR_ANY || addr_type & IPV6_ADDR_MULTICAST || @@ -1019,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, */ if (can_block) { struct in6_validator_info i6vi = { - .i6vi_addr = *addr, + .i6vi_addr = *cfg->pfx, .i6vi_dev = idev, .extack = extack, }; @@ -1036,7 +1034,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags); + f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); f6i = NULL; @@ -1049,21 +1047,22 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, neigh_parms_data_state_setall(idev->nd_parms); - ifa->addr = *addr; - if (peer_addr) - ifa->peer_addr = *peer_addr; + ifa->addr = *cfg->pfx; + if (cfg->peer_pfx) + ifa->peer_addr = *cfg->peer_pfx; spin_lock_init(&ifa->lock); INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); INIT_HLIST_NODE(&ifa->addr_lst); - ifa->scope = scope; - ifa->prefix_len = pfxlen; - ifa->flags = flags; + ifa->scope = cfg->scope; + ifa->prefix_len = cfg->plen; + ifa->rt_priority = cfg->rt_priority; + ifa->flags = cfg->ifa_flags; /* No need to add the TENTATIVE flag for addresses with NODAD */ - if (!(flags & IFA_F_NODAD)) + if (!(cfg->ifa_flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; - ifa->valid_lft = valid_lft; - ifa->prefered_lft = prefered_lft; + ifa->valid_lft = cfg->valid_lft; + ifa->prefered_lft = cfg->preferred_lft; ifa->cstamp = ifa->tstamp = jiffies; ifa->tokenized = false; @@ -1260,11 +1259,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; - unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; + unsigned long tmp_tstamp, age; unsigned long regen_advance; - int tmp_plen; + struct ifa6_config cfg; int ret = 0; - u32 addr_flags; unsigned long now = jiffies; long max_desync_factor; s32 cnf_temp_preferred_lft; @@ -1326,13 +1324,12 @@ retry: } } - tmp_valid_lft = min_t(__u32, - ifp->valid_lft, + cfg.valid_lft = min_t(__u32, ifp->valid_lft, idev->cnf.temp_valid_lft + age); - tmp_prefered_lft = cnf_temp_preferred_lft + age - - idev->desync_factor; - tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft); - tmp_plen = ifp->prefix_len; + cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; + cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft); + + cfg.plen = ifp->prefix_len; tmp_tstamp = ifp->tstamp; spin_unlock_bh(&ifp->lock); @@ -1346,21 +1343,23 @@ retry: * temporary addresses being generated. */ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; - if (tmp_prefered_lft <= regen_advance + age) { + if (cfg.preferred_lft <= regen_advance + age) { in6_ifa_put(ifp); in6_dev_put(idev); ret = -1; goto out; } - addr_flags = IFA_F_TEMPORARY; + cfg.ifa_flags = IFA_F_TEMPORARY; /* set in addrconf_prefix_rcv() */ if (ifp->flags & IFA_F_OPTIMISTIC) - addr_flags |= IFA_F_OPTIMISTIC; + cfg.ifa_flags |= IFA_F_OPTIMISTIC; + + cfg.pfx = &addr; + cfg.scope = ipv6_addr_scope(cfg.pfx); + cfg.rt_priority = 0; - ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, - ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft, block, NULL); + ift = ipv6_add_addr(idev, &cfg, block, NULL); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2031,13 +2030,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) spin_lock_bh(&ifp->lock); if (ifp->flags & IFA_F_STABLE_PRIVACY) { - int scope = ifp->scope; - u32 flags = ifp->flags; struct in6_addr new_addr; struct inet6_ifaddr *ifp2; - u32 valid_lft, preferred_lft; - int pfxlen = ifp->prefix_len; int retries = ifp->stable_privacy_retry + 1; + struct ifa6_config cfg = { + .pfx = &new_addr, + .plen = ifp->prefix_len, + .ifa_flags = ifp->flags, + .valid_lft = ifp->valid_lft, + .preferred_lft = ifp->prefered_lft, + .scope = ifp->scope, + }; if (retries > net->ipv6.sysctl.idgen_retries) { net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n", @@ -2050,9 +2053,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) idev)) goto errdad; - valid_lft = ifp->valid_lft; - preferred_lft = ifp->prefered_lft; - spin_unlock_bh(&ifp->lock); if (idev->cnf.max_addresses && @@ -2063,9 +2063,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n", ifp->idev->dev->name); - ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, - scope, flags, valid_lft, - preferred_lft, false, NULL); + ifp2 = ipv6_add_addr(idev, &cfg, false, NULL); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2253,6 +2251,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_ieee1394(eui, dev); case ARPHRD_TUNNEL6: case ARPHRD_IP6GRE: + case ARPHRD_RAWIP: return addrconf_ifid_ip6tnl(eui, dev); } return -1; @@ -2318,12 +2317,13 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad */ static void -addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags, gfp_t gfp_flags) +addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric, + struct net_device *dev, unsigned long expires, + u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, - .fc_metric = IP6_RT_PRIO_ADDRCONF, + .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF, .fc_ifindex = dev->ifindex, .fc_expires = expires, .fc_dst_len = plen, @@ -2507,12 +2507,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; + struct ifa6_config cfg = { + .pfx = addr, + .plen = pinfo->prefix_len, + .ifa_flags = addr_flags, + .valid_lft = valid_lft, + .preferred_lft = prefered_lft, + .scope = addr_type & IPV6_ADDR_SCOPE_MASK, + }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if ((net->ipv6.devconf_all->optimistic_dad || in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) - addr_flags |= IFA_F_OPTIMISTIC; + cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif /* Do not allow to create too much of autoconfigured @@ -2520,11 +2528,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, */ if (!max_addresses || ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, addr, NULL, - pinfo->prefix_len, - addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags, valid_lft, - prefered_lft, false, NULL); + ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2683,7 +2687,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) expires = jiffies_to_clock_t(rt_expires); } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, expires, flags, GFP_ATOMIC); + 0, dev, expires, flags, + GFP_ATOMIC); } fib6_info_release(rt); } @@ -2830,10 +2835,7 @@ static int ipv6_mc_config(struct sock *sk, bool join, * Manual configuration of address on an interface */ static int inet6_addr_add(struct net *net, int ifindex, - const struct in6_addr *pfx, - const struct in6_addr *peer_pfx, - unsigned int plen, __u32 ifa_flags, - __u32 prefered_lft, __u32 valid_lft, + struct ifa6_config *cfg, struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; @@ -2841,19 +2843,18 @@ static int inet6_addr_add(struct net *net, int ifindex, struct net_device *dev; unsigned long timeout; clock_t expires; - int scope; u32 flags; ASSERT_RTNL(); - if (plen > 128) + if (cfg->plen > 128) return -EINVAL; /* check the lifetime */ - if (!valid_lft || prefered_lft > valid_lft) + if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) return -EINVAL; - if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64) + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) return -EINVAL; dev = __dev_get_by_index(net, ifindex); @@ -2864,41 +2865,40 @@ static int inet6_addr_add(struct net *net, int ifindex, if (IS_ERR(idev)) return PTR_ERR(idev); - if (ifa_flags & IFA_F_MCAUTOJOIN) { + if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, - true, pfx, ifindex); + true, cfg->pfx, ifindex); if (ret < 0) return ret; } - scope = ipv6_addr_scope(pfx); + cfg->scope = ipv6_addr_scope(cfg->pfx); - timeout = addrconf_timeout_fixup(valid_lft, HZ); + timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); if (addrconf_finite_timeout(timeout)) { expires = jiffies_to_clock_t(timeout * HZ); - valid_lft = timeout; + cfg->valid_lft = timeout; flags = RTF_EXPIRES; } else { expires = 0; flags = 0; - ifa_flags |= IFA_F_PERMANENT; + cfg->ifa_flags |= IFA_F_PERMANENT; } - timeout = addrconf_timeout_fixup(prefered_lft, HZ); + timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) - ifa_flags |= IFA_F_DEPRECATED; - prefered_lft = timeout; + cfg->ifa_flags |= IFA_F_DEPRECATED; + cfg->preferred_lft = timeout; } - ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft, true, extack); - + ifp = ipv6_add_addr(idev, cfg, true, extack); if (!IS_ERR(ifp)) { - if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags, GFP_KERNEL); + if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->rt_priority, dev, expires, + flags, GFP_KERNEL); } /* Send a netlink notification if DAD is enabled and @@ -2912,15 +2912,15 @@ static int inet6_addr_add(struct net *net, int ifindex, * manually configured addresses */ addrconf_dad_start(ifp); - if (ifa_flags & IFA_F_MANAGETEMPADDR) - manage_tempaddrs(idev, ifp, valid_lft, prefered_lft, - true, jiffies); + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR) + manage_tempaddrs(idev, ifp, cfg->valid_lft, + cfg->preferred_lft, true, jiffies); in6_ifa_put(ifp); addrconf_verify_rtnl(); return 0; - } else if (ifa_flags & IFA_F_MCAUTOJOIN) { - ipv6_mc_config(net->ipv6.mc_autojoin_sk, - false, pfx, ifindex); + } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, + cfg->pfx, ifindex); } return PTR_ERR(ifp); @@ -2971,6 +2971,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, int addrconf_add_ifaddr(struct net *net, void __user *arg) { + struct ifa6_config cfg = { + .ifa_flags = IFA_F_PERMANENT, + .preferred_lft = INFINITY_LIFE_TIME, + .valid_lft = INFINITY_LIFE_TIME, + }; struct in6_ifreq ireq; int err; @@ -2980,10 +2985,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; + cfg.pfx = &ireq.ifr6_addr; + cfg.plen = ireq.ifr6_prefixlen; + rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, - ireq.ifr6_prefixlen, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL); + err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL); rtnl_unlock(); return err; } @@ -3010,11 +3016,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int plen, int scope) { struct inet6_ifaddr *ifp; + struct ifa6_config cfg = { + .pfx = addr, + .plen = plen, + .ifa_flags = IFA_F_PERMANENT, + .valid_lft = INFINITY_LIFE_TIME, + .preferred_lft = INFINITY_LIFE_TIME, + .scope = scope + }; - ifp = ipv6_add_addr(idev, addr, NULL, plen, - scope, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, - true, NULL); + ifp = ipv6_add_addr(idev, &cfg, true, NULL); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3051,7 +3062,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, &addr, plen, scope); - addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags, + addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_ATOMIC); return; } @@ -3076,8 +3087,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) } add_addr(idev, &addr, plen, flag); - addrconf_prefix_route(&addr, plen, idev->dev, 0, - pflags, GFP_ATOMIC); + addrconf_prefix_route(&addr, plen, 0, idev->dev, + 0, pflags, GFP_ATOMIC); } } } @@ -3104,20 +3115,26 @@ static void init_loopback(struct net_device *dev) void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags) { + struct ifa6_config cfg = { + .pfx = addr, + .plen = 64, + .ifa_flags = flags | IFA_F_PERMANENT, + .valid_lft = INFINITY_LIFE_TIME, + .preferred_lft = INFINITY_LIFE_TIME, + .scope = IFA_LINK + }; struct inet6_ifaddr *ifp; - u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) - addr_flags |= IFA_F_OPTIMISTIC; + cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif - ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); + ifp = ipv6_add_addr(idev, &cfg, true, NULL); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev, 0, 0, GFP_ATOMIC); addrconf_dad_start(ifp); in6_ifa_put(ifp); @@ -3233,7 +3250,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, + addrconf_prefix_route(&addr, 64, 0, idev->dev, 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_EUI64: @@ -3244,7 +3261,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, + addrconf_prefix_route(&addr, 64, 0, idev->dev, 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_NONE: @@ -3270,7 +3287,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IP6GRE) && (dev->type != ARPHRD_IPGRE) && (dev->type != ARPHRD_TUNNEL) && - (dev->type != ARPHRD_NONE)) { + (dev->type != ARPHRD_NONE) && + (dev->type != ARPHRD_RAWIP)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -3364,7 +3382,8 @@ static int fixup_permanent_addr(struct net *net, if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0, GFP_ATOMIC); + ifp->rt_priority, idev->dev, 0, 0, + GFP_ATOMIC); } if (ifp->state == INET6_IFADDR_STATE_PREDAD) @@ -4484,6 +4503,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .len = sizeof(u32) }, + [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, }; static int @@ -4516,8 +4536,38 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifm->ifa_prefixlen); } -static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, - u32 prefered_lft, u32 valid_lft) +static int modify_prefix_route(struct inet6_ifaddr *ifp, + unsigned long expires, u32 flags) +{ + struct fib6_info *f6i; + + f6i = addrconf_get_prefix_route(&ifp->addr, + ifp->prefix_len, + ifp->idev->dev, + 0, RTF_GATEWAY | RTF_DEFAULT); + if (!f6i) + return -ENOENT; + + if (f6i->fib6_metric != ifp->rt_priority) { + /* add new one */ + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->rt_priority, ifp->idev->dev, + expires, flags, GFP_KERNEL); + /* delete old one */ + ip6_del_rt(dev_net(ifp->idev->dev), f6i); + } else { + if (!expires) + fib6_clean_expires(f6i); + else + fib6_set_expires(f6i, expires); + + fib6_info_release(f6i); + } + + return 0; +} + +static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) { u32 flags; clock_t expires; @@ -4527,32 +4577,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ASSERT_RTNL(); - if (!valid_lft || (prefered_lft > valid_lft)) + if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) return -EINVAL; - if (ifa_flags & IFA_F_MANAGETEMPADDR && + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) return -EINVAL; if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) - ifa_flags &= ~IFA_F_OPTIMISTIC; + cfg->ifa_flags &= ~IFA_F_OPTIMISTIC; - timeout = addrconf_timeout_fixup(valid_lft, HZ); + timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); if (addrconf_finite_timeout(timeout)) { expires = jiffies_to_clock_t(timeout * HZ); - valid_lft = timeout; + cfg->valid_lft = timeout; flags = RTF_EXPIRES; } else { expires = 0; flags = 0; - ifa_flags |= IFA_F_PERMANENT; + cfg->ifa_flags |= IFA_F_PERMANENT; } - timeout = addrconf_timeout_fixup(prefered_lft, HZ); + timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) - ifa_flags |= IFA_F_DEPRECATED; - prefered_lft = timeout; + cfg->ifa_flags |= IFA_F_DEPRECATED; + cfg->preferred_lft = timeout; } spin_lock_bh(&ifp->lock); @@ -4562,19 +4612,30 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE); - ifp->flags |= ifa_flags; + ifp->flags |= cfg->ifa_flags; ifp->tstamp = jiffies; - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; + ifp->valid_lft = cfg->valid_lft; + ifp->prefered_lft = cfg->preferred_lft; + + if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) + ifp->rt_priority = cfg->rt_priority; spin_unlock_bh(&ifp->lock); if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); - if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - ifp->idev->dev, expires, flags, - GFP_KERNEL); + if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { + int rc = -ENOENT; + + if (had_prefixroute) + rc = modify_prefix_route(ifp, expires, flags); + + /* prefix route could have been deleted; if so restore it */ + if (rc == -ENOENT) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->rt_priority, ifp->idev->dev, + expires, flags, GFP_KERNEL); + } } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; @@ -4590,10 +4651,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, } if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { - if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) - valid_lft = prefered_lft = 0; - manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft, - !was_managetempaddr, jiffies); + if (was_managetempaddr && + !(ifp->flags & IFA_F_MANAGETEMPADDR)) { + cfg->valid_lft = 0; + cfg->preferred_lft = 0; + } + manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, + cfg->preferred_lft, !was_managetempaddr, + jiffies); } addrconf_verify_rtnl(); @@ -4608,12 +4673,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx, *peer_pfx; + struct in6_addr *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; struct inet6_dev *idev; - u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; - u32 ifa_flags; + struct ifa6_config cfg; int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, @@ -4621,60 +4685,70 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + memset(&cfg, 0, sizeof(cfg)); + ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); - if (!pfx) + cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); + if (!cfg.pfx) return -EINVAL; + cfg.peer_pfx = peer_pfx; + cfg.plen = ifm->ifa_prefixlen; + if (tb[IFA_RT_PRIORITY]) + cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + + cfg.valid_lft = INFINITY_LIFE_TIME; + cfg.preferred_lft = INFINITY_LIFE_TIME; + if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); - valid_lft = ci->ifa_valid; - preferred_lft = ci->ifa_prefered; - } else { - preferred_lft = INFINITY_LIFE_TIME; - valid_lft = INFINITY_LIFE_TIME; + cfg.valid_lft = ci->ifa_valid; + cfg.preferred_lft = ci->ifa_prefered; } dev = __dev_get_by_index(net, ifm->ifa_index); if (!dev) return -ENODEV; - ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + if (tb[IFA_FLAGS]) + cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); + else + cfg.ifa_flags = ifm->ifa_flags; /* We ignore other flags so far. */ - ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | + IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | + IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); if (IS_ERR(idev)) return PTR_ERR(idev); if (!ipv6_allow_optimistic_dad(net, idev)) - ifa_flags &= ~IFA_F_OPTIMISTIC; + cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; - if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) { + if (cfg.ifa_flags & IFA_F_NODAD && + cfg.ifa_flags & IFA_F_OPTIMISTIC) { NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); return -EINVAL; } - ifa = ipv6_get_ifaddr(net, pfx, dev, 1); + ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1); if (!ifa) { /* * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, - ifm->ifa_prefixlen, ifa_flags, - preferred_lft, valid_lft, extack); + return inet6_addr_add(net, ifm->ifa_index, &cfg, extack); } if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) err = -EEXIST; else - err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft); + err = inet6_addr_modify(ifa, &cfg); in6_ifa_put(ifa); @@ -4725,7 +4799,8 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(16) /* IFA_LOCAL */ + nla_total_size(16) /* IFA_ADDRESS */ + nla_total_size(sizeof(struct ifa_cacheinfo)) - + nla_total_size(4) /* IFA_FLAGS */; + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(4) /* IFA_RT_PRIORITY */; } static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, @@ -4771,6 +4846,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0) goto error; + if (ifa->rt_priority && + nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority)) + goto error; + if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) goto error; @@ -5615,7 +5694,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) - addrconf_prefix_route(&ifp->peer_addr, 128, + addrconf_prefix_route(&ifp->peer_addr, 128, 0, ifp->idev->dev, 0, 0, GFP_ATOMIC); break; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 60b0d1652448..021e5aef6ba3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -482,7 +482,8 @@ int ip6_forward(struct sk_buff *skb) send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { + if (IP6CB(skb)->iif == dst->dev->ifindex && + opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; struct rt6_info *rt; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index da66aaac51ce..00e138a44cbb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1692,8 +1692,13 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) if (new_mtu < ETH_MIN_MTU) return -EINVAL; } - if (new_mtu > 0xFFF8 - dev->hard_header_len) - return -EINVAL; + if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { + if (new_mtu > IP6_MAX_MTU - dev->hard_header_len) + return -EINVAL; + } else { + if (new_mtu > IP_MAX_MTU - dev->hard_header_len) + return -EINVAL; + } dev->mtu = new_mtu; return 0; } @@ -1841,7 +1846,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = 0xFFF8 - dev->hard_header_len; + dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 20a419ee8000..058fc05e5708 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -228,8 +228,8 @@ static int __net_init ip6mr_rules_init(struct net *net) INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); - if (!mrt) { - err = -ENOMEM; + if (IS_ERR(mrt)) { + err = PTR_ERR(mrt); goto err1; } @@ -302,8 +302,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, static int __net_init ip6mr_rules_init(struct net *net) { - net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); - return net->ipv6.mrt6 ? 0 : -ENOMEM; + struct mr_table *mrt; + + mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); + if (IS_ERR(mrt)) + return PTR_ERR(mrt); + net->ipv6.mrt6 = mrt; + return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) @@ -1758,9 +1763,11 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns rtnl_lock(); ret = 0; - if (!ip6mr_new_table(net, v)) - ret = -ENOMEM; - raw6_sk(sk)->ip6mr_table = v; + mrt = ip6mr_new_table(net, v); + if (IS_ERR(mrt)) + ret = PTR_ERR(mrt); + else + raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 9ac5366064e3..e640d2f3c55c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1578,6 +1578,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; + if (netif_is_l3_master(skb->dev)) { + dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (!dev) + return; + } + if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ce77bcc2490c..37b14dc9d863 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -29,7 +29,10 @@ config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help This option enables the IPv6 socket lookup infrastructure. This - is used by the ip6tables socket match. + is used by the {ip6,nf}tables socket match. + +config NF_TPROXY_IPV6 + tristate "IPv6 tproxy support" if NF_TABLES @@ -136,10 +139,7 @@ config NF_NAT_IPV6 if NF_NAT_IPV6 config NF_NAT_MASQUERADE_IPV6 - tristate "IPv6 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection) for IPv6. + bool endif # NF_NAT_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 44273d6f03a5..10a5a1c87320 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -18,14 +18,15 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o +nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o -obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o +obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o # logging obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 9dfc2b90c362..e6eb7cf9b54f 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -10,7 +10,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/atomic.h> #include <linux/netdevice.h> #include <linux/ipv6.h> @@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void) unregister_netdevice_notifier(&masq_dev_notifier); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c new file mode 100644 index 000000000000..bf1d6c421e3b --- /dev/null +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -0,0 +1,146 @@ +#include <net/netfilter/nf_tproxy.h> +#include <linux/module.h> +#include <net/inet6_hashtables.h> +#include <net/addrconf.h> +#include <net/udp.h> +#include <net/tcp.h> + +const struct in6_addr * +nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr) +{ + struct inet6_dev *indev; + struct inet6_ifaddr *ifa; + struct in6_addr *laddr; + + if (!ipv6_addr_any(user_laddr)) + return user_laddr; + laddr = NULL; + + indev = __in6_dev_get(skb->dev); + if (indev) { + read_lock_bh(&indev->lock); + list_for_each_entry(ifa, &indev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + + laddr = &ifa->addr; + break; + } + read_unlock_bh(&indev->lock); + } + + return laddr ? laddr : daddr; +} +EXPORT_SYMBOL_GPL(nf_tproxy_laddr6); + +struct sock * +nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + struct net *net, + const struct in6_addr *laddr, + const __be16 lport, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto, + &iph->saddr, + nf_tproxy_laddr6(skb, laddr, &iph->daddr), + hp->source, + lport ? lport : hp->dest, + skb->dev, NF_TPROXY_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} +EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6); + +struct sock * +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + struct tcphdr *tcph; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NF_TPROXY_LOOKUP_LISTENER: + tcph = hp; + sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + thoff + __tcp_hdrlen(tcph), + saddr, sport, + daddr, ntohs(dport), + in->ifindex, 0); + + if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NF_TPROXY_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex, 0); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support"); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 22c4de2317d0..1dc98715c78b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1981,7 +1981,7 @@ out: } else { keys->addrs.v6addrs.src = key_iph->saddr; keys->addrs.v6addrs.dst = key_iph->daddr; - keys->tags.flow_label = ip6_flowinfo(key_iph); + keys->tags.flow_label = ip6_flowlabel(key_iph); keys->basic.ip_proto = key_iph->nexthdr; } } @@ -2002,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, } else { hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } break; @@ -4412,13 +4412,17 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - rt_last = nh->fib6_info; err = __ip6_ins_rt(nh->fib6_info, info, extack); fib6_info_release(nh->fib6_info); - /* save reference to first route for notification */ - if (!rt_notif && !err) - rt_notif = nh->fib6_info; + if (!err) { + /* save reference to last route successfully inserted */ + rt_last = nh->fib6_info; + + /* save reference to first route for notification */ + if (!rt_notif) + rt_notif = nh->fib6_info; + } /* nh->fib6_info is used or freed at this point, reset to NULL*/ nh->fib6_info = NULL; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 7f5621d09571..0fdf2a55e746 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -226,7 +226,6 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) nla_put_failure: rcu_read_unlock(); - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -ENOMEM; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index eab39bd91548..19ccf0dc996c 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -122,7 +122,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = skb_cow_head(skb, tot_len); + err = skb_cow_head(skb, tot_len + skb->mac_len); if (unlikely(err)) return err; @@ -181,7 +181,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen); + err = skb_cow_head(skb, hdrlen + skb->mac_len); if (unlikely(err)) return err; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2afce37a7177..e9400ffa7875 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1371,7 +1371,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; - dev->max_mtu = 0xFFF8 - t_hlen; + dev->max_mtu = IP6_MAX_MTU - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; @@ -1583,7 +1583,8 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - if (mtu >= IPV6_MIN_MTU && mtu <= 0xFFF8 - dev->hard_header_len) + if (mtu >= IPV6_MIN_MTU && + mtu <= IP6_MAX_MTU - dev->hard_header_len) dev->mtu = mtu; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7d47c2b550a9..8764a63abd91 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1322,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9f729a7b8cf0..19d7d4c24dfb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -285,9 +285,7 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ -#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ - IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \ - IS_ENABLED(CONFIG_NF_SOCKET_IPV6) +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6) struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 2cff209d0fc1..ef3defaf43b9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -124,7 +124,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi6 *fl6 = &fl->u.ip6; int onlyproto = 0; const struct ipv6hdr *hdr = ipv6_hdr(skb); - u16 offset = sizeof(*hdr); + u32 offset = sizeof(*hdr); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u16 nhoff = IP6CB(skb)->nhoff; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index dc76bc346829..d3601d421571 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1671,7 +1671,7 @@ static struct file *kcm_clone(struct socket *osock) __module_get(newsock->ops->owner); newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, - &kcm_proto, true); + &kcm_proto, false); if (!newsk) { sock_release(newsock); return ERR_PTR(-ENOMEM); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f951c768dcf2..f1c9c327f674 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -428,16 +428,6 @@ static void pppol2tp_put_sk(struct rcu_head *head) */ static void pppol2tp_session_close(struct l2tp_session *session) { - struct pppol2tp_session *ps; - - ps = l2tp_session_priv(session); - mutex_lock(&ps->sk_lock); - ps->__sk = rcu_dereference_protected(ps->sk, - lockdep_is_held(&ps->sk_lock)); - RCU_INIT_POINTER(ps->sk, NULL); - if (ps->__sk) - call_rcu(&ps->rcu, pppol2tp_put_sk); - mutex_unlock(&ps->sk_lock); } /* Really kill the session socket. (Called from sock_put() if @@ -480,15 +470,24 @@ static int pppol2tp_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; - /* If the socket is associated with a session, - * l2tp_session_delete will call pppol2tp_session_close which - * will drop the session's ref on the socket. - */ session = pppol2tp_sock_to_session(sk); if (session) { + struct pppol2tp_session *ps; + l2tp_session_delete(session); - /* drop the ref obtained by pppol2tp_sock_to_session */ - sock_put(sk); + + ps = l2tp_session_priv(session); + mutex_lock(&ps->sk_lock); + ps->__sk = rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock)); + RCU_INIT_POINTER(ps->sk, NULL); + mutex_unlock(&ps->sk_lock); + call_rcu(&ps->rcu, pppol2tp_put_sk); + + /* Rely on the sock_put() call at the end of the function for + * dropping the reference held by pppol2tp_sock_to_session(). + * The last reference will be dropped by pppol2tp_put_sk(). + */ } release_sock(sk); @@ -742,7 +741,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, */ mutex_lock(&ps->sk_lock); if (rcu_dereference_protected(ps->sk, - lockdep_is_held(&ps->sk_lock))) { + lockdep_is_held(&ps->sk_lock)) || + ps->__sk) { mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; @@ -803,7 +803,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, out_no_ppp: /* This is how we get the session context from the socket. */ - sock_hold(sk); sk->sk_user_data = session; rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index b09ef77bf4cd..82e6edf9c5d9 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -201,7 +201,6 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) return genlmsg_reply(skb, info); err: - genlmsg_cancel(skb, hdr); kfree_skb(skb); return rc; } @@ -209,7 +208,7 @@ err: static int ncsi_pkg_info_all_nl(struct sk_buff *skb, struct netlink_callback *cb) { - struct nlattr *attrs[NCSI_ATTR_MAX]; + struct nlattr *attrs[NCSI_ATTR_MAX + 1]; struct ncsi_package *np, *package; struct ncsi_dev_priv *ndp; unsigned int package_id; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index a6b7c7d5c829..930c1d3796f0 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -652,7 +652,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) NCSI_CAP_VLAN_MASK; size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN; - nc->mac_filter.addrs = kzalloc(size, GFP_KERNEL); + nc->mac_filter.addrs = kzalloc(size, GFP_ATOMIC); if (!nc->mac_filter.addrs) return -ENOMEM; nc->mac_filter.n_uc = rsp->uc_cnt; @@ -661,7 +661,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt, sizeof(*nc->vlan_filter.vids), - GFP_KERNEL); + GFP_ATOMIC); if (!nc->vlan_filter.vids) return -ENOMEM; /* Set VLAN filters active so they are cleared in the first diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a5b60e6a983e..dbd7d1fad277 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -433,11 +433,7 @@ config NF_NAT_TFTP default NF_NAT && NF_CONNTRACK_TFTP config NF_NAT_REDIRECT - tristate "IPv4/IPv6 redirect support" - depends on NF_NAT - help - This is the kernel functionality to redirect packets to local - machine through NAT. + bool config NETFILTER_SYNPROXY tristate @@ -521,6 +517,15 @@ config NFT_COUNTER This option adds the "counter" expression that you can use to include packet and byte counters in a rule. +config NFT_CONNLIMIT + tristate "Netfilter nf_tables connlimit module" + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_CONNCOUNT + help + This option adds the "connlimit" expression that you can use to + ratelimit rule matchings per connections. + config NFT_LOG tristate "Netfilter nf_tables log module" help @@ -617,6 +622,15 @@ config NFT_FIB_INET The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_SOCKET + tristate "Netfilter nf_tables socket match support" + depends on IPV6 || IPV6=n + select NF_SOCKET_IPV4 + select NF_SOCKET_IPV6 if IPV6 + help + This option allows matching for the presence or absence of a + corresponding socket and its attributes. + if NF_TABLES_NETDEV config NF_DUP_NETDEV @@ -984,6 +998,8 @@ config NETFILTER_XT_TARGET_TPROXY depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n + select NF_TPROXY_IPV4 + select NF_TPROXY_IPV6 if IP6_NF_IPTABLES help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. It can only be used in the mangle table and is useful diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1aa710b5d384..44449389e527 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o obj-$(CONFIG_NF_NAT) += nf_nat.o -obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o +nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o # NAT helpers obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o @@ -80,6 +80,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o +obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o @@ -102,6 +103,7 @@ obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NF_OSF) += nf_osf.o +obj-$(CONFIG_NFT_SOCKET) += nft_socket.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 1c98c907bc63..12d74896556a 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_out == NULL) return 1; - if (!app->pkt_out(app, cp, skb, &diff)) + if (!app->pkt_out(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL * returns false if it can't handle packet (oom) */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); + return app_tcp_pkt_out(cp, skb, app, ipvsh); /* * Call private output hook function @@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_out == NULL) return 1; - return app->pkt_out(app, cp, skb, NULL); + return app->pkt_out(app, cp, skb, NULL, ipvsh); } static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_in == NULL) return 1; - if (!app->pkt_in(app, cp, skb, &diff)) + if (!app->pkt_in(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL. * returns false if can't handle packet (oom). */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); + return app_tcp_pkt_in(cp, skb, app, ipvsh); /* * Call private input hook function @@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_in == NULL) return 1; - return app->pkt_in(app, cp, skb, NULL); + return app->pkt_in(app, cp, skb, NULL, ipvsh); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d4f68d0f7df7..af89bb5ffac7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2385,8 +2385,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ipvs_sync_daemon_cfg cfg; memset(&cfg, 0, sizeof(cfg)); - strlcpy(cfg.mcast_ifn, dm->mcast_ifn, - sizeof(cfg.mcast_ifn)); + ret = -EINVAL; + if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)) <= 0) + goto out_dec; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { @@ -2424,12 +2426,19 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) } } + if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && + strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == + IP_VS_SCHEDNAME_MAXLEN) { + ret = -EINVAL; + goto out_unlock; + } + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && usvc.protocol != IPPROTO_SCTP) { - pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n", + pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", usvc.protocol, &usvc.addr.ip, - ntohs(usvc.port), usvc.sched_name); + ntohs(usvc.port)); ret = -EFAULT; goto out_unlock; } @@ -2851,7 +2860,7 @@ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, - .len = IP_VS_IFNAME_MAXLEN }, + .len = IP_VS_IFNAME_MAXLEN - 1 }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, @@ -2869,7 +2878,7 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, - .len = IP_VS_SCHEDNAME_MAXLEN }, + .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 58d5d05aec24..4398a72edec5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -29,6 +29,8 @@ #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/ctype.h> +#include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> @@ -44,9 +46,18 @@ #include <net/ip_vs.h> -#define SERVER_STRING "227 " -#define CLIENT_STRING "PORT" +#define SERVER_STRING_PASV "227 " +#define CLIENT_STRING_PORT "PORT" +#define SERVER_STRING_EPSV "229 " +#define CLIENT_STRING_EPRT "EPRT" +enum { + IP_VS_FTP_ACTIVE = 0, + IP_VS_FTP_PORT = 0, + IP_VS_FTP_PASV, + IP_VS_FTP_EPRT, + IP_VS_FTP_EPSV, +}; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); -/* Dummy variable */ -static int ip_vs_ftp_pasv; +static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) +{ + struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); + + if ((th->doff << 2) < sizeof(struct tcphdr)) + return NULL; + return (char *)th + (th->doff << 2); +} static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) @@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) } -/* - * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern", ignoring before "skip" and terminated with - * the "term" character. - * <addr,port> is in network order. +/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern". <addr,port> is in network order. + * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, - char skip, char term, - __be32 *addr, __be16 *port, - char **start, char **end) + char skip, bool ext, int mode, + union nf_inet_addr *addr, __be16 *port, + __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; + char edelim; + __u16 hport; int i = 0; if (data_limit - data < plen) { @@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (s == data_limit) return -1; if (!found) { + /* "(" is optional for non-extended format, + * so catch the start of IPv4 address + */ + if (!ext && isdigit(*s)) + break; if (*s == skip) found = 1; } else if (*s != skip) { @@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, } } } + /* Old IPv4-only format? */ + if (!ext) { + p[0] = 0; + for (data = s; ; data++) { + if (data == data_limit) + return -1; + c = *data; + if (isdigit(c)) { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { + i++; + p[i] = 0; + } else { + /* unexpected character or terminator */ + break; + } + } - for (data = s; ; data++) { - if (data == data_limit) + if (i != 5) return -1; - if (*data == term) - break; + + *start = s; + *end = data; + addr->ip = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); + return 1; } - *end = data; + if (s == data_limit) + return -1; + *start = s; + edelim = *s++; + if (edelim < 33 || edelim > 126) + return -1; + if (s == data_limit) + return -1; + if (*s == edelim) { + /* Address family is usually missing for EPSV response */ + if (mode != IP_VS_FTP_EPSV) + return -1; + s++; + if (s == data_limit) + return -1; + /* Then address should be missing too */ + if (*s != edelim) + return -1; + /* Caller can pre-set addr, if needed */ + s++; + } else { + const char *ep; - memset(p, 0, sizeof(p)); - for (data = s; ; data++) { - c = *data; - if (c == term) - break; - if (c >= '0' && c <= '9') { - p[i] = p[i]*10 + c - '0'; - } else if (c == ',' && i < 5) { - i++; - } else { - /* unexpected character */ + /* We allow address only from same family */ + if (af == AF_INET6 && *s != '2') return -1; + if (af == AF_INET && *s != '1') + return -1; + s++; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; + if (s == data_limit) + return -1; + if (af == AF_INET6) { + if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; + } else { + if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; } + s = (char *) ep; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; } - - if (i != 5) + for (hport = 0; ; s++) + { + if (s == data_limit) + return -1; + if (!isdigit(*s)) + break; + hport = hport * 10 + *s - '0'; + } + if (s == data_limit || !hport || *s != edelim) return -1; - - *start = s; - *addr = get_unaligned((__be32 *) p); - *port = get_unaligned((__be16 *) (p + 4)); + s++; + *end = s; + *port = htons(hport); return 1; } -/* - * Look at outgoing ftp packets to catch the response to a PASV command +/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the @@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + * The extended format for EPSV response provides usually only port: + * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_limit; char *start, *end; union nf_inet_addr from; @@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); + if (cp->app_data == (void *) IP_VS_FTP_PASV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; + if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, - '(', ')', - &from.ip, &port, + SERVER_STRING_PASV, + sizeof(SERVER_STRING_PASV)-1, + '(', false, IP_VS_FTP_PASV, + &from, &port, cp->af, &start, &end) != 1) return 1; - IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", + IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); - /* - * Now update or create an connection entry for it + if (!data || data >= data_limit) + return 1; + + /* Usually, data address is not specified but + * we support different address, so pre-set it. */ - { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &from, port, - &cp->caddr, 0, &p); - n_cp = ip_vs_conn_out_get(&p); - } - if (!n_cp) { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, - AF_INET, IPPROTO_TCP, &cp->caddr, - 0, &cp->vaddr, port, &p); - /* As above, this is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, - IP_VS_CONN_F_NO_CPORT | - IP_VS_CONN_F_NFCT, - cp->dest, skb->mark); - if (!n_cp) - return 0; + from = cp->daddr; + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING_EPSV, + sizeof(SERVER_STRING_EPSV)-1, + '(', true, IP_VS_FTP_EPSV, + &from, &port, cp->af, + &start, &end) != 1) + return 1; - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } + IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); + } else { + return 1; + } - /* - * Replace the old passive address with the new one - */ + /* Now update or create a connection entry for it */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, + cp->af, ipvsh->protocol, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, cp->af, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* Replace the old passive address with the new one */ + if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", @@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + from = n_cp->vaddr; + port = n_cp->vport; + /* Only port, client will use VIP for the data connection */ + snprintf(buf, sizeof(buf), "|||%u|", + ntohs(port)); + } else { + *buf = 0; + } + buf_len = strlen(buf); - buf_len = strlen(buf); - - ct = nf_ct_get(skb, &ctinfo); - if (ct) { - bool mangled; - - /* If mangling fails this function will return 0 - * which will cause the packet to be dropped. - * Mangling can only fail under memory pressure, - * hopefully it will succeed on the retransmitted - * packet. - */ - mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - iph->ihl * 4, - start - data, - end - start, - buf, buf_len); - if (mangled) { - ip_vs_nfct_expect_related(skb, ct, n_cp, - IPPROTO_TCP, 0, 0); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_UNNECESSARY; - /* csum is updated */ - ret = 1; - } - } + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + bool mangled; - /* - * Not setting 'diff' is intentional, otherwise the sequence - * would be adjusted twice. + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. */ - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; + mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + ipvsh->len, + start - data, + end - start, + buf, buf_len); + if (mangled) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + ipvsh->protocol, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } - return 1; + + /* Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + cp->app_data = (void *) IP_VS_FTP_ACTIVE; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; } -/* - * Look at incoming ftp packets to catch the PASV/PORT command +/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like @@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. + * Extended format: + * "EPSV\r\n" when client requests server address from same family + * "EPSV 1\r\n" when client requests IPv4 server address + * "EPSV 2\r\n" when client requests IPv6 server address + * "EPSV ALL\r\n" - not supported + * EPRT with specified delimiter (ASCII 33..126), "|" by default: + * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport + * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; @@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* no diff required for incoming packets */ *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); + data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; while (data <= data_limit - 6) { - if (strncasecmp(data, "PASV\r\n", 6) == 0) { + if (cp->af == AF_INET && + strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; + cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } + + /* EPSV or EPSV<space><net-prt> */ + if (strncasecmp(data, "EPSV", 4) == 0 && + (data[4] == ' ' || data[4] == '\r')) { + if (data[4] == ' ') { + char proto = data[5]; + + if (data > data_limit - 7 || data[6] != '\r') + return 1; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && proto == '2') { + } else +#endif + if (cp->af == AF_INET && proto == '1') { + } else { + return 1; + } + } + /* Extended Passive mode on */ + IP_VS_DBG(7, "got EPSV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = (void *) IP_VS_FTP_EPSV; + return 1; + } + data++; } @@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, * then create a new connection entry for the coming data * connection. */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - ' ', '\r', &to.ip, &port, - &start, &end) != 1) + if (cp->af == AF_INET && + ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_PORT, + sizeof(CLIENT_STRING_PORT)-1, + ' ', false, IP_VS_FTP_PORT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", + ip_vs_proto_name(ipvsh->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, + ntohs(cp->vport)-1); + } else if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_EPRT, + sizeof(CLIENT_STRING_EPRT)-1, + ' ', true, IP_VS_FTP_EPRT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", + ip_vs_proto_name(ipvsh->protocol), + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)-1); + } else { return 1; - - IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + } /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", - ip_vs_proto_name(iph->protocol), - &to.ip, ntohs(port), &cp->vaddr.ip, 0); + cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &to, port, &cp->vaddr, + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - /* This is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, + n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; - pr_info("%s: loaded support on port[%d] = %d\n", + pr_info("%s: loaded support on port[%d] = %u\n", app->name, i, ports[i]); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 6cf3fd81a5ec..eb8b9c883889 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -67,15 +67,20 @@ #include <net/netfilter/nf_conntrack_zones.h> -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ +#define FMT_TUPLE "%s:%u->%s:%u/%u" +#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \ + ntohs((T)->src.u.all), \ + IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \ + ntohs((T)->dst.u.all), \ (T)->dst.protonum -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ +#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u" +#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \ + ntohs((C)->cport), \ + IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \ + ntohs((C)->vport), \ + IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \ + ntohs((C)->dport), \ (C)->protocol, (C)->state void @@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) new_tuple.dst.protonum != IPPROTO_ICMPV6) new_tuple.dst.u.tcp.port = cp->vport; } - IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " - "ctinfo=%d, old reply=" FMT_TUPLE - ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", - __func__, ct, ct->status, ctinfo, - ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), - ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, new reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&new_tuple)); nf_conntrack_alter_reply(ct, &new_tuple); + IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n", + __func__, ct, ARG_CONN(cp)); } int ip_vs_confirm_conntrack(struct sk_buff *skb) @@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct ip_vs_conn_param p; struct net *net = nf_ct_net(ct); - if (exp->tuple.src.l3num != PF_INET) - return; - /* * We assume that no NF locks are held before this callback. * ip_vs_conn_out_get and ip_vs_conn_in_get should match their @@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.dst.u3 = cp->vaddr; new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } @@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.src.u3 = cp->daddr; new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); + IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); return; alter: @@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, exp->expectfn = ip_vs_nfct_expect_callback; - IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); nf_ct_expect_related(exp); nf_ct_expect_put(exp); } @@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) tuple.dst.u3 = cp->vaddr; tuple.dst.u.all = cp->vport; - IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE - " for conn " FMT_CONN "\n", - __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n", + __func__, ARG_CONN(cp)); h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_kill(ct)) { - IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } else { - IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } nf_ct_put(ct); } else { - IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", - __func__, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); } } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index eff7569824e5..3250c4a1111e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_out(cp, skb); + ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ @@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_in(cp, skb); + ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 569631d2b2a1..80d10ad12a15 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index c15ef7c2a1fa..e0ef11c3691e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, /* * Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 153e690e2893..3b5059a8dcdd 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -79,7 +79,7 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static bool add_hlist(struct hlist_head *head, +bool nf_conncount_add(struct hlist_head *head, const struct nf_conntrack_tuple *tuple) { struct nf_conncount_tuple *conn; @@ -91,12 +91,12 @@ static bool add_hlist(struct hlist_head *head, hlist_add_head(&conn->node, head); return true; } +EXPORT_SYMBOL_GPL(nf_conncount_add); -static unsigned int check_hlist(struct net *net, - struct hlist_head *head, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn; @@ -141,6 +141,7 @@ static unsigned int check_hlist(struct net *net, return length; } +EXPORT_SYMBOL_GPL(nf_conncount_lookup); static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], @@ -187,13 +188,15 @@ count_tree(struct net *net, struct rb_root *root, } else { /* same source network -> be counted! */ unsigned int count; - count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + + count = nf_conncount_lookup(net, &rbconn->hhead, tuple, + zone, &addit); tree_nodes_free(root, gc_nodes, gc_count); if (!addit) return count; - if (!add_hlist(&rbconn->hhead, tuple)) + if (!nf_conncount_add(&rbconn->hhead, tuple)) return 0; /* hotdrop */ return count + 1; @@ -203,7 +206,7 @@ count_tree(struct net *net, struct rb_root *root, continue; /* only used for GC on hhead, retval and 'addit' ignored */ - check_hlist(net, &rbconn->hhead, tuple, zone, &addit); + nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit); if (hlist_empty(&rbconn->hhead)) gc_nodes[gc_count++] = rbconn; } @@ -303,11 +306,19 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family } EXPORT_SYMBOL_GPL(nf_conncount_init); -static void destroy_tree(struct rb_root *r) +void nf_conncount_cache_free(struct hlist_head *hhead) { struct nf_conncount_tuple *conn; - struct nf_conncount_rb *rbconn; struct hlist_node *n; + + hlist_for_each_entry_safe(conn, n, hhead, node) + kmem_cache_free(conncount_conn_cachep, conn); +} +EXPORT_SYMBOL_GPL(nf_conncount_cache_free); + +static void destroy_tree(struct rb_root *r) +{ + struct nf_conncount_rb *rbconn; struct rb_node *node; while ((node = rb_first(r)) != NULL) { @@ -315,8 +326,7 @@ static void destroy_tree(struct rb_root *r) rb_erase(node, r); - hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) - kmem_cache_free(conncount_conn_cachep, conn); + nf_conncount_cache_free(&rbconn->hhead); kmem_cache_free(conncount_rb_cachep, rbconn); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 82451b7e0acb..15ed91309992 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -220,7 +220,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, enum flow_offload_tuple_dir dir; struct flow_offload *flow; struct net_device *outdev; - const struct rtable *rt; + struct rtable *rt; unsigned int thoff; struct iphdr *iph; __be32 nexthop; @@ -241,7 +241,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) @@ -264,6 +264,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); return NF_STOLEN; @@ -480,6 +481,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); return NF_STOLEN; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 821f8d835f7a..b7df32a56e7e 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1036,7 +1036,7 @@ static struct pernet_operations nat_net_ops = { .size = sizeof(struct nat_net), }; -struct nf_nat_hook nat_hook = { +static struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 7c4bb0a773ca..adee04af8d43 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -15,7 +15,6 @@ #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/types.h> @@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 87b2a77add65..ca4c4d994ddb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -28,6 +28,42 @@ static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static u64 table_handle; +enum { + NFT_VALIDATE_SKIP = 0, + NFT_VALIDATE_NEED, + NFT_VALIDATE_DO, +}; + +static u32 nft_chain_hash(const void *data, u32 len, u32 seed); +static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed); +static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *); + +static const struct rhashtable_params nft_chain_ht_params = { + .head_offset = offsetof(struct nft_chain, rhlhead), + .key_offset = offsetof(struct nft_chain, name), + .hashfn = nft_chain_hash, + .obj_hashfn = nft_chain_hash_obj, + .obj_cmpfn = nft_chain_hash_cmp, + .locks_mul = 1, + .automatic_shrinking = true, +}; + +static void nft_validate_state_update(struct net *net, u8 new_validate_state) +{ + switch (net->nft.validate_state) { + case NFT_VALIDATE_SKIP: + WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); + break; + case NFT_VALIDATE_NEED: + break; + case NFT_VALIDATE_DO: + if (new_validate_state == NFT_VALIDATE_NEED) + return; + } + + net->nft.validate_state = new_validate_state; +} + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -373,7 +409,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) @@ -546,6 +582,24 @@ done: return skb->len; } +static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +/* called with rcu_read_lock held */ static int nf_tables_gettable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -561,8 +615,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, + .module = THIS_MODULE, }; - return netlink_dump_start(nlsk, skb, nlh, &c); + + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); @@ -571,7 +627,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -678,6 +734,29 @@ err: return ret; } +static u32 nft_chain_hash(const void *data, u32 len, u32 seed) +{ + const char *name = data; + + return jhash(name, strlen(name), seed); +} + +static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct nft_chain *chain = data; + + return nft_chain_hash(chain->name, 0, seed); +} + +static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct nft_chain *chain = ptr; + const char *name = arg->key; + + return strcmp(chain->name, name); +} + static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -724,6 +803,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->name == NULL) goto err_strdup; + err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); + if (err) + goto err_chain_ht; + INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); @@ -740,6 +823,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, list_add_tail_rcu(&table->list, &net->nft.tables); return 0; err_trans: + rhltable_destroy(&table->chains_ht); +err_chain_ht: kfree(table->name); err_strdup: kfree(table); @@ -880,6 +965,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); + rhltable_destroy(&ctx->table->chains_ht); kfree(ctx->table->name); kfree(ctx->table); } @@ -925,21 +1011,35 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) return ERR_PTR(-ENOENT); } -static struct nft_chain *nft_chain_lookup(const struct nft_table *table, +static struct nft_chain *nft_chain_lookup(struct nft_table *table, const struct nlattr *nla, u8 genmask) { + char search[NFT_CHAIN_MAXNAMELEN + 1]; + struct rhlist_head *tmp, *list; struct nft_chain *chain; if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(chain, &table->chains, list) { - if (!nla_strcmp(nla, chain->name) && - nft_active_genmask(chain, genmask)) - return chain; - } + nla_strlcpy(search, nla, sizeof(search)); - return ERR_PTR(-ENOENT); + WARN_ON(!rcu_read_lock_held() && + !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + + chain = ERR_PTR(-ENOENT); + rcu_read_lock(); + list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params); + if (!list) + goto out_unlock; + + rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) { + if (nft_active_genmask(chain, genmask)) + goto out_unlock; + } + chain = ERR_PTR(-ENOENT); +out_unlock: + rcu_read_unlock(); + return chain; } static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { @@ -1135,6 +1235,7 @@ done: return skb->len; } +/* called with rcu_read_lock held */ static int nf_tables_getchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -1142,8 +1243,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_table *table; const struct nft_chain *chain; + struct nft_table *table; struct sk_buff *skb2; int family = nfmsg->nfgen_family; int err; @@ -1151,8 +1252,10 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, + .module = THIS_MODULE, }; - return netlink_dump_start(nlsk, skb, nlh, &c); + + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); @@ -1167,7 +1270,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -1233,8 +1336,24 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); synchronize_rcu(); free_percpu(oldstats); - } else + } else { rcu_assign_pointer(chain->stats, newstats); + static_branch_inc(&nft_counters_enabled); + } +} + +static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) +{ + struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0); + struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1); + + if (g0 != g1) + kvfree(g1); + kvfree(g0); + + /* should be NULL either via abort or via successful commit */ + WARN_ON_ONCE(chain->rules_next); + kvfree(chain->rules_next); } static void nf_tables_chain_destroy(struct nft_ctx *ctx) @@ -1243,6 +1362,9 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) BUG_ON(chain->use > 0); + /* no concurrent access possible anymore */ + nf_tables_chain_free_chain_rules(chain); + if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); @@ -1335,6 +1457,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) module_put(hook->type->owner); } +struct nft_rules_old { + struct rcu_head h; + struct nft_rule **start; +}; + +static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain, + unsigned int alloc) +{ + if (alloc > INT_MAX) + return NULL; + + alloc += 1; /* NULL, ends rules */ + if (sizeof(struct nft_rule *) > INT_MAX / alloc) + return NULL; + + alloc *= sizeof(struct nft_rule *); + alloc += sizeof(struct nft_rules_old); + + return kvmalloc(alloc, GFP_KERNEL); +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, bool create) { @@ -1344,6 +1487,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats; struct net *net = ctx->net; struct nft_chain *chain; + struct nft_rule **rules; int err; if (table->use == UINT_MAX) @@ -1406,13 +1550,31 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err1; } + rules = nf_tables_chain_alloc_rules(chain, 0); + if (!rules) { + err = -ENOMEM; + goto err1; + } + + *rules = NULL; + rcu_assign_pointer(chain->rules_gen_0, rules); + rcu_assign_pointer(chain->rules_gen_1, rules); + err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err1; + err = rhltable_insert_key(&table->chains_ht, chain->name, + &chain->rhlhead, nft_chain_ht_params); + if (err) + goto err2; + err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); - if (err < 0) + if (err < 0) { + rhltable_remove(&table->chains_ht, &chain->rhlhead, + nft_chain_ht_params); goto err2; + } table->use++; list_add_tail_rcu(&chain->list, &table->chains); @@ -1849,19 +2011,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx, goto err1; } - if (ops->validate) { - const struct nft_data *data = NULL; - - err = ops->validate(ctx, expr, &data); - if (err < 0) - goto err2; - } - return 0; - -err2: - if (ops->destroy) - ops->destroy(ctx, expr); err1: expr->ops = NULL; return err; @@ -1920,7 +2070,7 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { if (handle == rule->handle) return rule; } @@ -2116,6 +2266,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) return 0; } +/* called with rcu_read_lock held */ static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2123,9 +2274,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); - const struct nft_table *table; const struct nft_chain *chain; const struct nft_rule *rule; + struct nft_table *table; struct sk_buff *skb2; int family = nfmsg->nfgen_family; int err; @@ -2134,18 +2285,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_rules, .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, }; if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { struct nft_rule_dump_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); if (!ctx) return -ENOMEM; if (nla[NFTA_RULE_TABLE]) { ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_KERNEL); + GFP_ATOMIC); if (!ctx->table) { kfree(ctx); return -ENOMEM; @@ -2153,7 +2305,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, } if (nla[NFTA_RULE_CHAIN]) { ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_KERNEL); + GFP_ATOMIC); if (!ctx->chain) { kfree(ctx->table); kfree(ctx); @@ -2163,7 +2315,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, c.data = ctx; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); @@ -2184,7 +2336,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return PTR_ERR(rule); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -2225,6 +2377,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, nf_tables_rule_destroy(ctx, rule); } +int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) +{ + struct nft_expr *expr, *last; + const struct nft_data *data; + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &chain->rules, list) { + if (!nft_is_active_next(ctx->net, rule)) + continue; + + nft_rule_for_each_expr(expr, last, rule) { + if (!expr->ops->validate) + continue; + + err = expr->ops->validate(ctx, expr, &data); + if (err < 0) + return err; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_chain_validate); + +static int nft_table_validate(struct net *net, const struct nft_table *table) +{ + struct nft_chain *chain; + struct nft_ctx ctx = { + .net = net, + .family = table->family, + }; + int err; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; + + ctx.chain = chain; + err = nft_chain_validate(&ctx, chain); + if (err < 0) + return err; + } + + return 0; +} + #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -2352,6 +2551,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, err = nf_tables_newexpr(&ctx, &info[i], expr); if (err < 0) goto err2; + + if (info[i].ops->validate) + nft_validate_state_update(net, NFT_VALIDATE_NEED); + info[i].ops = NULL; expr = nft_expr_next(expr); } @@ -2395,8 +2598,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } chain->use++; - return 0; + if (net->nft.validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, table); + + return 0; err2: nf_tables_rule_release(&ctx, rule); err1: @@ -2655,7 +2861,7 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -2781,7 +2987,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return 0; } -static u64 nf_jiffies64_to_msecs(u64 input) +static __be64 nf_jiffies64_to_msecs(u64 input) { u64 ms = jiffies64_to_nsecs(input); @@ -2960,6 +3166,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) return 0; } +/* called with rcu_read_lock held */ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2982,17 +3189,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_sets, .done = nf_tables_dump_sets_done, + .module = THIS_MODULE, }; struct nft_ctx *ctx_dump; - ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL); + ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC); if (ctx_dump == NULL) return -ENOMEM; *ctx_dump = ctx; c.data = ctx_dump; - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } /* Only accept unspec with dump */ @@ -3005,7 +3213,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; @@ -3219,6 +3427,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + set->table = table; + write_pnet(&set->net, net); set->ops = ops; set->ktype = ktype; set->klen = desc.klen; @@ -3746,7 +3956,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, ext = nft_set_elem_ext(set, &elem); err = -ENOMEM; - skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) goto err1; @@ -3768,6 +3978,7 @@ err1: return err == -EAGAIN ? -ENOBUFS : err; } +/* called with rcu_read_lock held */ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3792,10 +4003,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, + .module = THIS_MODULE, }; struct nft_set_dump_ctx *dump_ctx; - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC); if (!dump_ctx) return -ENOMEM; @@ -3803,7 +4015,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, dump_ctx->ctx = ctx; c.data = dump_ctx; - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) @@ -3894,12 +4106,24 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_ctx ctx = { + .net = read_pnet(&set->net), + .family = set->table->family, + }; nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { + struct nft_expr *expr = nft_set_ext_expr(ext); + + if (expr->ops->destroy_clone) { + expr->ops->destroy_clone(&ctx, expr); + module_put(expr->ops->type->owner); + } else { + nf_tables_expr_destroy(&ctx, expr); + } + } if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) (*nft_set_ext_obj(ext))->use--; kfree(elem); @@ -3909,12 +4133,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy); /* Only called from commit path, nft_set_elem_deactivate() already deals with * the refcounting from the preparation phase. */ -static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) +static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext)); kfree(elem); } @@ -4034,6 +4259,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, d2.type, d2.len); if (err < 0) goto err3; + + if (d2.type == NFT_DATA_VERDICT && + (data.verdict.code == NFT_GOTO || + data.verdict.code == NFT_JUMP)) + nft_validate_state_update(ctx->net, + NFT_VALIDATE_NEED); } nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); @@ -4133,7 +4364,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err = 0; + int rem, err; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; @@ -4154,9 +4385,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); if (err < 0) - break; + return err; } - return err; + + if (net->nft.validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, ctx.table); + + return 0; } /** @@ -4426,7 +4661,7 @@ struct nft_object *nft_obj_lookup(const struct nft_table *table, { struct nft_object *obj; - list_for_each_entry(obj, &table->objects, list) { + list_for_each_entry_rcu(obj, &table->objects, list) { if (!nla_strcmp(nla, obj->name) && objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) @@ -4635,7 +4870,7 @@ err3: kfree(obj->name); err2: if (obj->ops->destroy) - obj->ops->destroy(obj); + obj->ops->destroy(&ctx, obj); kfree(obj); err1: module_put(type->owner); @@ -4711,7 +4946,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table[0] && + if (filter && filter->table && strcmp(filter->table, table->name)) goto cont; if (filter && @@ -4756,12 +4991,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) { struct nft_obj_filter *filter; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return ERR_PTR(-ENOMEM); if (nla[NFTA_OBJ_TABLE]) { - filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); if (!filter->table) { kfree(filter); return ERR_PTR(-ENOMEM); @@ -4773,6 +5008,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) return filter; } +/* called with rcu_read_lock held */ static int nf_tables_getobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -4792,6 +5028,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, }; if (nla[NFTA_OBJ_TABLE] || @@ -4804,7 +5041,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, c.data = filter; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_OBJ_NAME] || @@ -4824,7 +5061,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, return PTR_ERR(obj); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -4843,10 +5080,10 @@ err: return err; } -static void nft_obj_destroy(struct nft_object *obj) +static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { if (obj->ops->destroy) - obj->ops->destroy(obj); + obj->ops->destroy(ctx, obj); module_put(obj->ops->type->owner); kfree(obj->name); @@ -4969,7 +5206,7 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, { struct nft_flowtable *flowtable; - list_for_each_entry(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -5389,7 +5626,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table[0] && + if (filter && filter->table && strcmp(filter->table, table->name)) goto cont; @@ -5430,13 +5667,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[]) { struct nft_flowtable_filter *filter; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return ERR_PTR(-ENOMEM); if (nla[NFTA_FLOWTABLE_TABLE]) { filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], - GFP_KERNEL); + GFP_ATOMIC); if (!filter->table) { kfree(filter); return ERR_PTR(-ENOMEM); @@ -5445,6 +5682,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[]) return filter; } +/* called with rcu_read_lock held */ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -5463,6 +5701,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_flowtable, .done = nf_tables_dump_flowtable_done, + .module = THIS_MODULE, }; if (nla[NFTA_FLOWTABLE_TABLE]) { @@ -5474,7 +5713,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, c.data = filter; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) @@ -5490,7 +5729,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (IS_ERR(flowtable)) return PTR_ERR(flowtable); - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -5654,7 +5893,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, struct sk_buff *skb2; int err; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; @@ -5676,7 +5915,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { - .call = nf_tables_gettable, + .call_rcu = nf_tables_gettable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, @@ -5691,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { - .call = nf_tables_getchain, + .call_rcu = nf_tables_getchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -5706,7 +5945,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { - .call = nf_tables_getrule, + .call_rcu = nf_tables_getrule, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, @@ -5721,7 +5960,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { - .call = nf_tables_getset, + .call_rcu = nf_tables_getset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, @@ -5736,7 +5975,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { - .call = nf_tables_getsetelem, + .call_rcu = nf_tables_getsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, @@ -5746,7 +5985,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { - .call = nf_tables_getgen, + .call_rcu = nf_tables_getgen, }, [NFT_MSG_NEWOBJ] = { .call_batch = nf_tables_newobj, @@ -5754,7 +5993,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { - .call = nf_tables_getobj, + .call_rcu = nf_tables_getobj, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, @@ -5764,7 +6003,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj, + .call_rcu = nf_tables_getobj, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, @@ -5774,7 +6013,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { - .call = nf_tables_getflowtable, + .call_rcu = nf_tables_getflowtable, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, @@ -5785,12 +6024,41 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { }, }; +static int nf_tables_validate(struct net *net) +{ + struct nft_table *table; + + switch (net->nft.validate_state) { + case NFT_VALIDATE_SKIP: + break; + case NFT_VALIDATE_NEED: + nft_validate_state_update(net, NFT_VALIDATE_DO); + /* fall through */ + case NFT_VALIDATE_DO: + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_validate(net, table) < 0) + return -EAGAIN; + } + break; + } + + return 0; +} + static void nft_chain_commit_update(struct nft_trans *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)) + if (nft_trans_chain_name(trans)) { + rhltable_remove(&trans->ctx.table->chains_ht, + &trans->ctx.chain->rhlhead, + nft_chain_ht_params); swap(trans->ctx.chain->name, nft_trans_chain_name(trans)); + rhltable_insert_key(&trans->ctx.table->chains_ht, + trans->ctx.chain->name, + &trans->ctx.chain->rhlhead, + nft_chain_ht_params); + } if (!nft_is_base_chain(trans->ctx.chain)) return; @@ -5822,11 +6090,12 @@ static void nft_commit_release(struct nft_trans *trans) nft_set_destroy(nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: - nf_tables_set_elem_destroy(nft_trans_elem_set(trans), + nf_tables_set_elem_destroy(&trans->ctx, + nft_trans_elem_set(trans), nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: - nft_obj_destroy(nft_trans_obj(trans)); + nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); @@ -5850,21 +6119,175 @@ static void nf_tables_commit_release(struct net *net) } } +static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) +{ + struct nft_rule *rule; + unsigned int alloc = 0; + int i; + + /* already handled or inactive chain? */ + if (chain->rules_next || !nft_is_active_next(net, chain)) + return 0; + + rule = list_entry(&chain->rules, struct nft_rule, list); + i = 0; + + list_for_each_entry_continue(rule, &chain->rules, list) { + if (nft_is_active_next(net, rule)) + alloc++; + } + + chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc); + if (!chain->rules_next) + return -ENOMEM; + + list_for_each_entry_continue(rule, &chain->rules, list) { + if (nft_is_active_next(net, rule)) + chain->rules_next[i++] = rule; + } + + chain->rules_next[i] = NULL; + return 0; +} + +static void nf_tables_commit_chain_prepare_cancel(struct net *net) +{ + struct nft_trans *trans, *next; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWRULE || + trans->msg_type == NFT_MSG_DELRULE) { + kvfree(chain->rules_next); + chain->rules_next = NULL; + } + } +} + +static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h) +{ + struct nft_rules_old *o = container_of(h, struct nft_rules_old, h); + + kvfree(o->start); +} + +static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) +{ + struct nft_rule **r = rules; + struct nft_rules_old *old; + + while (*r) + r++; + + r++; /* rcu_head is after end marker */ + old = (void *) r; + old->start = rules; + + call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); +} + +static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain) +{ + struct nft_rule **g0, **g1; + bool next_genbit; + + next_genbit = nft_gencursor_next(net); + + g0 = rcu_dereference_protected(chain->rules_gen_0, + lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + g1 = rcu_dereference_protected(chain->rules_gen_1, + lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + + /* No changes to this chain? */ + if (chain->rules_next == NULL) { + /* chain had no change in last or next generation */ + if (g0 == g1) + return; + /* + * chain had no change in this generation; make sure next + * one uses same rules as current generation. + */ + if (next_genbit) { + rcu_assign_pointer(chain->rules_gen_1, g0); + nf_tables_commit_chain_free_rules_old(g1); + } else { + rcu_assign_pointer(chain->rules_gen_0, g1); + nf_tables_commit_chain_free_rules_old(g0); + } + + return; + } + + if (next_genbit) + rcu_assign_pointer(chain->rules_gen_1, chain->rules_next); + else + rcu_assign_pointer(chain->rules_gen_0, chain->rules_next); + + chain->rules_next = NULL; + + if (g0 == g1) + return; + + if (next_genbit) + nf_tables_commit_chain_free_rules_old(g1); + else + nf_tables_commit_chain_free_rules_old(g0); +} + +static void nft_chain_del(struct nft_chain *chain) +{ + struct nft_table *table = chain->table; + + WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead, + nft_chain_ht_params)); + list_del_rcu(&chain->list); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_chain *chain; + struct nft_table *table; - /* Bump generation counter, invalidate any dump in progress */ - while (++net->nft.base_seq == 0); + /* 0. Validate ruleset, otherwise roll back for error reporting. */ + if (nf_tables_validate(net) < 0) + return -EAGAIN; - /* A new generation has just started */ - net->nft.gencursor = nft_gencursor_next(net); + /* 1. Allocate space for next generation rules_gen_X[] */ + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + int ret; + + if (trans->msg_type == NFT_MSG_NEWRULE || + trans->msg_type == NFT_MSG_DELRULE) { + chain = trans->ctx.chain; + + ret = nf_tables_commit_chain_prepare(net, chain); + if (ret < 0) { + nf_tables_commit_chain_prepare_cancel(net); + return ret; + } + } + } - /* Make sure all packets have left the previous generation before - * purging old rules. + /* step 2. Make rules_gen_X visible to packet path */ + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + nf_tables_commit_chain_active(net, chain); + } + } + + /* + * Bump generation counter, invalidate any dump in progress. + * Cannot fail after this point. */ - synchronize_rcu(); + while (++net->nft.base_seq == 0); + + /* step 3. Start new generation, rules_gen_X now in use. */ + net->nft.gencursor = nft_gencursor_next(net); list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -5895,7 +6318,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELCHAIN: - list_del_rcu(&trans->ctx.chain->list); + nft_chain_del(trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, @@ -6006,7 +6429,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_trans_elem(trans).priv, true); break; case NFT_MSG_NEWOBJ: - nft_obj_destroy(nft_trans_obj(trans)); + nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); @@ -6046,7 +6469,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); } else { trans->ctx.table->use--; - list_del_rcu(&trans->ctx.chain->list); + nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, trans->ctx.chain); @@ -6126,6 +6549,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_cleanup(struct net *net) +{ + nft_validate_state_update(net, NFT_VALIDATE_SKIP); +} + static bool nf_tables_valid_genid(struct net *net, u32 genid) { return net->nft.base_seq == genid; @@ -6138,6 +6566,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, + .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, }; @@ -6221,19 +6650,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, list_for_each_entry(rule, &chain->rules, list) { nft_rule_for_each_expr(expr, last, rule) { - const struct nft_data *data = NULL; + struct nft_immediate_expr *priv; + const struct nft_data *data; int err; - if (!expr->ops->validate) + if (strcmp(expr->ops->type->name, "immediate")) continue; - err = expr->ops->validate(ctx, expr, &data); - if (err < 0) - return err; - - if (data == NULL) + priv = nft_expr_priv(expr); + if (priv->dreg != NFT_REG_VERDICT) continue; + data = &priv->data; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: @@ -6643,7 +7071,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) ctx->chain->use--; nf_tables_rule_release(ctx, rule); } - list_del(&ctx->chain->list); + nft_chain_del(ctx->chain); ctx->table->use--; nf_tables_chain_destroy(ctx); @@ -6695,11 +7123,11 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(obj, ne, &table->objects, list) { list_del(&obj->list); table->use--; - nft_obj_destroy(obj); + nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { ctx.chain = chain; - list_del(&chain->list); + nft_chain_del(chain); table->use--; nf_tables_chain_destroy(&ctx); } @@ -6713,6 +7141,8 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); net->nft.base_seq = 1; + net->nft.validate_state = NFT_VALIDATE_SKIP; + return 0; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 4f46d2f4e167..deff10adef9c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -23,22 +23,6 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> -static const char *const comments[__NFT_TRACETYPE_MAX] = { - [NFT_TRACETYPE_POLICY] = "policy", - [NFT_TRACETYPE_RETURN] = "return", - [NFT_TRACETYPE_RULE] = "rule", -}; - -static const struct nf_loginfo trace_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) @@ -120,20 +104,20 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, if (!base_chain->stats) return; + local_bh_disable(); stats = this_cpu_ptr(rcu_dereference(base_chain->stats)); if (stats) { - local_bh_disable(); u64_stats_update_begin(&stats->syncp); stats->pkts++; stats->bytes += pkt->skb->len; u64_stats_update_end(&stats->syncp); - local_bh_enable(); } + local_bh_enable(); } struct nft_jumpstack { const struct nft_chain *chain; - const struct nft_rule *rule; + struct nft_rule *const *rules; }; unsigned int @@ -141,27 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; const struct net *net = nft_net(pkt); + struct nft_rule *const *rules; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - unsigned int gencursor = nft_genmask_cur(net); + bool genbit = READ_ONCE(net->nft.gencursor); struct nft_traceinfo info; info.trace = false; if (static_branch_unlikely(&nft_trace_enabled)) nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: - rule = list_entry(&chain->rules, struct nft_rule, list); + if (genbit) + rules = rcu_dereference(chain->rules_gen_1); + else + rules = rcu_dereference(chain->rules_gen_0); + next_rule: + rule = *rules; regs.verdict.code = NFT_CONTINUE; - list_for_each_entry_continue_rcu(rule, &chain->rules, list) { - - /* This rule is not active, skip. */ - if (unlikely(rule->genmask & gencursor)) - continue; - + for (; *rules ; rules++) { + rule = *rules; nft_rule_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); @@ -199,7 +185,7 @@ next_rule: case NFT_JUMP: BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); jumpstack[stackptr].chain = chain; - jumpstack[stackptr].rule = rule; + jumpstack[stackptr].rules = rules + 1; stackptr++; /* fall through */ case NFT_GOTO: @@ -221,7 +207,7 @@ next_rule: if (stackptr > 0) { stackptr--; chain = jumpstack[stackptr].chain; - rule = jumpstack[stackptr].rule; + rules = jumpstack[stackptr].rules; goto next_rule; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 03ead8a9e90c..4d0da7042aff 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -25,6 +25,7 @@ #include <linux/uaccess.h> #include <net/sock.h> #include <linux/init.h> +#include <linux/sched/signal.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> @@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) +#define NFNL_MAX_ATTR_COUNT 32 + static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; @@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { + u8 cb_id; + + /* Sanity-check attr_count size to avoid stack buffer overflow. */ + for (cb_id = 0; cb_id < n->cb_count; cb_id++) + if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT)) + return -EINVAL; + nfnl_lock(n->subsys_id); if (table[n->subsys_id].subsys) { nfnl_unlock(n->subsys_id); @@ -185,11 +195,17 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); + /* Sanity-check NFNL_MAX_ATTR_COUNT */ + if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { + rcu_read_unlock(); + return -ENOMEM; + } + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, extack); if (err < 0) { @@ -330,6 +346,13 @@ replay: while (skb->len >= nlmsg_total_size(0)) { int msglen, type; + if (fatal_signal_pending(current)) { + nfnl_err_reset(&err_list); + err = -EINTR; + status = NFNL_BATCH_FAILURE; + goto done; + } + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -379,10 +402,16 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; + /* Sanity-check NFTA_MAX_ATTR */ + if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { + err = -ENOMEM; + goto ack; + } + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, NULL); if (err < 0) @@ -441,10 +470,19 @@ done: kfree_skb(skb); goto replay; } else if (status == NFNL_BATCH_DONE) { - ss->commit(net, oskb); + err = ss->commit(net, oskb); + if (err == -EAGAIN) { + status |= NFNL_BATCH_REPLAY; + goto done; + } else if (err) { + ss->abort(net, oskb); + netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } } else { ss->abort(net, oskb); } + if (ss->cleanup) + ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 6ddf89183e7b..a0e5adf0b3b6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); + nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index fa026b269b36..cb5b5f207777 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -150,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, return -EINVAL; nla_strlcpy(expect_policy->name, - nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); + tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) @@ -235,7 +235,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], goto err1; nla_strlcpy(helper->name, - nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); + tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > FIELD_SIZEOF(struct nf_conn_help, data)) { ret = -ENOMEM; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1d99a1efdafc..8d1ff654e5af 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -611,10 +611,10 @@ nla_put_failure: return -1; } -static int nfnl_compat_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { int ret = 0, target; struct nfgenmsg *nfmsg; @@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl, return -EINVAL; } + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, rev, target, &ret), fmt, name); - if (ret < 0) - return ret; + goto out_put; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) - return -ENOMEM; + if (skb2 == NULL) { + ret = -ENOMEM; + goto out_put; + } /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, @@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl, nfmsg->nfgen_family, name, ret, target) <= 0) { kfree_skb(skb2); - return -ENOSPC; + goto out_put; } ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret > 0) ret = 0; - +out_put: + rcu_read_lock(); + module_put(THIS_MODULE); return ret == -EAGAIN ? -ENOBUFS : ret; } @@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { - [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get, + [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu, .attr_count = NFTA_COMPAT_MAX, .policy = nfnl_compat_policy_get }, }; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c new file mode 100644 index 000000000000..50c068d660e5 --- /dev/null +++ b/net/netfilter/nft_connlimit.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_count.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_zones.h> + +struct nft_connlimit { + spinlock_t lock; + struct hlist_head hhead; + u32 limit; + bool invert; +}; + +static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct nft_set_ext *ext) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + const struct nf_conntrack_tuple *tuple_ptr; + struct nf_conntrack_tuple tuple; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int count; + bool addit; + + tuple_ptr = &tuple; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (ct != NULL) { + tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + zone = nf_ct_zone(ct); + } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), + nft_pf(pkt), nft_net(pkt), &tuple)) { + regs->verdict.code = NF_DROP; + return; + } + + spin_lock_bh(&priv->lock); + count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone, + &addit); + + if (!addit) + goto out; + + if (!nf_conncount_add(&priv->hhead, tuple_ptr)) { + regs->verdict.code = NF_DROP; + spin_unlock_bh(&priv->lock); + return; + } + count++; +out: + spin_unlock_bh(&priv->lock); + + if ((count > priv->limit) ^ priv->invert) { + regs->verdict.code = NFT_BREAK; + return; + } +} + +static int nft_connlimit_do_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_connlimit *priv) +{ + bool invert = false; + u32 flags, limit; + + if (!tb[NFTA_CONNLIMIT_COUNT]) + return -EINVAL; + + limit = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_COUNT])); + + if (tb[NFTA_CONNLIMIT_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_FLAGS])); + if (flags & ~NFT_CONNLIMIT_F_INV) + return -EOPNOTSUPP; + if (flags & NFT_CONNLIMIT_F_INV) + invert = true; + } + + spin_lock_init(&priv->lock); + INIT_HLIST_HEAD(&priv->hhead); + priv->limit = limit; + priv->invert = invert; + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, + struct nft_connlimit *priv) +{ + nf_ct_netns_put(ctx->net, ctx->family); + nf_conncount_cache_free(&priv->hhead); +} + +static int nft_connlimit_do_dump(struct sk_buff *skb, + struct nft_connlimit *priv) +{ + if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit))) + goto nla_put_failure; + if (priv->invert && + nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static inline void nft_connlimit_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_connlimit *priv = nft_obj_data(obj); + + nft_connlimit_do_eval(priv, regs, pkt, NULL); +} + +static int nft_connlimit_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_connlimit *priv = nft_obj_data(obj); + + return nft_connlimit_do_init(ctx, tb, priv); +} + +static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + struct nft_connlimit *priv = nft_obj_data(obj); + + nft_connlimit_do_destroy(ctx, priv); +} + +static int nft_connlimit_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_connlimit *priv = nft_obj_data(obj); + + return nft_connlimit_do_dump(skb, priv); +} + +static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = { + [NFTA_CONNLIMIT_COUNT] = { .type = NLA_U32 }, + [NFTA_CONNLIMIT_FLAGS] = { .type = NLA_U32 }, +}; + +static struct nft_object_type nft_connlimit_obj_type; +static const struct nft_object_ops nft_connlimit_obj_ops = { + .type = &nft_connlimit_obj_type, + .size = sizeof(struct nft_connlimit), + .eval = nft_connlimit_obj_eval, + .init = nft_connlimit_obj_init, + .destroy = nft_connlimit_obj_destroy, + .dump = nft_connlimit_obj_dump, +}; + +static struct nft_object_type nft_connlimit_obj_type __read_mostly = { + .type = NFT_OBJECT_CONNLIMIT, + .ops = &nft_connlimit_obj_ops, + .maxattr = NFTA_CONNLIMIT_MAX, + .policy = nft_connlimit_policy, + .owner = THIS_MODULE, +}; + +static void nft_connlimit_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + + nft_connlimit_do_eval(priv, regs, pkt, NULL); +} + +static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + + return nft_connlimit_do_dump(skb, priv); +} + +static int nft_connlimit_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + + return nft_connlimit_do_init(ctx, tb, priv); +} + +static void nft_connlimit_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + + nft_connlimit_do_destroy(ctx, priv); +} + +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_connlimit *priv_dst = nft_expr_priv(dst); + struct nft_connlimit *priv_src = nft_expr_priv(src); + + spin_lock_init(&priv_dst->lock); + INIT_HLIST_HEAD(&priv_dst->hhead); + priv_dst->limit = priv_src->limit; + priv_dst->invert = priv_src->invert; + + return 0; +} + +static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + + nf_conncount_cache_free(&priv->hhead); +} + +static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) +{ + struct nft_connlimit *priv = nft_expr_priv(expr); + bool addit, ret; + + spin_lock_bh(&priv->lock); + nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit); + + ret = hlist_empty(&priv->hhead); + spin_unlock_bh(&priv->lock); + + return ret; +} + +static struct nft_expr_type nft_connlimit_type; +static const struct nft_expr_ops nft_connlimit_ops = { + .type = &nft_connlimit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_connlimit)), + .eval = nft_connlimit_eval, + .init = nft_connlimit_init, + .destroy = nft_connlimit_destroy, + .clone = nft_connlimit_clone, + .destroy_clone = nft_connlimit_destroy_clone, + .dump = nft_connlimit_dump, + .gc = nft_connlimit_gc, +}; + +static struct nft_expr_type nft_connlimit_type __read_mostly = { + .name = "connlimit", + .ops = &nft_connlimit_ops, + .policy = nft_connlimit_policy, + .maxattr = NFTA_CONNLIMIT_MAX, + .flags = NFT_EXPR_STATEFUL | NFT_EXPR_GC, + .owner = THIS_MODULE, +}; + +static int __init nft_connlimit_module_init(void) +{ + int err; + + err = nft_register_obj(&nft_connlimit_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_connlimit_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_connlimit_obj_type); + return err; +} + +static void __exit nft_connlimit_module_exit(void) +{ + nft_unregister_expr(&nft_connlimit_type); + nft_unregister_obj(&nft_connlimit_obj_type); +} + +module_init(nft_connlimit_module_init); +module_exit(nft_connlimit_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso"); +MODULE_ALIAS_NFT_EXPR("connlimit"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index eefe3b409925..a61d7edfc290 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -96,7 +96,8 @@ static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv) free_percpu(priv->counter); } -static void nft_counter_obj_destroy(struct nft_object *obj) +static void nft_counter_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); @@ -257,6 +258,7 @@ static const struct nft_expr_ops nft_counter_ops = { .eval = nft_counter_eval, .init = nft_counter_init, .destroy = nft_counter_destroy, + .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index ea737fd789e8..1435ffc5f57e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -826,7 +826,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, return 0; } -static void nft_ct_helper_obj_destroy(struct nft_object *obj) +static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); @@ -880,22 +881,26 @@ static int nft_ct_helper_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); - const struct nf_conntrack_helper *helper = priv->helper4; + const struct nf_conntrack_helper *helper; u16 family; + if (priv->helper4 && priv->helper6) { + family = NFPROTO_INET; + helper = priv->helper4; + } else if (priv->helper6) { + family = NFPROTO_IPV6; + helper = priv->helper6; + } else { + family = NFPROTO_IPV4; + helper = priv->helper4; + } + if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) return -1; if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) return -1; - if (priv->helper4 && priv->helper6) - family = NFPROTO_INET; - else if (priv->helper6) - family = NFPROTO_IPV6; - else - family = NFPROTO_IPV4; - if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) return -1; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index b07a3fd9eeea..4d49529cff61 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -195,6 +195,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, err = -EOPNOTSUPP; if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) goto err1; + + if (priv->expr->ops->type->flags & NFT_EXPR_GC) { + if (set->flags & NFT_SET_TIMEOUT) + goto err1; + if (!set->ops->gc_init) + goto err1; + set->ops->gc_init(set); + } + } else if (set->flags & NFT_SET_EVAL) return -EINVAL; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index ce13a50b9189..8abb9891cdf2 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -12,8 +12,12 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/ip.h> +#include <linux/ipv6.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_dup_netdev.h> +#include <net/neighbour.h> +#include <net/ip.h> struct nft_fwd_netdev { enum nft_registers sreg_dev:8; @@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, + [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, @@ -62,7 +68,133 @@ nla_put_failure: return -1; } +struct nft_fwd_neigh { + enum nft_registers sreg_dev:8; + enum nft_registers sreg_addr:8; + u8 nfproto; +}; + +static void nft_fwd_neigh_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + void *addr = ®s->data[priv->sreg_addr]; + int oif = regs->data[priv->sreg_dev]; + unsigned int verdict = NF_STOLEN; + struct sk_buff *skb = pkt->skb; + struct net_device *dev; + int neigh_table; + + switch (priv->nfproto) { + case NFPROTO_IPV4: { + struct iphdr *iph; + + if (skb->protocol != htons(ETH_P_IP)) { + verdict = NFT_BREAK; + goto out; + } + if (skb_try_make_writable(skb, sizeof(*iph))) { + verdict = NF_DROP; + goto out; + } + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + neigh_table = NEIGH_ARP_TABLE; + break; + } + case NFPROTO_IPV6: { + struct ipv6hdr *ip6h; + + if (skb->protocol != htons(ETH_P_IPV6)) { + verdict = NFT_BREAK; + goto out; + } + if (skb_try_make_writable(skb, sizeof(*ip6h))) { + verdict = NF_DROP; + goto out; + } + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + neigh_table = NEIGH_ND_TABLE; + break; + } + default: + verdict = NFT_BREAK; + goto out; + } + + dev = dev_get_by_index_rcu(nft_net(pkt), oif); + if (dev == NULL) + return; + + skb->dev = dev; + neigh_xmit(neigh_table, dev, addr, skb); +out: + regs->verdict.code = verdict; +} + +static int nft_fwd_neigh_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + unsigned int addr_len; + int err; + + if (!tb[NFTA_FWD_SREG_DEV] || + !tb[NFTA_FWD_SREG_ADDR] || + !tb[NFTA_FWD_NFPROTO]) + return -EINVAL; + + priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); + priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); + priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); + + switch (priv->nfproto) { + case NFPROTO_IPV4: + addr_len = sizeof(struct in_addr); + break; + case NFPROTO_IPV6: + addr_len = sizeof(struct in6_addr); + break; + default: + return -EOPNOTSUPP; + } + + err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + if (err < 0) + return err; + + return nft_validate_register_load(priv->sreg_addr, addr_len); +} + +static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; + +static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) || + nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) || + nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_fwd_netdev_type; +static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { + .type = &nft_fwd_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)), + .eval = nft_fwd_neigh_eval, + .init = nft_fwd_neigh_init, + .dump = nft_fwd_neigh_dump, +}; + static const struct nft_expr_ops nft_fwd_netdev_ops = { .type = &nft_fwd_netdev_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), @@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, }; +static const struct nft_expr_ops * +nft_fwd_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_FWD_SREG_ADDR]) + return &nft_fwd_neigh_netdev_ops; + if (tb[NFTA_FWD_SREG_DEV]) + return &nft_fwd_netdev_ops; + + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { .family = NFPROTO_NETDEV, .name = "fwd", - .ops = &nft_fwd_netdev_ops, + .select_ops = nft_fwd_select_ops, .policy = nft_fwd_netdev_policy, .maxattr = NFTA_FWD_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index f0fc21f88775..c2d237144f74 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -177,10 +177,7 @@ static int nft_jhash_map_init(const struct nft_ctx *ctx, priv->map = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_HASH_SET_NAME], tb[NFTA_HASH_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -220,10 +217,7 @@ static int nft_symhash_map_init(const struct nft_ctx *ctx, priv->map = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_HASH_SET_NAME], tb[NFTA_HASH_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_jhash_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index aa87ff8beae8..15adf8ca82c3 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -17,12 +17,6 @@ #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> -struct nft_immediate_expr { - struct nft_data data; - enum nft_registers dreg:8; - u8 dlen; -}; - static void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -101,12 +95,27 @@ nla_put_failure: static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_data **d) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data; + int err; - if (priv->dreg == NFT_REG_VERDICT) - *data = &priv->data; + if (priv->dreg != NFT_REG_VERDICT) + return 0; + + data = &priv->data; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + break; + default: + break; + } return 0; } diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index a9fc298ef4c3..72f13a1144dd 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -51,10 +51,13 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) return !limit->invert; } +/* Use same default as in iptables. */ +#define NFT_LIMIT_PKT_BURST_DEFAULT 5 + static int nft_limit_init(struct nft_limit *limit, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], bool pkts) { - u64 unit; + u64 unit, tokens; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) @@ -68,18 +71,25 @@ static int nft_limit_init(struct nft_limit *limit, if (tb[NFTA_LIMIT_BURST]) limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); - else - limit->burst = 0; + + if (pkts && limit->burst == 0) + limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT; if (limit->rate + limit->burst < limit->rate) return -EOVERFLOW; - /* The token bucket size limits the number of tokens can be - * accumulated. tokens_max specifies the bucket size. - * tokens_max = unit * (rate + burst) / rate. - */ - limit->tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), - limit->rate); + if (pkts) { + tokens = div_u64(limit->nsecs, limit->rate) * limit->burst; + } else { + /* The token bucket size limits the number of tokens can be + * accumulated. tokens_max specifies the bucket size. + * tokens_max = unit * (rate + burst) / rate. + */ + tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), + limit->rate); + } + + limit->tokens = tokens; limit->tokens_max = limit->tokens; if (tb[NFTA_LIMIT_FLAGS]) { @@ -144,7 +154,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, struct nft_limit_pkts *priv = nft_expr_priv(expr); int err; - err = nft_limit_init(&priv->limit, tb); + err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; @@ -185,7 +195,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, { struct nft_limit *priv = nft_expr_priv(expr); - return nft_limit_init(priv, tb); + return nft_limit_init(priv, tb, false); } static int nft_limit_bytes_dump(struct sk_buff *skb, @@ -246,7 +256,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, struct nft_limit_pkts *priv = nft_obj_data(obj); int err; - err = nft_limit_init(&priv->limit, tb); + err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; @@ -289,7 +299,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, { struct nft_limit *priv = nft_obj_data(obj); - return nft_limit_init(priv, tb); + return nft_limit_init(priv, tb, false); } static int nft_limit_obj_bytes_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index a27be36dc0af..7eef1cffbf1b 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -9,12 +9,15 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include <linux/audit.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <net/ipv6.h> +#include <net/ip.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netdevice.h> @@ -26,12 +29,93 @@ struct nft_log { char *prefix; }; +static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); + if (!ih) + return false; + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", + &ih->saddr, &ih->daddr, ih->protocol); + + return true; +} + +static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + __be16 frag_off; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) + return false; + + nexthdr = ih->nexthdr; + ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + return true; +} + +static void nft_log_eval_audit(const struct nft_pktinfo *pkt) +{ + struct sk_buff *skb = pkt->skb; + struct audit_buffer *ab; + int fam = -1; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (!ab) + return; + + audit_log_format(ab, "mark=%#x", skb->mark); + + switch (nft_pf(pkt)) { + case NFPROTO_BRIDGE: + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; + break; + case htons(ETH_P_IPV6): + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; + break; + } + break; + case NFPROTO_IPV4: + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; + break; + case NFPROTO_IPV6: + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; + break; + } + + if (fam == -1) + audit_log_format(ab, " saddr=? daddr=? proto=-1"); + + audit_log_end(ab); +} + static void nft_log_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); + if (priv->loginfo.type == NF_LOG_TYPE_LOG && + priv->loginfo.u.log.level == LOGLEVEL_AUDIT) { + nft_log_eval_audit(pkt); + return; + } + nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb, nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s", priv->prefix); @@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx, } else { li->u.log.level = LOGLEVEL_WARNING; } - if (li->u.log.level > LOGLEVEL_DEBUG) { + if (li->u.log.level > LOGLEVEL_AUDIT) { err = -EINVAL; goto err1; } @@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } + if (li->u.log.level == LOGLEVEL_AUDIT) + return 0; + err = nf_logger_find_get(ctx->family, li->type); if (err < 0) goto err1; @@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); + if (li->u.log.level == LOGLEVEL_AUDIT) + return; + nf_logger_put(ctx->family, li->type); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index f52da5e2199f..42e6fadf1417 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -149,6 +149,52 @@ nla_put_failure: return -1; } +static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_data *data; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + return nft_chain_validate(ctx, data->verdict.chain); + default: + return 0; + } +} + +static int nft_lookup_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **d) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + struct nft_set_iter iter; + + if (!(priv->set->flags & NFT_SET_MAP) || + priv->set->dtype != NFT_DATA_VERDICT) + return 0; + + iter.genmask = nft_genmask_next(ctx->net); + iter.skip = 0; + iter.count = 0; + iter.err = 0; + iter.fn = nft_lookup_validate_setelem; + + priv->set->ops->walk(ctx, priv->set, &iter); + if (iter.err < 0) + return iter.err; + + return 0; +} + static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), @@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = { .init = nft_lookup_init, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, + .validate = nft_lookup_validate, }; struct nft_expr_type nft_lookup_type __read_mostly = { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 5348bd058c88..1105a23bda5e 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -259,7 +259,7 @@ static void nft_meta_set_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; u32 *sreg = ®s->data[meta->sreg]; u32 value = *sreg; - u8 pkt_type; + u8 value8; switch (meta->key) { case NFT_META_MARK: @@ -269,15 +269,17 @@ static void nft_meta_set_eval(const struct nft_expr *expr, skb->priority = value; break; case NFT_META_PKTTYPE: - pkt_type = nft_reg_load8(sreg); + value8 = nft_reg_load8(sreg); - if (skb->pkt_type != pkt_type && - skb_pkt_type_ok(pkt_type) && + if (skb->pkt_type != value8 && + skb_pkt_type_ok(value8) && skb_pkt_type_ok(skb->pkt_type)) - skb->pkt_type = pkt_type; + skb->pkt_type = value8; break; case NFT_META_NFTRACE: - skb->nf_trace = !!value; + value8 = nft_reg_load8(sreg); + + skb->nf_trace = !!value8; break; default: WARN_ON(1); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index cdbc62a53933..1f4d0854cf70 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -114,10 +114,7 @@ static int nft_ng_inc_map_init(const struct nft_ctx *ctx, tb[NFTA_NG_SET_NAME], tb[NFTA_NG_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index dbf1f4ad077c..6f9a1365a09f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -311,8 +311,16 @@ static void nft_rhash_gc(struct work_struct *work) continue; } + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { + struct nft_expr *expr = nft_set_ext_expr(&he->ext); + + if (expr->ops->gc && + expr->ops->gc(read_pnet(&set->net), expr)) + goto gc; + } if (!nft_set_elem_expired(&he->ext)) continue; +gc: if (nft_set_elem_mark_busy(&he->ext)) continue; @@ -339,6 +347,14 @@ static unsigned int nft_rhash_privsize(const struct nlattr * const nla[], return sizeof(struct nft_rhash); } +static void nft_rhash_gc_init(const struct nft_set *set) +{ + struct nft_rhash *priv = nft_set_priv(set); + + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); +} + static int nft_rhash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) @@ -356,8 +372,8 @@ static int nft_rhash_init(const struct nft_set *set, INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + nft_rhash_gc_init(set); + return 0; } @@ -647,6 +663,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = { .elemsize = offsetof(struct nft_rhash_elem, ext), .estimate = nft_rhash_estimate, .init = nft_rhash_init, + .gc_init = nft_rhash_gc_init, .destroy = nft_rhash_destroy, .insert = nft_rhash_insert, .activate = nft_rhash_activate, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c new file mode 100644 index 000000000000..f28a0b944087 --- /dev/null +++ b/net/netfilter/nft_socket.c @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/module.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_socket.h> +#include <net/inet_sock.h> +#include <net/tcp.h> + +struct nft_socket { + enum nft_socket_keys key:8; + union { + enum nft_registers dreg:8; + }; +}; + +static void nft_socket_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_socket *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + struct sock *sk = skb->sk; + u32 *dest = ®s->data[priv->dreg]; + + if (!sk) + switch(nft_pf(pkt)) { + case NFPROTO_IPV4: + sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); + break; +#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) + case NFPROTO_IPV6: + sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + return; + } + + if(!sk) { + nft_reg_store8(dest, 0); + return; + } + + /* So that subsequent socket matching not to require other lookups. */ + skb->sk = sk; + + switch(priv->key) { + case NFT_SOCKET_TRANSPARENT: + nft_reg_store8(dest, inet_sk_transparent(sk)); + break; + default: + WARN_ON(1); + regs->verdict.code = NFT_BREAK; + } +} + +static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { + [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, +}; + +static int nft_socket_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_socket *priv = nft_expr_priv(expr); + unsigned int len; + + if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) + return -EINVAL; + + switch(ctx->family) { + case NFPROTO_IPV4: +#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) + case NFPROTO_IPV6: +#endif + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); + switch(priv->key) { + case NFT_SOCKET_TRANSPARENT: + len = sizeof(u8); + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static int nft_socket_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_socket *priv = nft_expr_priv(expr); + + if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key))) + return -1; + if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) + return -1; + return 0; +} + +static struct nft_expr_type nft_socket_type; +static const struct nft_expr_ops nft_socket_ops = { + .type = &nft_socket_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)), + .eval = nft_socket_eval, + .init = nft_socket_init, + .dump = nft_socket_dump, +}; + +static struct nft_expr_type nft_socket_type __read_mostly = { + .name = "socket", + .ops = &nft_socket_ops, + .policy = nft_socket_policy, + .maxattr = NFTA_SOCKET_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_socket_module_init(void) +{ + return nft_register_expr(&nft_socket_type); +} + +static void __exit nft_socket_module_exit(void) +{ + nft_unregister_expr(&nft_socket_type); +} + +module_init(nft_socket_module_init); +module_exit(nft_socket_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Máté Eckl"); +MODULE_DESCRIPTION("nf_tables socket match module"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 8c89323c06af..58fce4e749a9 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -33,264 +33,9 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif +#include <net/netfilter/nf_tproxy.h> #include <linux/netfilter/xt_TPROXY.h> -enum nf_tproxy_lookup_t { - NFT_LOOKUP_LISTENER, - NFT_LOOKUP_ESTABLISHED, -}; - -static bool tproxy_sk_is_transparent(struct sock *sk) -{ - switch (sk->sk_state) { - case TCP_TIME_WAIT: - if (inet_twsk(sk)->tw_transparent) - return true; - break; - case TCP_NEW_SYN_RECV: - if (inet_rsk(inet_reqsk(sk))->no_srccheck) - return true; - break; - default: - if (inet_sk(sk)->transparent) - return true; - } - - sock_gen_put(sk); - return false; -} - -static inline __be32 -tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) -{ - struct in_device *indev; - __be32 laddr; - - if (user_laddr) - return user_laddr; - - laddr = 0; - indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { - laddr = ifa->ifa_local; - break; - } endfor_ifa(indev); - - return laddr ? laddr : daddr; -} - -/* - * This is used when the user wants to intercept a connection matching - * an explicit iptables rule. In this case the sockets are assumed - * matching in preference order: - * - * - match: if there's a fully established connection matching the - * _packet_ tuple, it is returned, assuming the redirection - * already took place and we process a packet belonging to an - * established connection - * - * - match: if there's a listening socket matching the redirection - * (e.g. on-port & on-ip of the connection), it is returned, - * regardless if it was bound to 0.0.0.0 or an explicit - * address. The reasoning is that if there's an explicit rule, it - * does not really matter if the listener is bound to an interface - * or to 0. The user already stated that he wants redirection - * (since he added the rule). - * - * Please note that there's an overlap between what a TPROXY target - * and a socket match will match. Normally if you have both rules the - * "socket" match will be the first one, effectively all packets - * belonging to established connections going through that one. - */ -static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, - const u8 protocol, - const __be32 saddr, const __be32 daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, - const enum nf_tproxy_lookup_t lookup_type) -{ - struct sock *sk; - struct tcphdr *tcph; - - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_LISTENER: - tcph = hp; - sk = inet_lookup_listener(net, &tcp_hashinfo, skb, - ip_hdrlen(skb) + - __tcp_hdrlen(tcph), - saddr, sport, - daddr, dport, - in->ifindex, 0); - - if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) - sk = NULL; - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too - */ - break; - case NFT_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - default: - BUG(); - } - break; - case IPPROTO_UDP: - sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk) { - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too - */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); - - return sk; -} - -#ifdef XT_TPROXY_HAVE_IPV6 -static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, - const u8 protocol, - const struct in6_addr *saddr, const struct in6_addr *daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, - const enum nf_tproxy_lookup_t lookup_type) -{ - struct sock *sk; - struct tcphdr *tcph; - - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_LISTENER: - tcph = hp; - sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, - thoff + __tcp_hdrlen(tcph), - saddr, sport, - daddr, ntohs(dport), - in->ifindex, 0); - - if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) - sk = NULL; - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too - */ - break; - case NFT_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, ntohs(dport), - in->ifindex, 0); - break; - default: - BUG(); - } - break; - case IPPROTO_UDP: - sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk) { - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too - */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", - protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); - - return sk; -} -#endif - -/** - * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections - * @skb: The skb being processed. - * @laddr: IPv4 address to redirect to or zero. - * @lport: TCP port to redirect to or zero. - * @sk: The TIME_WAIT TCP socket found by the lookup. - * - * We have to handle SYN packets arriving to TIME_WAIT sockets - * differently: instead of reopening the connection we should rather - * redirect the new connection to the proxy if there's a listener - * socket present. - * - * tproxy_handle_time_wait4() consumes the socket reference passed in. - * - * Returns the listener socket if there's one, the TIME_WAIT socket if - * no such listener is found, or NULL if the TCP header is incomplete. - */ -static struct sock * -tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, - __be32 laddr, __be16 lport, struct sock *sk) -{ - const struct iphdr *iph = ip_hdr(skb); - struct tcphdr _hdr, *hp; - - hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); - if (hp == NULL) { - inet_twsk_put(inet_twsk(sk)); - return NULL; - } - - if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { - /* SYN to a TIME_WAIT socket, we'd rather redirect it - * to a listener socket if there's one */ - struct sock *sk2; - - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, - iph->saddr, laddr ? laddr : iph->daddr, - hp->source, lport ? lport : hp->dest, - skb->dev, NFT_LOOKUP_LISTENER); - if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); - sk = sk2; - } - } - - return sk; -} - /* assign a socket to the skb -- consumes sk */ static void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) @@ -319,26 +64,26 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, - skb->dev, NFT_LOOKUP_ESTABLISHED); + skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); - laddr = tproxy_laddr4(skb, laddr, iph->daddr); + laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr); if (!lport) lport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ - sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk); + sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk); else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, iph->saddr, laddr, hp->source, lport, - skb->dev, NFT_LOOKUP_LISTENER); + skb->dev, NF_TPROXY_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ - if (sk && tproxy_sk_is_transparent(sk)) { + if (sk && nf_tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~mark_mask) ^ mark_value; @@ -377,87 +122,6 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) #ifdef XT_TPROXY_HAVE_IPV6 -static inline const struct in6_addr * -tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, - const struct in6_addr *daddr) -{ - struct inet6_dev *indev; - struct inet6_ifaddr *ifa; - struct in6_addr *laddr; - - if (!ipv6_addr_any(user_laddr)) - return user_laddr; - laddr = NULL; - - indev = __in6_dev_get(skb->dev); - if (indev) { - read_lock_bh(&indev->lock); - list_for_each_entry(ifa, &indev->addr_list, if_list) { - if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) - continue; - - laddr = &ifa->addr; - break; - } - read_unlock_bh(&indev->lock); - } - - return laddr ? laddr : daddr; -} - -/** - * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections - * @skb: The skb being processed. - * @tproto: Transport protocol. - * @thoff: Transport protocol header offset. - * @par: Iptables target parameters. - * @sk: The TIME_WAIT TCP socket found by the lookup. - * - * We have to handle SYN packets arriving to TIME_WAIT sockets - * differently: instead of reopening the connection we should rather - * redirect the new connection to the proxy if there's a listener - * socket present. - * - * tproxy_handle_time_wait6() consumes the socket reference passed in. - * - * Returns the listener socket if there's one, the TIME_WAIT socket if - * no such listener is found, or NULL if the TCP header is incomplete. - */ -static struct sock * -tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, - const struct xt_action_param *par, - struct sock *sk) -{ - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct tcphdr _hdr, *hp; - const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; - - hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); - if (hp == NULL) { - inet_twsk_put(inet_twsk(sk)); - return NULL; - } - - if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { - /* SYN to a TIME_WAIT socket, we'd rather redirect it - * to a listener socket if there's one */ - struct sock *sk2; - - sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, - &iph->saddr, - tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), - hp->source, - tgi->lport ? tgi->lport : hp->dest, - skb->dev, NFT_LOOKUP_LISTENER); - if (sk2) { - inet_twsk_deschedule_put(inet_twsk(sk)); - sk = sk2; - } - } - - return sk; -} - static unsigned int tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { @@ -489,25 +153,31 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, - xt_in(par), NFT_LOOKUP_ESTABLISHED); + xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); - laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); + laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); lport = tgi->lport ? tgi->lport : hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ - if (sk && sk->sk_state == TCP_TIME_WAIT) + if (sk && sk->sk_state == TCP_TIME_WAIT) { + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; /* reopening a TIME_WAIT connection needs special handling */ - sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); + sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff, + xt_net(par), + &tgi->laddr.in6, + tgi->lport, + sk); + } else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, &iph->saddr, laddr, hp->source, lport, - xt_in(par), NFT_LOOKUP_LISTENER); + xt_in(par), NF_TPROXY_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ - if (sk && tproxy_sk_is_transparent(sk)) { + if (sk && nf_tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2ac7f674d19b..5c0779c4fa3c 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -73,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, * if XT_SOCKET_TRANSPARENT is used */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = nf_sk_is_transparent(sk); + transparent = inet_sk_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) @@ -130,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) * if XT_SOCKET_TRANSPARENT is used */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = nf_sk_is_transparent(sk); + transparent = inet_sk_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f018eafc2a0d..376181cc1def 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -206,7 +206,6 @@ int nfc_genl_targets_found(struct nfc_dev *dev) return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -237,7 +236,6 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -269,7 +267,6 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -299,7 +296,6 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -340,7 +336,6 @@ int nfc_genl_device_added(struct nfc_dev *dev) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -370,7 +365,6 @@ int nfc_genl_device_removed(struct nfc_dev *dev) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -434,8 +428,6 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: - genlmsg_cancel(msg, hdr); - free_msg: nlmsg_free(msg); @@ -470,7 +462,6 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -501,7 +492,6 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -546,7 +536,6 @@ int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); @@ -585,7 +574,6 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -703,7 +691,6 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -735,7 +722,6 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev) return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -1030,7 +1016,6 @@ static int nfc_genl_send_params(struct sk_buff *msg, return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); return -EMSGSIZE; } @@ -1290,7 +1275,6 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, return 0; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); return -EMSGSIZE; @@ -1507,7 +1491,6 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) return; nla_put_failure: - genlmsg_cancel(msg, hdr); free_msg: nlmsg_free(msg); kfree(ctx); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b00aa959727d..082f981795f5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4250,7 +4250,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; if (po->tp_version >= TPACKET_V3 && req->tp_block_size <= - BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv)) + BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr)) goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 1a31502ee7db..bffde4b46c5d 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -8,7 +8,7 @@ config RDS config RDS_RDMA tristate "RDS over Infiniband" - depends on RDS && INFINIBAND_ADDR_TRANS + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS ---help--- Allow RDS to use Infiniband as a transport. This transport supports RDMA operations. diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 19975d2ca9a2..b2393113a251 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -477,6 +477,7 @@ enum rxrpc_call_flag { RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ + RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ }; /* @@ -624,6 +625,7 @@ struct rxrpc_call { */ rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ bool tx_phase; /* T if transmission phase, F if receive phase */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6e0d788b4dc4..20210418904b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -392,7 +392,13 @@ recheck_state: /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { - rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); + if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && + (int)call->conn->hi_serial - (int)call->rx_serial > 0) { + trace_rxrpc_call_reset(call); + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ECONNRESET); + } else { + rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME); + } set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b5fd6381313d..608d078a4981 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1278,8 +1278,14 @@ void rxrpc_data_ready(struct sock *udp_sk) call = NULL; } - if (call && sp->hdr.serviceId != call->service_id) - call->service_id = sp->hdr.serviceId; + if (call) { + if (sp->hdr.serviceId != call->service_id) + call->service_id = sp->hdr.serviceId; + if ((int)sp->hdr.serial - (int)call->rx_serial > 0) + call->rx_serial = sp->hdr.serial; + if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) + set_bit(RXRPC_CALL_RX_HEARD, &call->flags); + } } else { skew = 0; call = NULL; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 76303c45db19..29fb4d68a144 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -437,6 +437,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) return idr_find(&tn->idr, block_index); } +/* Find tcf block. + * Set q, parent, cl when appropriate. + */ + +static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, + u32 *parent, unsigned long *cl, + int ifindex, u32 block_index, + struct netlink_ext_ack *extack) +{ + struct tcf_block *block; + + if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, block_index); + if (!block) { + NL_SET_ERR_MSG(extack, "Block of given index was not found"); + return ERR_PTR(-EINVAL); + } + } else { + const struct Qdisc_class_ops *cops; + struct net_device *dev; + + /* Find link */ + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return ERR_PTR(-ENODEV); + + /* Find qdisc */ + if (!*parent) { + *q = dev->qdisc; + *parent = (*q)->handle; + } else { + *q = qdisc_lookup(dev, TC_H_MAJ(*parent)); + if (!*q) { + NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + return ERR_PTR(-EINVAL); + } + } + + /* Is it classful? */ + cops = (*q)->ops->cl_ops; + if (!cops) { + NL_SET_ERR_MSG(extack, "Qdisc not classful"); + return ERR_PTR(-EINVAL); + } + + if (!cops->tcf_block) { + NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); + return ERR_PTR(-EOPNOTSUPP); + } + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(*parent)) { + *cl = cops->find(*q, *parent); + if (*cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); + return ERR_PTR(-ENOENT); + } + } + + /* And the last stroke */ + block = cops->tcf_block(*q, *cl, extack); + if (!block) + return ERR_PTR(-EINVAL); + if (tcf_block_shared(block)) { + NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); + return ERR_PTR(-EOPNOTSUPP); + } + } + + return block; +} + static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) { return list_first_entry(&block->chain_list, struct tcf_chain, list); @@ -735,10 +807,6 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, int ok_count = 0; int err; - /* Make sure all netdevs sharing this block are offload-capable. */ - if (block->nooffloaddevcnt && err_stop) - return -EOPNOTSUPP; - list_for_each_entry(block_cb, &block->cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { @@ -984,9 +1052,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, q, parent, 0, event, false); } -/* Add/change/delete/get a filter node */ - -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, +static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); @@ -1007,8 +1073,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; int tp_created; - if ((n->nlmsg_type != RTM_GETTFILTER) && - !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; replay: @@ -1026,24 +1091,13 @@ replay: cl = 0; if (prio == 0) { - switch (n->nlmsg_type) { - case RTM_DELTFILTER: - if (protocol || t->tcm_handle || tca[TCA_KIND]) { - NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); - return -ENOENT; - } - break; - case RTM_NEWTFILTER: - /* If no priority is provided by the user, - * we allocate one. - */ - if (n->nlmsg_flags & NLM_F_CREATE) { - prio = TC_H_MAKE(0x80000000U, 0U); - prio_allocate = true; - break; - } - /* fall-through */ - default: + /* If no priority is provided by the user, + * we allocate one. + */ + if (n->nlmsg_flags & NLM_F_CREATE) { + prio = TC_H_MAKE(0x80000000U, 0U); + prio_allocate = true; + } else { NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } @@ -1051,66 +1105,11 @@ replay: /* Find head of filter chain. */ - if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { - block = tcf_block_lookup(net, t->tcm_block_index); - if (!block) { - NL_SET_ERR_MSG(extack, "Block of given index was not found"); - err = -EINVAL; - goto errout; - } - } else { - const struct Qdisc_class_ops *cops; - struct net_device *dev; - - /* Find link */ - dev = __dev_get_by_index(net, t->tcm_ifindex); - if (!dev) - return -ENODEV; - - /* Find qdisc */ - if (!parent) { - q = dev->qdisc; - parent = q->handle; - } else { - q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); - if (!q) { - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); - return -EINVAL; - } - } - - /* Is it classful? */ - cops = q->ops->cl_ops; - if (!cops) { - NL_SET_ERR_MSG(extack, "Qdisc not classful"); - return -EINVAL; - } - - if (!cops->tcf_block) { - NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); - return -EOPNOTSUPP; - } - - /* Do we search for filter, attached to class? */ - if (TC_H_MIN(parent)) { - cl = cops->find(q, parent); - if (cl == 0) { - NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); - return -ENOENT; - } - } - - /* And the last stroke */ - block = cops->tcf_block(q, cl, extack); - if (!block) { - err = -EINVAL; - goto errout; - } - if (tcf_block_shared(block)) { - NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); - err = -EOPNOTSUPP; - goto errout; - } + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) { + err = PTR_ERR(block); + goto errout; } chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; @@ -1119,19 +1118,10 @@ replay: err = -EINVAL; goto errout; } - chain = tcf_chain_get(block, chain_index, - n->nlmsg_type == RTM_NEWTFILTER); + chain = tcf_chain_get(block, chain_index, true); if (!chain) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); - err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL; - goto errout; - } - - if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); - tcf_chain_flush(chain); - err = 0; + err = -ENOMEM; goto errout; } @@ -1152,8 +1142,7 @@ replay: goto errout; } - if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) { + if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; @@ -1178,56 +1167,15 @@ replay: fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { - if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false); - tcf_proto_destroy(tp, extack); - err = 0; - goto errout; - } - - if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) { + if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } - } else { - bool last; - - switch (n->nlmsg_type) { - case RTM_NEWTFILTER: - if (n->nlmsg_flags & NLM_F_EXCL) { - if (tp_created) - tcf_proto_destroy(tp, NULL); - NL_SET_ERR_MSG(extack, "Filter already exists"); - err = -EEXIST; - goto errout; - } - break; - case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, block, - q, parent, fh, false, &last, - extack); - if (err) - goto errout; - if (last) { - tcf_chain_tp_remove(chain, &chain_info, tp); - tcf_proto_destroy(tp, extack); - } - goto errout; - case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true); - if (err < 0) - NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); - goto errout; - default: - NL_SET_ERR_MSG(extack, "Invalid netlink message type"); - err = -EINVAL; - goto errout; - } + } else if (n->nlmsg_flags & NLM_F_EXCL) { + NL_SET_ERR_MSG(extack, "Filter already exists"); + err = -EEXIST; + goto errout; } err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, @@ -1252,6 +1200,202 @@ errout: return err; } +static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 protocol; + u32 prio; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain_info chain_info; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + struct tcf_proto *tp = NULL; + unsigned long cl = 0; + void *fh = NULL; + int err; + + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + protocol = TC_H_MIN(t->tcm_info); + prio = TC_H_MAJ(t->tcm_info); + parent = t->tcm_parent; + + if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) { + NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); + return -ENOENT; + } + + /* Find head of filter chain. */ + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) { + err = PTR_ERR(block); + goto errout; + } + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + err = -EINVAL; + goto errout; + } + chain = tcf_chain_get(block, chain_index, false); + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + err = -EINVAL; + goto errout; + } + + if (prio == 0) { + tfilter_notify_chain(net, skb, block, q, parent, n, + chain, RTM_DELTFILTER); + tcf_chain_flush(chain); + err = 0; + goto errout; + } + + tp = tcf_chain_tp_find(chain, &chain_info, protocol, + prio, false); + if (!tp || IS_ERR(tp)) { + NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); + err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout; + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); + err = -EINVAL; + goto errout; + } + + fh = tp->ops->get(tp, t->tcm_handle); + + if (!fh) { + if (t->tcm_handle == 0) { + tcf_chain_tp_remove(chain, &chain_info, tp); + tfilter_notify(net, skb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, false); + tcf_proto_destroy(tp, extack); + err = 0; + } else { + NL_SET_ERR_MSG(extack, "Specified filter handle not found"); + err = -ENOENT; + } + } else { + bool last; + + err = tfilter_del_notify(net, skb, n, tp, block, + q, parent, fh, false, &last, + extack); + if (err) + goto errout; + if (last) { + tcf_chain_tp_remove(chain, &chain_info, tp); + tcf_proto_destroy(tp, extack); + } + } + +errout: + if (chain) + tcf_chain_put(chain); + return err; +} + +static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 protocol; + u32 prio; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain_info chain_info; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + struct tcf_proto *tp = NULL; + unsigned long cl = 0; + void *fh = NULL; + int err; + + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + protocol = TC_H_MIN(t->tcm_info); + prio = TC_H_MAJ(t->tcm_info); + parent = t->tcm_parent; + + if (prio == 0) { + NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); + return -ENOENT; + } + + /* Find head of filter chain. */ + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) { + err = PTR_ERR(block); + goto errout; + } + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + err = -EINVAL; + goto errout; + } + chain = tcf_chain_get(block, chain_index, false); + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + err = -EINVAL; + goto errout; + } + + tp = tcf_chain_tp_find(chain, &chain_info, protocol, + prio, false); + if (!tp || IS_ERR(tp)) { + NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); + err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout; + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); + err = -EINVAL; + goto errout; + } + + fh = tp->ops->get(tp, t->tcm_handle); + + if (!fh) { + NL_SET_ERR_MSG(extack, "Specified filter handle not found"); + err = -ENOENT; + } else { + err = tfilter_notify(net, skb, n, tp, block, q, parent, + fh, RTM_NEWTFILTER, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); + } + +errout: + if (chain) + tcf_chain_put(chain); + return err; +} + struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; @@ -1581,21 +1725,31 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop) { - int ok_count; + int ok_count = 0; int ret; - ret = tcf_block_cb_call(block, type, type_data, err_stop); - if (ret < 0) - return ret; - ok_count = ret; + if (!block->nooffloaddevcnt) { + ret = tcf_block_cb_call(block, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count = ret; + } if (!exts || ok_count) - return ok_count; + goto skip_egress; + ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); if (ret < 0) return ret; ok_count += ret; +skip_egress: + /* if one of the netdevs sharing this block are not offload-capable + * make sure we succeeded in egress instead. + */ + if (block->nooffloaddevcnt && !ok_count && err_stop) + return -EOPNOTSUPP; + return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); @@ -1634,9 +1788,9 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, tc_dump_tfilter, 0); return 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 4e74508515f4..2b5be42a9f1c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,6 +326,8 @@ static void fl_destroy_sleepable(struct work_struct *work) struct cls_fl_head *head = container_of(to_rcu_work(work), struct cls_fl_head, rwork); + + rhashtable_destroy(&head->ht); kfree(head); module_put(THIS_MODULE); } @@ -875,7 +877,7 @@ static int fl_check_assign_mask(struct cls_fl_head *head, return PTR_ERR(newmask); fnew->mask = newmask; - } else if (fold && fold->mask == fnew->mask) { + } else if (fold && fold->mask != fnew->mask) { return -EINVAL; } @@ -1028,7 +1030,7 @@ errout_mask: fl_mask_put(head, fnew->mask, false); errout_idr: - if (fnew->handle) + if (!fold) idr_remove(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 760ab1b09f8b..69078c82963e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, return false; } - if (ret && netif_xmit_frozen_or_stopped(txq)) - return false; - return true; } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f062a18e9162..d6b8ae4ed7a3 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -16,6 +16,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> +#include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> @@ -23,12 +24,44 @@ struct mq_sched { struct Qdisc **qdiscs; }; +static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_mq_qopt_offload opt = { + .command = cmd, + .handle = sch->handle, + }; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); +} + +static void mq_offload_stats(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_mq_qopt_offload opt = { + .command = TC_MQ_STATS, + .handle = sch->handle, + .stats = { + .bstats = &sch->bstats, + .qstats = &sch->qstats, + }, + }; + + if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); +} + static void mq_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; + mq_offload(sch, TC_MQ_DESTROY); + if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) @@ -70,6 +103,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt, } sch->flags |= TCQ_F_MQROOT; + + mq_offload(sch, TC_MQ_CREATE); return 0; } @@ -127,6 +162,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -135,6 +171,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } + mq_offload_stats(sch); return 0; } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4a95e260b674..445b7ef61677 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -637,7 +637,7 @@ unsigned long sctp_transport_timeout(struct sctp_transport *trans) trans->state != SCTP_PF) timeout += trans->hbinterval; - return timeout; + return max_t(unsigned long, timeout, HZ / 5); } /* Reset transport variables to their initial values */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2c369d4bb1c1..973b4471b532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1420,7 +1420,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, return rc; if (optlen < sizeof(int)) - return rc; + return -EINVAL; get_user(val, (int __user *)optval); lock_sock(sk); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 6358e5271070..ac09ca803296 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -50,7 +50,7 @@ config SUNRPC_DEBUG config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" - depends on SUNRPC && INFINIBAND_ADDR_TRANS + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND select SG_POOL help diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 40b54cc64243..5f48251c1319 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1658,7 +1658,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } -out: return &xdst0->u.dst; put_states: @@ -1667,8 +1666,8 @@ put_states: free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); - xdst0 = ERR_PTR(err); - goto out; + + return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, |