aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/bpf_sk_storage.c4
-rw-r--r--net/core/dev.c248
-rw-r--r--net/core/devlink.c212
-rw-r--r--net/core/drop_monitor.c11
-rw-r--r--net/core/dst.c8
-rw-r--r--net/core/filter.c470
-rw-r--r--net/core/flow_dissector.c155
-rw-r--r--net/core/flow_offload.c305
-rw-r--r--net/core/gen_stats.c2
-rw-r--r--net/core/link_watch.c12
-rw-r--r--net/core/neighbour.c40
-rw-r--r--net/core/net-sysfs.c47
-rw-r--r--net/core/net_namespace.c5
-rw-r--r--net/core/netclassid_cgroup.c4
-rw-r--r--net/core/netpoll.c38
-rw-r--r--net/core/netprio_cgroup.c2
-rw-r--r--net/core/pktgen.c2
-rw-r--r--net/core/rtnetlink.c11
-rw-r--r--net/core/scm.c141
-rw-r--r--net/core/secure_seq.c1
-rw-r--r--net/core/skbuff.c12
-rw-r--r--net/core/skmsg.c98
-rw-r--r--net/core/sock.c222
-rw-r--r--net/core/sock_map.c18
-rw-r--r--net/core/sysctl_net_core.c33
-rw-r--r--net/core/xdp.c59
26 files changed, 1343 insertions, 817 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 756b63b6f7b3..d2c4d16dadba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
if (attr->value_size > MAX_VALUE_SIZE)
@@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
* the map_alloc_check() side also does.
*/
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
nla_for_each_nested(nla, nla_stgs, rem) {
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c9e763bfe0e..061496a1f640 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -79,6 +79,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -194,7 +195,7 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static seqcount_t devnet_rename_seq;
+static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
@@ -398,6 +399,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
+
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+#else
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+#endif
+
/*******************************************************************************
*
* Protocol management and registration routines
@@ -930,33 +999,28 @@ EXPORT_SYMBOL(dev_get_by_napi_id);
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
- *
- * The use of raw_seqcount_begin() and cond_resched() before
- * retrying is required as we want to give the writers a chance
- * to complete when CONFIG_PREEMPTION is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
- unsigned int seq;
+ int ret;
-retry:
- seq = raw_seqcount_begin(&devnet_rename_seq);
+ down_read(&devnet_rename_sem);
rcu_read_lock();
+
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
+ ret = -ENODEV;
+ goto out;
}
strcpy(name, dev->name);
- rcu_read_unlock();
- if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
- goto retry;
- }
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ up_read(&devnet_rename_sem);
+ return ret;
}
/**
@@ -1228,10 +1292,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return 0;
}
@@ -1239,7 +1303,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return err;
}
@@ -1254,11 +1318,11 @@ rollback:
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return ret;
}
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
netdev_adjacent_rename_links(dev, oldname);
@@ -1279,7 +1343,7 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
@@ -4140,7 +4204,8 @@ EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
-unsigned int __read_mostly netdev_budget_usecs = 2000;
+/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
@@ -4548,6 +4613,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp->data_meta = xdp->data;
xdp->data_end = xdp->data + hlen;
xdp->data_hard_start = skb->data - skb_headroom(skb);
+
+ /* SKB "head" area always have tailroom for skb_shared_info */
+ xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
+ xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
@@ -4571,14 +4641,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
skb_reset_network_header(skb);
}
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
- * pckt.
- */
- off = orig_data_end - xdp->data_end;
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
- skb->len -= off;
-
+ skb->len += off; /* positive on grow, negative on shrink */
}
/* check if XDP changed eth hdr such SKB needs update */
@@ -4987,11 +5054,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
@@ -5022,8 +5090,10 @@ another_round:
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
preempt_enable();
- if (ret2 != XDP_PASS)
- return NET_RX_DROP;
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
+ }
skb_reset_mac_len(skb);
}
@@ -5173,6 +5243,13 @@ drop:
}
out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * Apparently *ppt_prev assignment above holds this invariant due to
+ * skb dereferencing near it.
+ */
+ *pskb = skb;
return ret;
}
@@ -5182,7 +5259,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *pt_prev = NULL;
int ret;
- ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
@@ -5260,7 +5337,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_prev = NULL;
skb_list_del_init(skb);
- __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
@@ -5339,6 +5416,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
struct bpf_prog *new = xdp->prog;
int ret = 0;
+ if (new) {
+ u32 i;
+
+ /* generic XDP does not work with DEVMAPs that can
+ * have a bpf_prog installed on an entry
+ */
+ for (i = 0; i < new->aux->used_map_cnt; i++) {
+ if (dev_map_can_have_prog(new->aux->used_maps[i]))
+ return -EINVAL;
+ }
+ }
+
switch (xdp->command) {
case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);
@@ -6226,7 +6315,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags, val, new;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -6238,20 +6328,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
+ if (work_done) {
+ if (n->gro_bitmask)
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ if (timeout)
+ ret = false;
+ }
if (n->gro_bitmask) {
- unsigned long timeout = 0;
-
- if (work_done)
- timeout = n->dev->gro_flush_timeout;
-
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
- if (timeout)
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
- HRTIMER_MODE_REL_PINNED);
}
gro_normal_list(n);
@@ -6283,7 +6376,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
}
- return true;
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -6463,7 +6559,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
+ if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
@@ -7785,6 +7881,28 @@ void netdev_bonding_info_change(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_bonding_info_change);
+/**
+ * netdev_get_xmit_slave - Get the xmit slave of master device
+ * @skb: The packet
+ * @all_slaves: assume all the slaves are active
+ *
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ * %NULL is returned if no slave is found.
+ */
+
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
+ struct sk_buff *skb,
+ bool all_slaves)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_xmit_slave)
+ return NULL;
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
+}
+EXPORT_SYMBOL(netdev_get_xmit_slave);
+
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
@@ -8666,8 +8784,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
const struct net_device_ops *ops = dev->netdev_ops;
enum bpf_netdev_command query;
u32 prog_id, expected_id = 0;
- struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
+ struct bpf_prog *prog;
bool offload;
int err;
@@ -8725,6 +8843,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return -EINVAL;
}
+ if (prog->expected_attach_type == BPF_XDP_DEVMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
/* prog->aux->id may be 0 for orphaned device-bound progs */
if (prog->aux->id && prog->aux->id == prog_id) {
bpf_prog_put(prog);
@@ -8733,6 +8857,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
} else {
if (!prog_id)
return 0;
+ prog = NULL;
}
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -8905,11 +9030,13 @@ static void netdev_sync_lower_features(struct net_device *upper,
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
lower->wanted_features &= ~feature;
- netdev_update_features(lower);
+ __netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
+ else
+ netdev_features_change(lower);
}
}
}
@@ -9134,6 +9261,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
else
netif_dormant_off(dev);
+ if (rootdev->operstate == IF_OPER_TESTING)
+ netif_testing_on(dev);
+ else
+ netif_testing_off(dev);
+
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev);
else
@@ -9194,7 +9326,7 @@ static void netdev_init_one_queue(struct net_device *dev,
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
- lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
@@ -9241,22 +9373,6 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
-static void netdev_register_lockdep_key(struct net_device *dev)
-{
- lockdep_register_key(&dev->qdisc_tx_busylock_key);
- lockdep_register_key(&dev->qdisc_running_key);
- lockdep_register_key(&dev->qdisc_xmit_lock_key);
- lockdep_register_key(&dev->addr_list_lock_key);
-}
-
-static void netdev_unregister_lockdep_key(struct net_device *dev)
-{
- lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
- lockdep_unregister_key(&dev->qdisc_running_key);
- lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
- lockdep_unregister_key(&dev->addr_list_lock_key);
-}
-
void netdev_update_lockdep_key(struct net_device *dev)
{
lockdep_unregister_key(&dev->addr_list_lock_key);
@@ -9823,7 +9939,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
- netdev_register_lockdep_key(dev);
+ lockdep_register_key(&dev->addr_list_lock_key);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
@@ -9912,7 +10028,7 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
- netdev_unregister_lockdep_key(dev);
+ lockdep_unregister_key(&dev->addr_list_lock_key);
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 80f97722f31f..2cafbc808b09 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3716,24 +3716,26 @@ nla_put_failure:
return err;
}
-static void devlink_nl_region_notify(struct devlink_region *region,
- struct devlink_snapshot *snapshot,
- enum devlink_command cmd)
+static struct sk_buff *
+devlink_nl_region_notify_build(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd, u32 portid, u32 seq)
{
struct devlink *devlink = region->devlink;
struct sk_buff *msg;
void *hdr;
int err;
- WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- return;
+ return ERR_PTR(-ENOMEM);
- hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
- if (!hdr)
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
goto out_free_msg;
+ }
err = devlink_nl_put_handle(msg, devlink);
if (err)
@@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region,
}
genlmsg_end(msg, hdr);
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-
- return;
+ return msg;
out_cancel_msg:
genlmsg_cancel(msg, hdr);
out_free_msg:
nlmsg_free(msg);
+ return ERR_PTR(err);
+}
+
+static void devlink_nl_region_notify(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+
+ WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+ msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0);
+ if (IS_ERR(msg))
+ return;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
/**
@@ -4069,6 +4086,8 @@ static int
devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct nlattr *snapshot_id_attr;
struct devlink_region *region;
const char *region_name;
u32 snapshot_id;
@@ -4080,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
- NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided");
- return -EINVAL;
- }
-
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
region = devlink_region_get_by_name(devlink, region_name);
if (!region) {
@@ -4102,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
return -ENOSPC;
}
- snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+ snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+ if (snapshot_id_attr) {
+ snapshot_id = nla_get_u32(snapshot_id_attr);
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
- return -EEXIST;
- }
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
+ return -EEXIST;
+ }
- err = __devlink_snapshot_id_insert(devlink, snapshot_id);
- if (err)
- return err;
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ return err;
+ } else {
+ err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id");
+ return err;
+ }
+ }
err = region->ops->snapshot(devlink, info->extack, &data);
if (err)
@@ -4121,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_snapshot_create;
+ if (!snapshot_id_attr) {
+ struct sk_buff *msg;
+
+ snapshot = devlink_region_snapshot_get_by_id(region,
+ snapshot_id);
+ if (WARN_ON(!snapshot))
+ return -EINVAL;
+
+ msg = devlink_nl_region_notify_build(region, snapshot,
+ DEVLINK_CMD_REGION_NEW,
+ info->snd_portid,
+ info->snd_seq);
+ err = PTR_ERR_OR_ZERO(msg);
+ if (err)
+ goto err_notify;
+
+ err = genlmsg_reply(msg, info);
+ if (err)
+ goto err_notify;
+ }
+
return 0;
err_snapshot_create:
@@ -4128,6 +4172,10 @@ err_snapshot_create:
err_snapshot_capture:
__devlink_snapshot_id_decrement(devlink, snapshot_id);
return err;
+
+err_notify:
+ devlink_region_snapshot_del(region, snapshot);
+ return err;
}
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
@@ -4167,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
struct nlattr **attrs,
u64 start_offset,
u64 end_offset,
- bool dump,
u64 *new_offset)
{
struct devlink_snapshot *snapshot;
@@ -4182,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
if (!snapshot)
return -EINVAL;
- if (end_offset > region->size || dump)
- end_offset = region->size;
-
while (curr_offset < end_offset) {
u32 data_size;
u8 *data;
@@ -4212,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- u64 ret_offset, start_offset, end_offset = 0;
+ u64 ret_offset, start_offset, end_offset = U64_MAX;
struct nlattr **attrs = info->attrs;
struct devlink_region *region;
struct nlattr *chunks_attr;
const char *region_name;
struct devlink *devlink;
- bool dump = true;
void *hdr;
int err;
@@ -4246,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto out_unlock;
}
+ if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+ attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+ if (!start_offset)
+ start_offset =
+ nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+ end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+ end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+ }
+
+ if (end_offset > region->size)
+ end_offset = region->size;
+
/* return 0 if there is no further data to read */
- if (start_offset >= region->size) {
+ if (start_offset == end_offset) {
err = 0;
goto out_unlock;
}
@@ -4274,22 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto nla_put_failure;
}
- if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
- attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
- if (!start_offset)
- start_offset =
- nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
-
- end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
- end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
- dump = false;
- }
-
err = devlink_nl_region_read_snapshot_fill(skb, devlink,
region, attrs,
start_offset,
- end_offset, dump,
- &ret_offset);
+ end_offset, &ret_offset);
if (err && err != -EMSGSIZE)
goto nla_put_failure;
@@ -5363,6 +5407,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
{
enum devlink_health_reporter_state prev_health_state;
struct devlink *devlink = reporter->devlink;
+ unsigned long recover_ts_threshold;
/* write a log message of the current error */
WARN_ON(!msg);
@@ -5373,10 +5418,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
/* abort if the previous error wasn't recovered */
+ recover_ts_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->graceful_period);
if (reporter->auto_recover &&
(prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
- jiffies - reporter->last_recovery_ts <
- msecs_to_jiffies(reporter->graceful_period))) {
+ (reporter->last_recovery_ts && reporter->recovery_count &&
+ time_is_after_jiffies(recover_ts_threshold)))) {
trace_devlink_health_recover_aborted(devlink,
reporter->ops->name,
reporter->health_state,
@@ -5822,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info,
val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
switch (val) {
case DEVLINK_TRAP_ACTION_DROP: /* fall-through */
- case DEVLINK_TRAP_ACTION_TRAP:
+ case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */
+ case DEVLINK_TRAP_ACTION_MIRROR:
*p_trap_action = val;
break;
default:
@@ -8447,6 +8495,50 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(STP, CONTROL),
+ DEVLINK_TRAP(LACP, CONTROL),
+ DEVLINK_TRAP(LLDP, CONTROL),
+ DEVLINK_TRAP(IGMP_QUERY, CONTROL),
+ DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL),
+ DEVLINK_TRAP(MLD_QUERY, CONTROL),
+ DEVLINK_TRAP(MLD_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V1_DONE, CONTROL),
+ DEVLINK_TRAP(IPV4_DHCP, CONTROL),
+ DEVLINK_TRAP(IPV6_DHCP, CONTROL),
+ DEVLINK_TRAP(ARP_REQUEST, CONTROL),
+ DEVLINK_TRAP(ARP_RESPONSE, CONTROL),
+ DEVLINK_TRAP(ARP_OVERLAY, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV4_BFD, CONTROL),
+ DEVLINK_TRAP(IPV6_BFD, CONTROL),
+ DEVLINK_TRAP(IPV4_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV6_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV4_BGP, CONTROL),
+ DEVLINK_TRAP(IPV6_BGP, CONTROL),
+ DEVLINK_TRAP(IPV4_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV6_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV4_PIM, CONTROL),
+ DEVLINK_TRAP(IPV6_PIM, CONTROL),
+ DEVLINK_TRAP(UC_LB, CONTROL),
+ DEVLINK_TRAP(LOCAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV6_REDIRECT, CONTROL),
+ DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(PTP_EVENT, CONTROL),
+ DEVLINK_TRAP(PTP_GENERAL, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -8458,9 +8550,28 @@ static const struct devlink_trap devlink_trap_generic[] = {
static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(L2_DROPS),
DEVLINK_TRAP_GROUP(L3_DROPS),
+ DEVLINK_TRAP_GROUP(L3_EXCEPTIONS),
DEVLINK_TRAP_GROUP(BUFFER_DROPS),
DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
DEVLINK_TRAP_GROUP(ACL_DROPS),
+ DEVLINK_TRAP_GROUP(STP),
+ DEVLINK_TRAP_GROUP(LACP),
+ DEVLINK_TRAP_GROUP(LLDP),
+ DEVLINK_TRAP_GROUP(MC_SNOOPING),
+ DEVLINK_TRAP_GROUP(DHCP),
+ DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY),
+ DEVLINK_TRAP_GROUP(BFD),
+ DEVLINK_TRAP_GROUP(OSPF),
+ DEVLINK_TRAP_GROUP(BGP),
+ DEVLINK_TRAP_GROUP(VRRP),
+ DEVLINK_TRAP_GROUP(PIM),
+ DEVLINK_TRAP_GROUP(UC_LB),
+ DEVLINK_TRAP_GROUP(LOCAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(IPV6),
+ DEVLINK_TRAP_GROUP(PTP_EVENT),
+ DEVLINK_TRAP_GROUP(PTP_GENERAL),
+ DEVLINK_TRAP_GROUP(ACL_SAMPLE),
+ DEVLINK_TRAP_GROUP(ACL_TRAP),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -8798,6 +8909,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
devlink_trap_stats_update(trap_item->stats, skb->len);
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
+ /* Control packets were not dropped by the device or encountered an
+ * exception during forwarding and therefore should not be reported to
+ * the kernel's drop monitor.
+ */
+ if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
in_devlink_port, fa_cookie);
net_dm_hw_report(skb, &hw_metadata);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 8e33cec9fc4e..2ee7bc4c9e03 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -213,6 +213,7 @@ static void sched_send_work(struct timer_list *t)
static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
@@ -231,11 +232,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
+ point = msg->points;
for (i = 0; i < msg->entries; i++) {
- if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
- msg->points[i].count++;
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
goto out;
}
+ point++;
}
if (msg->entries == dm_hit_limit)
goto out;
@@ -244,8 +247,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
*/
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
- memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
- msg->points[msg->entries].count = 1;
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
msg->entries++;
if (!timer_pending(&data->send_timer)) {
diff --git a/net/core/dst.c b/net/core/dst.c
index 193af526e908..d6b6ced0d451 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
{
struct dst_entry *dst;
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc &&
+ !(flags & DST_NOCOUNT) &&
+ dst_entries_get_fast(ops) > ops->gc_thresh) {
if (ops->gc(ops)) {
- printk_ratelimited(KERN_NOTICE "Route cache is full: "
- "consider increasing sysctl "
- "net.ipv[4|6].route.max_size.\n");
+ pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
return NULL;
}
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 7628b947dbc3..209482a4eaa2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
offset);
}
-BPF_CALL_0(bpf_get_raw_cpu_id)
-{
- return raw_smp_processor_id();
-}
-
-static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = bpf_get_raw_cpu_id,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
-};
-
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -2026,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
+{
+ /* The interface is to be used in combination with bpf_skb_adjust_room()
+ * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
+ * is passed as flags, for example.
+ */
+ switch (level) {
+ case BPF_CSUM_LEVEL_INC:
+ __skb_incr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_DEC:
+ __skb_decr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_RESET:
+ __skb_reset_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_QUERY:
+ return skb->ip_summed == CHECKSUM_UNNECESSARY ?
+ skb->csum_level : -EACCES;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_csum_level_proto = {
+ .func = bpf_csum_level,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
@@ -2590,8 +2613,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
}
pop = 0;
} else if (pop >= sge->length - a) {
- sge->length = a;
pop -= (sge->length - a);
+ sge->length = a;
}
}
@@ -3124,7 +3147,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
{
int ret;
- if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO)
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
return -EINVAL;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
@@ -3174,7 +3198,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32 off;
int ret;
- if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
@@ -3202,6 +3227,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
bpf_skb_net_grow(skb, off, len_diff_abs, flags);
+ if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
+ __skb_reset_checksum_unnecessary(skb);
bpf_compute_data_pointers(skb);
return ret;
@@ -3422,15 +3449,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
+ void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
- /* only shrinking is allowed for now. */
- if (unlikely(offset >= 0))
+ /* Notice that xdp_data_hard_end have reserved some tailroom */
+ if (unlikely(data_end > data_hard_end))
return -EINVAL;
+ /* ALL drivers MUST init xdp->frame_sz, chicken check below */
+ if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
+ WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
+ return -EINVAL;
+ }
+
if (unlikely(data_end < xdp->data + ETH_HLEN))
return -EINVAL;
+ /* Clear memory area on grow, can contain uninit kernel memory */
+ if (offset > 0)
+ memset(xdp->data_end, 0, offset);
+
xdp->data_end = data_end;
return 0;
@@ -4014,16 +4052,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
};
#ifdef CONFIG_SOCK_CGROUP_DATA
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
+{
+ struct cgroup *cgrp;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ return cgroup_id(cgrp);
+}
+
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
struct sock *sk = skb_to_full_sk(skb);
- struct cgroup *cgrp;
if (!sk || !sk_fullsock(sk))
return 0;
- cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return cgroup_id(cgrp);
+ return __bpf_sk_cgroup_id(sk);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4033,16 +4077,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
- ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+ int ancestor_level)
{
- struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor;
struct cgroup *cgrp;
- if (!sk || !sk_fullsock(sk))
- return 0;
-
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
if (!ancestor)
@@ -4051,6 +4091,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
return cgroup_id(ancestor);
}
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.func = bpf_skb_ancestor_cgroup_id,
.gpl_only = false,
@@ -4058,6 +4109,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+ return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+ .func = bpf_sk_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_SOCKET,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+ .func = bpf_sk_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_SOCKET,
+ .arg2_type = ARG_ANYTHING,
+};
#endif
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -4205,38 +4281,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
- void *, data, u64, size)
-{
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
-
- return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
-}
+#define SOCKOPT_CC_REINIT (1 << 0)
-static const struct bpf_func_proto bpf_event_output_data_proto = {
- .func = bpf_event_output_data,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE_OR_ZERO,
-};
-
-BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen, u32 flags)
{
- struct sock *sk = bpf_sock->sk;
+ char devname[IFNAMSIZ];
+ struct net *net;
+ int ifindex;
int ret = 0;
int val;
if (!sk_fullsock(sk))
return -EINVAL;
+ sock_owned_by_me(sk);
+
if (level == SOL_SOCKET) {
- if (optlen != sizeof(int))
+ if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
return -EINVAL;
val = *((int *)optval);
@@ -4277,6 +4339,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk_dst_reset(sk);
}
break;
+ case SO_BINDTODEVICE:
+ ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ optlen = min_t(long, optlen, IFNAMSIZ - 1);
+ strncpy(devname, optval, optlen);
+ devname[optlen] = 0;
+
+ ifindex = 0;
+ if (devname[0] != '\0') {
+ struct net_device *dev;
+
+ ret = -ENODEV;
+
+ net = sock_net(sk);
+ dev = dev_get_by_name(net, devname);
+ if (!dev)
+ break;
+ ifindex = dev->ifindex;
+ dev_put(dev);
+ }
+ ret = sock_bindtoindex(sk, ifindex, false);
+#endif
+ break;
default:
ret = -EINVAL;
}
@@ -4329,7 +4414,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
- bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
+ bool reinit = flags & SOCKOPT_CC_REINIT;
strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1));
@@ -4376,24 +4461,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
return ret;
}
-static const struct bpf_func_proto bpf_setsockopt_proto = {
- .func = bpf_setsockopt,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
-};
-
-BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
{
- struct sock *sk = bpf_sock->sk;
-
if (!sk_fullsock(sk))
goto err_clear;
+
+ sock_owned_by_me(sk);
+
#ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
struct inet_connection_sock *icsk;
@@ -4459,8 +4534,71 @@ err_clear:
return -EINVAL;
}
-static const struct bpf_func_proto bpf_getsockopt_proto = {
- .func = bpf_getsockopt,
+BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ u32 flags = 0;
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen,
+ flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .func = bpf_sock_addr_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
+ .func = bpf_sock_addr_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ u32 flags = 0;
+ if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+ flags |= SOCKOPT_CC_REINIT;
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen,
+ flags);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .func = bpf_sock_ops_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
+ .func = bpf_sock_ops_getsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -4500,30 +4638,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
{
#ifdef CONFIG_INET
struct sock *sk = ctx->sk;
+ u32 flags = BIND_FROM_BPF;
int err;
- /* Binding to port can be expensive so it's prohibited in the helper.
- * Only binding to IP is supported.
- */
err = -EINVAL;
if (addr_len < offsetofend(struct sockaddr, sa_family))
return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
- if (((struct sockaddr_in *)addr)->sin_port != htons(0))
- return err;
- return __inet_bind(sk, addr, addr_len, true, false);
+ if (((struct sockaddr_in *)addr)->sin_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
+ return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
return err;
- if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
- return err;
+ if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
/* ipv6_bpf_stub cannot be NULL, since it's called from
* bpf_cgroup_inet6_connect hook and ipv6 is already loaded
*/
- return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
@@ -4914,7 +5050,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
int err;
struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
- if (!seg6_validate_srh(srh, len))
+ if (!seg6_validate_srh(srh, len, false))
return -EINVAL;
switch (type) {
@@ -5925,7 +6061,7 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
return -EOPNOTSUPP;
if (unlikely(dev_net(skb->dev) != sock_net(sk)))
return -ENETUNREACH;
- if (unlikely(sk->sk_reuseport))
+ if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
return -ESOCKTNOSUPPORT;
if (sk_is_refcounted(sk) &&
unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
@@ -5983,52 +6119,7 @@ bool bpf_helper_changes_pkt_data(void *func)
return false;
}
-const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
-{
- switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_raw_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- default:
- break;
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- switch (func_id) {
- case BPF_FUNC_spin_lock:
- return &bpf_spin_lock_proto;
- case BPF_FUNC_spin_unlock:
- return &bpf_spin_unlock_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- default:
- return NULL;
- }
-}
+const struct bpf_func_proto bpf_event_output_data_proto __weak;
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
@@ -6119,6 +6210,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_sock_addr_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_sock_addr_getsockopt_proto;
+ default:
+ return NULL;
+ }
default:
return bpf_base_func_proto(func_id);
}
@@ -6163,8 +6270,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+ case BPF_FUNC_sk_cgroup_id:
+ return &bpf_sk_cgroup_id_proto;
+ case BPF_FUNC_sk_ancestor_cgroup_id:
+ return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock:
@@ -6193,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_csum_diff_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -6213,6 +6336,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_adjust_room_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -6335,9 +6460,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_setsockopt:
- return &bpf_setsockopt_proto;
+ return &bpf_sock_ops_setsockopt_proto;
case BPF_FUNC_getsockopt:
- return &bpf_getsockopt_proto;
+ return &bpf_sock_ops_getsockopt_proto;
case BPF_FUNC_sock_ops_cb_flags_set:
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
@@ -6384,6 +6509,26 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_push_data_proto;
case BPF_FUNC_msg_pop_data:
return &bpf_msg_pop_data_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -6504,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -6628,7 +6775,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
}
@@ -6640,7 +6787,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
default:
@@ -6770,6 +6917,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case offsetof(struct bpf_sock, protocol):
case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
+ case offsetof(struct bpf_sock, rx_queue_mapping):
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
case bpf_ctx_range(struct bpf_sock, dst_ip4):
@@ -6935,6 +7083,13 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
+ switch (off) {
+ case offsetof(struct xdp_md, egress_ifindex):
+ return false;
+ }
+ }
+
if (type == BPF_WRITE) {
if (bpf_prog_is_dev_bound(prog->aux)) {
switch (off) {
@@ -6990,6 +7145,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
@@ -7001,6 +7158,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;
@@ -7033,6 +7192,7 @@ static bool sock_addr_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
msg_src_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -7063,10 +7223,6 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
}
break;
- case bpf_ctx_range(struct bpf_sock_addr, user_port):
- if (size != size_default)
- return false;
- break;
case offsetof(struct bpf_sock_addr, sk):
if (type != BPF_READ)
return false;
@@ -7197,6 +7353,11 @@ static bool sk_msg_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case offsetof(struct sk_msg_md, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
case bpf_ctx_range(struct sk_msg_md, family):
case bpf_ctx_range(struct sk_msg_md, remote_ip4):
case bpf_ctx_range(struct sk_msg_md, local_ip4):
@@ -7812,6 +7973,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
skc_state),
target_size));
break;
+ case offsetof(struct bpf_sock, rx_queue_mapping):
+#ifdef CONFIG_XPS
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_rx_queue_mapping,
+ sizeof_field(struct sock,
+ sk_rx_queue_mapping),
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
+ 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+#else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+ *target_size = 2;
+#endif
+ break;
}
return insn - insn_buf;
@@ -7882,6 +8060,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(struct xdp_rxq_info,
queue_index));
break;
+ case offsetof(struct xdp_md, egress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, txq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_txq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
}
return insn - insn_buf;
@@ -7962,8 +8150,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
+ int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct bpf_sock_addr, user_family):
@@ -7998,9 +8186,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sockaddr_in6, sin6_port));
BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
sizeof_field(struct sockaddr_in6, sin6_port));
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
- struct sockaddr_in6, uaddr,
- sin6_port, tmp_reg);
+ /* Account for sin6_port being smaller than user_port. */
+ port_size = min(port_size, BPF_LDST_BYTES(si));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
break;
case offsetof(struct bpf_sock_addr, family):
@@ -8531,6 +8721,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_msg_sg, size));
break;
+
+ case offsetof(struct sk_msg_md, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ break;
}
return insn - insn_buf;
@@ -8786,6 +8982,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
if (!reuse) {
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
/* reuseport_array has only sk with non NULL sk_reuseport_cb.
* The only (!reuse) case here is - the sk has already been
* unhashed (e.g. by close()), so treat it as -ENOENT.
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3eff84824c8b..d02df0b6d0d9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -31,8 +31,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
-
-static DEFINE_MUTEX(flow_dissector_mutex);
+#include <linux/bpf-netns.h>
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
@@ -70,54 +69,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
-int skb_flow_dissector_prog_query(const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
- u32 prog_id, prog_cnt = 0, flags = 0;
- struct bpf_prog *attached;
- struct net *net;
-
- if (attr->query.query_flags)
- return -EINVAL;
-
- net = get_net_ns_by_fd(attr->query.target_fd);
- if (IS_ERR(net))
- return PTR_ERR(net);
-
- rcu_read_lock();
- attached = rcu_dereference(net->flow_dissector_prog);
- if (attached) {
- prog_cnt = 1;
- prog_id = attached->aux->id;
- }
- rcu_read_unlock();
-
- put_net(net);
-
- if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
- return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
- return -EFAULT;
-
- if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
- return 0;
-
- if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
- return -EFAULT;
-
- return 0;
-}
-
-int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
- struct bpf_prog *prog)
+#ifdef CONFIG_BPF_SYSCALL
+int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog)
{
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog *attached;
- struct net *net;
- int ret = 0;
-
- net = current->nsproxy->net_ns;
- mutex_lock(&flow_dissector_mutex);
if (net == &init_net) {
/* BPF flow dissector in the root namespace overrides
@@ -130,54 +86,29 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
for_each_net(ns) {
if (ns == &init_net)
continue;
- if (rcu_access_pointer(ns->flow_dissector_prog)) {
- ret = -EEXIST;
- goto out;
- }
+ if (rcu_access_pointer(ns->bpf.progs[type]))
+ return -EEXIST;
}
} else {
/* Make sure root flow dissector is not attached
* when attaching to the non-root namespace.
*/
- if (rcu_access_pointer(init_net.flow_dissector_prog)) {
- ret = -EEXIST;
- goto out;
- }
+ if (rcu_access_pointer(init_net.bpf.progs[type]))
+ return -EEXIST;
}
- attached = rcu_dereference_protected(net->flow_dissector_prog,
- lockdep_is_held(&flow_dissector_mutex));
- if (attached == prog) {
+ attached = rcu_dereference_protected(net->bpf.progs[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (attached == prog)
/* The same program cannot be attached twice */
- ret = -EINVAL;
- goto out;
- }
- rcu_assign_pointer(net->flow_dissector_prog, prog);
+ return -EINVAL;
+
+ rcu_assign_pointer(net->bpf.progs[type], prog);
if (attached)
bpf_prog_put(attached);
-out:
- mutex_unlock(&flow_dissector_mutex);
- return ret;
-}
-
-int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
-{
- struct bpf_prog *attached;
- struct net *net;
-
- net = current->nsproxy->net_ns;
- mutex_lock(&flow_dissector_mutex);
- attached = rcu_dereference_protected(net->flow_dissector_prog,
- lockdep_is_held(&flow_dissector_mutex));
- if (!attached) {
- mutex_unlock(&flow_dissector_mutex);
- return -ENOENT;
- }
- RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
- bpf_prog_put(attached);
- mutex_unlock(&flow_dissector_mutex);
return 0;
}
+#endif /* CONFIG_BPF_SYSCALL */
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
@@ -464,47 +395,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, void *data, int nhoff, int hlen,
+ int lse_index, bool *entropy_label)
{
- struct flow_dissector_key_keyid *key_keyid;
- struct mpls_label *hdr, _hdr[2];
- u32 entry, label;
+ struct mpls_label *hdr, _hdr;
+ u32 entry, label, bos;
if (!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
return FLOW_DISSECT_RET_OUT_GOOD;
+ if (lse_index >= FLOW_DIS_MPLS_MAX)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
hlen, &_hdr);
if (!hdr)
return FLOW_DISSECT_RET_OUT_BAD;
- entry = ntohl(hdr[0].entry);
+ entry = ntohl(hdr->entry);
label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+ bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
struct flow_dissector_key_mpls *key_mpls;
+ struct flow_dissector_mpls_lse *lse;
key_mpls = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS,
target_container);
- key_mpls->mpls_label = label;
- key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK)
- >> MPLS_LS_TTL_SHIFT;
- key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK)
- >> MPLS_LS_TC_SHIFT;
- key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK)
- >> MPLS_LS_S_SHIFT;
+ lse = &key_mpls->ls[lse_index];
+
+ lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ lse->mpls_bos = bos;
+ lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+ lse->mpls_label = label;
+ dissector_set_mpls_lse(key_mpls, lse_index);
}
- if (label == MPLS_LABEL_ENTROPY) {
+ if (*entropy_label &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ struct flow_dissector_key_keyid *key_keyid;
+
key_keyid = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
target_container);
- key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK);
+ key_keyid->keyid = cpu_to_be32(label);
}
- return FLOW_DISSECT_RET_OUT_GOOD;
+
+ *entropy_label = label == MPLS_LABEL_ENTROPY;
+
+ return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}
static enum flow_dissect_ret
@@ -963,6 +906,8 @@ bool __skb_flow_dissect(const struct net *net,
struct bpf_prog *attached = NULL;
enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ bool mpls_el = false;
+ int mpls_lse = 0;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -1014,11 +959,13 @@ bool __skb_flow_dissect(const struct net *net,
WARN_ON_ONCE(!net);
if (net) {
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
+
rcu_read_lock();
- attached = rcu_dereference(init_net.flow_dissector_prog);
+ attached = rcu_dereference(init_net.bpf.progs[type]);
if (!attached)
- attached = rcu_dereference(net->flow_dissector_prog);
+ attached = rcu_dereference(net->bpf.progs[type]);
if (attached) {
struct bpf_flow_keys flow_keys;
@@ -1262,7 +1209,10 @@ proto_again:
case htons(ETH_P_MPLS_MC):
fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
target_container, data,
- nhoff, hlen);
+ nhoff, hlen, mpls_lse,
+ &mpls_el);
+ nhoff += sizeof(struct mpls_label);
+ mpls_lse++;
break;
case htons(ETH_P_FCOE):
if ((hlen - nhoff) < FCOE_HEADER_LEN) {
@@ -1838,5 +1788,4 @@ static int __init init_default_flow_dissectors(void)
ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
-
core_initcall(init_default_flow_dissectors);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index e951b743bed3..0cfc35e6be28 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -8,6 +8,7 @@
struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
+ int i;
rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
@@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
return NULL;
rule->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
return rule;
}
@@ -311,240 +317,159 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
}
EXPORT_SYMBOL(flow_block_cb_setup_simple);
-static LIST_HEAD(block_cb_list);
+static DEFINE_MUTEX(flow_indr_block_lock);
+static LIST_HEAD(flow_block_indr_list);
+static LIST_HEAD(flow_block_indr_dev_list);
-static struct rhashtable indr_setup_block_ht;
-
-struct flow_indr_block_cb {
- struct list_head list;
- void *cb_priv;
- flow_indr_block_bind_cb_t *cb;
- void *cb_ident;
-};
-
-struct flow_indr_block_dev {
- struct rhash_head ht_node;
- struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
-};
-
-static const struct rhashtable_params flow_indr_setup_block_ht_params = {
- .key_offset = offsetof(struct flow_indr_block_dev, dev),
- .head_offset = offsetof(struct flow_indr_block_dev, ht_node),
- .key_len = sizeof(struct net_device *),
+struct flow_indr_dev {
+ struct list_head list;
+ flow_indr_block_bind_cb_t *cb;
+ void *cb_priv;
+ refcount_t refcnt;
+ struct rcu_head rcu;
};
-static struct flow_indr_block_dev *
-flow_indr_block_dev_lookup(struct net_device *dev)
-{
- return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
- flow_indr_setup_block_ht_params);
-}
-
-static struct flow_indr_block_dev *
-flow_indr_block_dev_get(struct net_device *dev)
+static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
{
- struct flow_indr_block_dev *indr_dev;
+ struct flow_indr_dev *indr_dev;
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (indr_dev)
- goto inc_ref;
-
- indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+ indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL);
if (!indr_dev)
return NULL;
- INIT_LIST_HEAD(&indr_dev->cb_list);
- indr_dev->dev = dev;
- if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- flow_indr_setup_block_ht_params)) {
- kfree(indr_dev);
- return NULL;
- }
+ indr_dev->cb = cb;
+ indr_dev->cb_priv = cb_priv;
+ refcount_set(&indr_dev->refcnt, 1);
-inc_ref:
- indr_dev->refcnt++;
return indr_dev;
}
-static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev)
-{
- if (--indr_dev->refcnt)
- return;
-
- rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- flow_indr_setup_block_ht_params);
- kfree(indr_dev);
-}
-
-static struct flow_indr_block_cb *
-flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev,
- flow_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct flow_indr_block_cb *indr_block_cb;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- if (indr_block_cb->cb == cb &&
- indr_block_cb->cb_ident == cb_ident)
- return indr_block_cb;
- return NULL;
-}
-
-static struct flow_indr_block_cb *
-flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb, void *cb_ident)
+int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
{
- struct flow_indr_block_cb *indr_block_cb;
+ struct flow_indr_dev *indr_dev;
- indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (indr_block_cb)
- return ERR_PTR(-EEXIST);
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) {
+ if (indr_dev->cb == cb &&
+ indr_dev->cb_priv == cb_priv) {
+ refcount_inc(&indr_dev->refcnt);
+ mutex_unlock(&flow_indr_block_lock);
+ return 0;
+ }
+ }
- indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
- if (!indr_block_cb)
- return ERR_PTR(-ENOMEM);
+ indr_dev = flow_indr_dev_alloc(cb, cb_priv);
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return -ENOMEM;
+ }
- indr_block_cb->cb_priv = cb_priv;
- indr_block_cb->cb = cb;
- indr_block_cb->cb_ident = cb_ident;
- list_add(&indr_block_cb->list, &indr_dev->cb_list);
+ list_add(&indr_dev->list, &flow_block_indr_dev_list);
+ mutex_unlock(&flow_indr_block_lock);
- return indr_block_cb;
+ return 0;
}
+EXPORT_SYMBOL(flow_indr_dev_register);
-static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
+static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv,
+ struct list_head *cleanup_list)
{
- list_del(&indr_block_cb->list);
- kfree(indr_block_cb);
-}
-
-static DEFINE_MUTEX(flow_indr_block_cb_lock);
+ struct flow_block_cb *this, *next;
-static void flow_block_cmd(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb, void *cb_priv,
- enum flow_block_command command)
-{
- struct flow_indr_block_entry *entry;
-
- mutex_lock(&flow_indr_block_cb_lock);
- list_for_each_entry(entry, &block_cb_list, list) {
- entry->cb(dev, cb, cb_priv, command);
+ list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
+ if (this->cb == setup_cb &&
+ this->cb_priv == cb_priv) {
+ list_move(&this->indr.list, cleanup_list);
+ return;
+ }
}
- mutex_unlock(&flow_indr_block_cb_lock);
}
-int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+static void flow_block_indr_notify(struct list_head *cleanup_list)
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
- int err;
-
- indr_dev = flow_indr_block_dev_get(dev);
- if (!indr_dev)
- return -ENOMEM;
-
- indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
- err = PTR_ERR_OR_ZERO(indr_block_cb);
- if (err)
- goto err_dev_put;
-
- flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
- FLOW_BLOCK_BIND);
-
- return 0;
+ struct flow_block_cb *this, *next;
-err_dev_put:
- flow_indr_block_dev_put(indr_dev);
- return err;
-}
-EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);
-
-int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
-{
- int err;
-
- rtnl_lock();
- err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
- rtnl_unlock();
-
- return err;
+ list_for_each_entry_safe(this, next, cleanup_list, indr.list) {
+ list_del(&this->indr.list);
+ this->indr.cleanup(this);
+ }
}
-EXPORT_SYMBOL_GPL(flow_indr_block_cb_register);
-void __flow_indr_block_cb_unregister(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ flow_setup_cb_t *setup_cb)
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
+ struct flow_indr_dev *this, *next, *indr_dev = NULL;
+ LIST_HEAD(cleanup_list);
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) {
+ if (this->cb == cb &&
+ this->cb_priv == cb_priv &&
+ refcount_dec_and_test(&this->refcnt)) {
+ indr_dev = this;
+ list_del(&indr_dev->list);
+ break;
+ }
+ }
- indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (!indr_block_cb)
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
return;
+ }
- flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
- FLOW_BLOCK_UNBIND);
+ __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list);
+ mutex_unlock(&flow_indr_block_lock);
- flow_indr_block_cb_del(indr_block_cb);
- flow_indr_block_dev_put(indr_dev);
+ flow_block_indr_notify(&cleanup_list);
+ kfree(indr_dev);
}
-EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister);
+EXPORT_SYMBOL(flow_indr_dev_unregister);
-void flow_indr_block_cb_unregister(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+static void flow_block_indr_init(struct flow_block_cb *flow_block,
+ struct flow_block_offload *bo,
+ struct net_device *dev, void *data,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- rtnl_lock();
- __flow_indr_block_cb_unregister(dev, cb, cb_ident);
- rtnl_unlock();
+ flow_block->indr.binder_type = bo->binder_type;
+ flow_block->indr.data = data;
+ flow_block->indr.dev = dev;
+ flow_block->indr.cleanup = cleanup;
}
-EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
-void flow_indr_block_call(struct net_device *dev,
- struct flow_block_offload *bo,
- enum flow_block_command command,
- enum tc_setup_type type)
+static void __flow_block_indr_binding(struct flow_block_offload *bo,
+ struct net_device *dev, void *data,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
-
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ struct flow_block_cb *block_cb;
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo);
+ list_for_each_entry(block_cb, &bo->cb_list, list) {
+ switch (bo->command) {
+ case FLOW_BLOCK_BIND:
+ flow_block_indr_init(block_cb, bo, dev, data, cleanup);
+ list_add(&block_cb->indr.list, &flow_block_indr_list);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ list_del(&block_cb->indr.list);
+ break;
+ }
+ }
}
-EXPORT_SYMBOL_GPL(flow_indr_block_call);
-void flow_indr_add_block_cb(struct flow_indr_block_entry *entry)
+int flow_indr_dev_setup_offload(struct net_device *dev,
+ enum tc_setup_type type, void *data,
+ struct flow_block_offload *bo,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- mutex_lock(&flow_indr_block_cb_lock);
- list_add_tail(&entry->list, &block_cb_list);
- mutex_unlock(&flow_indr_block_cb_lock);
-}
-EXPORT_SYMBOL_GPL(flow_indr_add_block_cb);
+ struct flow_indr_dev *this;
-void flow_indr_del_block_cb(struct flow_indr_block_entry *entry)
-{
- mutex_lock(&flow_indr_block_cb_lock);
- list_del(&entry->list);
- mutex_unlock(&flow_indr_block_cb_lock);
-}
-EXPORT_SYMBOL_GPL(flow_indr_del_block_cb);
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry(this, &flow_block_indr_dev_list, list)
+ this->cb(dev, this->cb_priv, type, bo);
-static int __init init_flow_indr_rhashtable(void)
-{
- return rhashtable_init(&indr_setup_block_ht,
- &flow_indr_setup_block_ht_params);
+ __flow_block_indr_binding(bo, dev, data, cleanup);
+ mutex_unlock(&flow_indr_block_lock);
+
+ return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0;
}
-subsys_initcall(init_flow_indr_rhashtable);
+EXPORT_SYMBOL(flow_indr_dev_setup_offload);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 1d653fbfcf52..e491b083b348 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -6,7 +6,7 @@
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * See Documentation/networking/gen_stats.txt
+ * See Documentation/networking/gen_stats.rst
*/
#include <linux/types.h>
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index f153e0601838..75431ca9300f 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock);
static unsigned char default_operstate(const struct net_device *dev)
{
+ if (netif_testing(dev))
+ return IF_OPER_TESTING;
+
if (!netif_carrier_ok(dev))
return (dev->ifindex != dev_get_iflink(dev) ?
IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
@@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev)
write_lock_bh(&dev_base_lock);
switch(dev->link_mode) {
+ case IF_LINK_MODE_TESTING:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
-
case IF_LINK_MODE_DEFAULT:
default:
break;
@@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev)
void linkwatch_init_dev(struct net_device *dev)
{
/* Handle pre-registration link state changes */
- if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
+ netif_testing(dev))
rfc2863_policy(dev);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 39d37d0ef575..ef6b5a8f629c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1082,8 +1082,8 @@ static void neigh_timer_handler(struct timer_list *t)
}
if (neigh->nud_state & NUD_IN_TIMER) {
- if (time_before(next, jiffies + HZ/2))
- next = jiffies + HZ/2;
+ if (time_before(next, jiffies + HZ/100))
+ next = jiffies + HZ/100;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
@@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family)
}
const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
[NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
@@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = {
[NDA_IFINDEX] = { .type = NLA_U32 },
[NDA_MASTER] = { .type = NLA_U32 },
[NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
};
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1956,6 +1958,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
}
+ if (protocol)
+ neigh->protocol = protocol;
+
if (ndm->ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
@@ -1969,9 +1974,6 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
NETLINK_CB(skb).portid, extack);
- if (protocol)
- neigh->protocol = protocol;
-
neigh_release(neigh);
out:
@@ -3379,7 +3381,7 @@ EXPORT_SYMBOL(neigh_app_ns);
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
static int proc_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
struct ctl_table tmp = *ctl;
@@ -3443,8 +3445,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
}
static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct ctl_table tmp = *ctl;
int ret;
@@ -3457,8 +3459,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3467,8 +3469,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3479,8 +3480,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3489,8 +3490,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
}
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3500,8 +3500,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
@@ -3510,8 +3510,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
}
static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct neigh_parms *p = ctl->extra2;
int ret;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index cf0215734ceb..e353b822bb15 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -80,7 +80,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
unsigned long new;
- int ret = -EINVAL;
+ int ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev,
}
static DEVICE_ATTR_RO(duplex);
+static ssize_t testing_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sprintf(buf, fmt_dec, !!netif_testing(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(testing);
+
static ssize_t dormant_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -260,7 +272,7 @@ static const char *const operstates[] = {
"notpresent", /* currently unused */
"down",
"lowerlayerdown",
- "testing", /* currently unused */
+ "testing",
"dormant",
"up"
};
@@ -355,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- dev->gro_flush_timeout = val;
+ WRITE_ONCE(dev->gro_flush_timeout, val);
return 0;
}
@@ -370,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
+{
+ WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ return 0;
+}
+
+static ssize_t napi_defer_hard_irqs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+}
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -524,6 +553,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_speed.attr,
&dev_attr_duplex.attr,
&dev_attr_dormant.attr,
+ &dev_attr_testing.attr,
&dev_attr_operstate.attr,
&dev_attr_carrier_changes.attr,
&dev_attr_ifalias.attr,
@@ -532,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_defer_hard_irqs.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
@@ -1774,12 +1805,12 @@ static struct class net_class __ro_after_init = {
#ifdef CONFIG_OF_NET
static int of_dev_node_match(struct device *dev, const void *data)
{
- int ret = 0;
-
- if (dev->parent)
- ret = dev->parent->of_node == data;
+ for (; dev; dev = dev->parent) {
+ if (dev->of_node == data)
+ return 1;
+ }
- return ret == 0 ? dev->of_node == data : ret;
+ return 0;
}
/*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 190ca66a383b..dcd61aca343e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1353,12 +1353,13 @@ static void netns_put(struct ns_common *ns)
put_net(to_net_ns(ns));
}
-static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b4c87fe31be2..41b24cd31562 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -127,10 +127,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
+ while ((p = css_task_iter_next(&it)))
update_classid_task(p, cs->classid);
- cond_resched();
- }
css_task_iter_end(&it);
return 0;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 849380a622ef..093e90e52bc2 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644);
#define np_notice(np, fmt, ...) \
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
-static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
- struct netdev_queue *txq)
+static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netdev_queue *txq)
{
- int status = NETDEV_TX_OK;
+ netdev_tx_t status = NETDEV_TX_OK;
netdev_features_t features;
features = netif_skb_features(skb);
@@ -304,20 +305,22 @@ static int netpoll_owner_active(struct net_device *dev)
}
/* call with IRQ disabled */
-void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
- struct net_device *dev)
+static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status = NETDEV_TX_BUSY;
+ netdev_tx_t status = NETDEV_TX_BUSY;
+ struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
lockdep_assert_irqs_disabled();
- npinfo = rcu_dereference_bh(np->dev->npinfo);
+ dev = np->dev;
+ npinfo = rcu_dereference_bh(dev->npinfo);
+
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return;
+ return NET_XMIT_DROP;
}
/* don't get messages out of order, and no recursion */
@@ -356,8 +359,25 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
+ return NETDEV_TX_OK;
+}
+
+netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+ unsigned long flags;
+ netdev_tx_t ret;
+
+ if (unlikely(!np)) {
+ dev_kfree_skb_irq(skb);
+ ret = NET_XMIT_DROP;
+ } else {
+ local_irq_save(flags);
+ ret = __netpoll_send_skb(np, skb);
+ local_irq_restore(flags);
+ }
+ return ret;
}
-EXPORT_SYMBOL(netpoll_send_skb_on_dev);
+EXPORT_SYMBOL(netpoll_send_skb);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 8881dd943dd0..9bd4cab7d510 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -236,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct task_struct *p;
struct cgroup_subsys_state *css;
+ cgroup_sk_alloc_disable();
+
cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->id;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 08e2811b5274..b53b6d38c4df 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -56,7 +56,7 @@
* Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
*
* 021124 Finished major redesign and rewrite for new functionality.
- * See Documentation/networking/pktgen.txt for how to use this.
+ * See Documentation/networking/pktgen.rst for how to use this.
*
* The new operation:
* For each CPU one thread/process is created at start. This process checks
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 709ebbf8ab5b..2269199c5891 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
switch (transition) {
case IF_OPER_UP:
if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_TESTING ||
operstate == IF_OPER_UNKNOWN) &&
- !netif_dormant(dev))
+ !netif_dormant(dev) && !netif_testing(dev))
operstate = IF_OPER_UP;
break;
+ case IF_OPER_TESTING:
+ if (operstate == IF_OPER_UP ||
+ operstate == IF_OPER_UNKNOWN)
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_OPER_DORMANT:
if (operstate == IF_OPER_UP ||
operstate == IF_OPER_UNKNOWN)
@@ -3990,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct net_device *dev;
- int err = -EINVAL;
__u8 *addr;
+ int err;
u16 vid;
if (!netlink_capable(skb, CAP_NET_ADMIN))
diff --git a/net/core/scm.c b/net/core/scm.c
index dc6fed1f221c..875df1c2989d 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
- struct cmsghdr __user *cm
- = (__force struct cmsghdr __user *)msg->msg_control;
- struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
- int err;
- if (MSG_CMSG_COMPAT & msg->msg_flags)
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
return put_cmsg_compat(msg, level, type, len, data);
- if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
@@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
- cmhdr.cmsg_level = level;
- cmhdr.cmsg_type = type;
- cmhdr.cmsg_len = cmlen;
-
- err = -EFAULT;
- if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
- goto out;
- if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
- goto out;
- cmlen = CMSG_SPACE(len);
- if (msg->msg_controllen < cmlen)
- cmlen = msg->msg_controllen;
+
+ if (msg->msg_control_is_user) {
+ struct cmsghdr __user *cm = msg->msg_control_user;
+ struct cmsghdr cmhdr;
+
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr) ||
+ copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm)))
+ return -EFAULT;
+ } else {
+ struct cmsghdr *cm = msg->msg_control;
+
+ cm->cmsg_level = level;
+ cm->cmsg_type = type;
+ cm->cmsg_len = cmlen;
+ memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm));
+ }
+
+ cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
- err = 0;
-out:
- return err;
+ return 0;
}
EXPORT_SYMBOL(put_cmsg);
@@ -277,78 +280,90 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter
}
EXPORT_SYMBOL(put_cmsg_scm_timestamping);
+static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags)
+{
+ struct socket *sock;
+ int new_fd;
+ int error;
+
+ error = security_file_receive(file);
+ if (error)
+ return error;
+
+ new_fd = get_unused_fd_flags(o_flags);
+ if (new_fd < 0)
+ return new_fd;
+
+ error = put_user(new_fd, ufd);
+ if (error) {
+ put_unused_fd(new_fd);
+ return error;
+ }
+
+ /* Bump the usage count and install the file. */
+ sock = sock_from_file(file, &error);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+ fd_install(new_fd, get_file(file));
+ return 0;
+}
+
+static int scm_max_fds(struct msghdr *msg)
+{
+ if (msg->msg_controllen <= sizeof(struct cmsghdr))
+ return 0;
+ return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
+}
+
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
struct cmsghdr __user *cm
= (__force struct cmsghdr __user*)msg->msg_control;
-
- int fdmax = 0;
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
+ int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_USER_DATA(cm);
int err = 0, i;
- if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
scm_detach_fds_compat(msg, scm);
return;
}
- if (msg->msg_controllen > sizeof(struct cmsghdr))
- fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
- / sizeof(int));
-
- if (fdnum < fdmax)
- fdmax = fdnum;
+ /* no use for FD passing from kernel space callers */
+ if (WARN_ON_ONCE(!msg->msg_control_is_user))
+ return;
- for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
- i++, cmfptr++)
- {
- struct socket *sock;
- int new_fd;
- err = security_file_receive(fp[i]);
+ for (i = 0; i < fdmax; i++) {
+ err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err)
break;
- err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
- ? O_CLOEXEC : 0);
- if (err < 0)
- break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- sock = sock_from_file(fp[i], &err);
- if (sock) {
- sock_update_netprioidx(&sock->sk->sk_cgrp_data);
- sock_update_classid(&sock->sk->sk_cgrp_data);
- }
- fd_install(new_fd, get_file(fp[i]));
}
- if (i > 0)
- {
- int cmlen = CMSG_LEN(i*sizeof(int));
+ if (i > 0) {
+ int cmlen = CMSG_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
- cmlen = CMSG_SPACE(i*sizeof(int));
+ cmlen = CMSG_SPACE(i * sizeof(int));
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum || (fdnum && fdmax <= 0))
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7b6b1d2c3d10..b5bc680d4755 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -5,7 +5,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/cryptohash.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/random.h>
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7e29590482ce..b8afefe6f6b6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
const char msg[])
{
- pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
(unsigned long)skb->tail, (unsigned long)skb->end,
skb->dev ? skb->dev->name : "<NULL>");
@@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
return 0;
}
-EXPORT_SYMBOL_GPL(skb_gro_receive_list);
/**
* skb_segment - Perform protocol segmentation on skb.
@@ -4191,7 +4190,6 @@ done:
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE 8
@@ -6087,13 +6085,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
/**
* __skb_ext_alloc - allocate a new skb extensions storage
*
+ * @flags: See kmalloc().
+ *
* Returns the newly allocated pointer. The pointer can later attached to a
* skb via __skb_ext_set().
* Note: caller must handle the skb_ext as an opaque data.
*/
-struct skb_ext *__skb_ext_alloc(void)
+struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
- struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
if (new) {
memset(new->offset, 0, sizeof(new->offset));
@@ -6188,7 +6188,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
} else {
newoff = SKB_EXT_CHUNKSIZEOF(*new);
- new = __skb_ext_alloc();
+ new = __skb_ext_alloc(GFP_ATOMIC);
if (!new)
return NULL;
}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index c479372f2cd2..351afbf6bfba 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -7,6 +7,7 @@
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
@@ -682,13 +683,75 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
return container_of(parser, struct sk_psock, parser);
}
-static void sk_psock_verdict_apply(struct sk_psock *psock,
- struct sk_buff *skb, int verdict)
+static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
+ sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ if (unlikely(!sk_other)) {
+ kfree_skb(skb);
+ return;
+ }
+ psock_other = sk_psock(sk_other);
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ ingress = tcp_skb_bpf_ingress(skb);
+ if ((!ingress && sock_writeable(sk_other)) ||
+ (ingress &&
+ atomic_read(&sk_other->sk_rmem_alloc) <=
+ sk_other->sk_rcvbuf)) {
+ if (!ingress)
+ skb_set_owner_w(skb, sk_other);
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
+ } else {
+ kfree_skb(skb);
+ }
+}
+
+static void sk_psock_tls_verdict_apply(struct sk_psock *psock,
+ struct sk_buff *skb, int verdict)
+{
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_psock_skb_redirect(psock, skb);
+ break;
+ case __SK_PASS:
+ case __SK_DROP:
+ default:
+ break;
+ }
+}
+
+int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct bpf_prog *prog;
+ int ret = __SK_PASS;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ tcp_skb_bpf_redirect_clear(skb);
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ }
+ rcu_read_unlock();
+ sk_psock_tls_verdict_apply(psock, skb, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
+
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+ struct sk_buff *skb, int verdict)
+{
+ struct sock *sk_other;
+
switch (verdict) {
case __SK_PASS:
sk_other = psock->sk;
@@ -707,25 +770,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
}
goto out_free;
case __SK_REDIRECT:
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
- if (unlikely(!sk_other))
- goto out_free;
- psock_other = sk_psock(sk_other);
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
- goto out_free;
- ingress = tcp_skb_bpf_ingress(skb);
- if ((!ingress && sock_writeable(sk_other)) ||
- (ingress &&
- atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf)) {
- if (!ingress)
- skb_set_owner_w(skb, sk_other);
- skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_work(&psock_other->work);
- break;
- }
- /* fall-through */
+ sk_psock_skb_redirect(psock, skb);
+ break;
case __SK_DROP:
/* fall-through */
default:
@@ -779,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
- write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
- write_unlock_bh(&sk->sk_callback_lock);
+ if (tls_sw_has_ctx_rx(sk)) {
+ psock->parser.saved_data_ready(sk);
+ } else {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->parser.strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
}
rcu_read_unlock();
}
diff --git a/net/core/sock.c b/net/core/sock.c
index ce1d8dce9b7a..6c4acf1f0220 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -594,6 +594,20 @@ out:
return ret;
}
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
+{
+ int ret;
+
+ if (lock_sk)
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
int optlen)
{
@@ -634,10 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
goto out;
}
- lock_sock(sk);
- ret = sock_setbindtodevice_locked(sk, index);
- release_sock(sk);
-
+ return sock_bindtoindex(sk, index, true);
out:
#endif
@@ -712,6 +723,111 @@ bool sk_mc_loop(struct sock *sk)
}
EXPORT_SYMBOL(sk_mc_loop);
+void sock_set_reuseaddr(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_lingertime = 0;
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+ lock_sock(sk);
+ sk->sk_priority = priority;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs)
+{
+ lock_sock(sk);
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ sk->sk_sndtimeo = secs * HZ;
+ else
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+ if (val) {
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ sock_reset_flag(sk, SOCK_TSTAMP_NEW);
+ }
+}
+
+void sock_enable_timestamps(struct sock *sk)
+{
+ lock_sock(sk);
+ __sock_set_timestamps(sk, true, false, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_enable_timestamps);
+
+void sock_set_keepalive(struct sock *sk)
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
@@ -808,30 +924,7 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
- /* Ensure val * 2 fits into an int, to prevent max_t()
- * from treating it as a negative value.
- */
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
+ __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
break;
case SO_RCVBUFFORCE:
@@ -843,9 +936,8 @@ set_rcvbuf:
/* No negative values (to prevent underflow, as val will be
* multiplied by 2).
*/
- if (val < 0)
- val = 0;
- goto set_rcvbuf;
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
case SO_KEEPALIVE:
if (sk->sk_prot->keepalive)
@@ -903,28 +995,17 @@ set_rcvbuf:
break;
case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
case SO_TIMESTAMPNS_NEW:
- if (valbool) {
- if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- else
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
- if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- else
- sock_set_flag(sk, SOCK_RCVTSTAMPNS);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
- }
+ __sock_set_timestamps(sk, valbool, true, true);
break;
-
case SO_TIMESTAMPING_NEW:
sock_set_flag(sk, SOCK_TSTAMP_NEW);
/* fall through */
@@ -1152,27 +1233,35 @@ set_rcvbuf:
break;
case SO_TXTIME:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else if (optlen != sizeof(struct sock_txtime)) {
+ if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL;
+ break;
} else if (copy_from_user(&sk_txtime, optval,
sizeof(struct sock_txtime))) {
ret = -EFAULT;
+ break;
} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL;
- } else {
- sock_valbool_flag(sk, SOCK_TXTIME, true);
- sk->sk_clockid = sk_txtime.clockid;
- sk->sk_txtime_deadline_mode =
- !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
- sk->sk_txtime_report_errors =
- !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+ }
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safe guards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
}
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
break;
case SO_BINDTOIFINDEX:
- ret = sock_setbindtodevice_locked(sk, val);
+ ret = sock_bindtoindex_locked(sk, val);
break;
default:
@@ -1872,7 +1961,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* as not suitable for copying when cloning.
*/
if (sk_user_data_is_nocopy(newsk))
- RCU_INIT_POINTER(newsk->sk_user_data, NULL);
+ newsk->sk_user_data = NULL;
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
@@ -2364,7 +2453,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
@@ -3626,3 +3714,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index b08dfae10f88..00a26cf2cfe9 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
- return __sock_map_lookup_elem(map, *(u32 *)key);
+ struct sock *sk;
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk || !sk_fullsock(sk))
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
}
static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
@@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
static void *sock_hash_lookup(struct bpf_map *map, void *key)
{
- return __sock_hash_lookup_elem(map, key);
+ struct sock *sk;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk || !sk_fullsock(sk))
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
}
static void sock_hash_release_progs(struct bpf_map *map)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 9f9e00ba3ad7..f93f8ace6c56 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -23,6 +23,7 @@
#include <net/pkt_sched.h>
static int two __maybe_unused = 2;
+static int three = 3;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
@@ -39,13 +40,14 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
* IPv6: reset all settings to default
* 1 - Both inherit all current settings from init_net
* 2 - Both reset all settings to default
+ * 3 - Both inherit all settings from current netns
*/
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
@@ -115,8 +117,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
static DEFINE_MUTEX(flow_limit_update_mutex);
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
struct softnet_data *sd;
@@ -127,7 +128,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
return -ENOMEM;
if (write) {
- ret = cpumask_parse_user(buffer, *lenp, mask);
+ ret = cpumask_parse(buffer, mask);
if (ret)
goto done;
@@ -180,10 +181,7 @@ write_unlock:
}
if (len < *lenp)
kbuf[len++] = '\n';
- if (copy_to_user(buffer, kbuf, len)) {
- ret = -EFAULT;
- goto done;
- }
+ memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
}
@@ -194,8 +192,7 @@ done:
}
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
int ret;
@@ -217,7 +214,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_SCHED
static int set_default_qdisc(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
struct ctl_table tbl = {
@@ -236,7 +233,7 @@ static int set_default_qdisc(struct ctl_table *table, int write,
#endif
static int proc_do_dev_weight(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -251,7 +248,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,
}
static int proc_do_rss_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
@@ -264,7 +261,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
#ifdef CONFIG_BPF_JIT
static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, jit_enable = *(int *)table->data;
@@ -291,8 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
# ifdef CONFIG_HAVE_EBPF_JIT
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -303,8 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -560,7 +555,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = &three,
},
{
.procname = "high_order_alloc_disable",
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 4c7ea85486af..90f44f382115 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -11,11 +11,13 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
+#include <linux/bug.h>
#include <net/page_pool.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -108,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator)
mutex_unlock(&mem_id_lock);
}
-static void mem_id_disconnect(int id)
-{
- struct xdp_mem_allocator *xa;
-
- mutex_lock(&mem_id_lock);
-
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
- return;
- }
-
- trace_mem_disconnect(xa);
-
- if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
-
- mutex_unlock(&mem_id_lock);
-}
-
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -142,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
- if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
- return mem_id_disconnect(id);
-
if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
@@ -300,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->mem.type = type;
if (!allocator) {
- if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ if (type == MEM_TYPE_PAGE_POOL)
return -EINVAL; /* Setup time check page_pool req */
return 0;
}
@@ -357,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
- * of xdp_frames/pages in those cases.
+ * of xdp_frames/pages in those cases. This path is never used by the
+ * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
+ * the switch-statement.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- unsigned long handle)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -382,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
- case MEM_TYPE_ZERO_COPY:
- /* NB! Only valid from an xdp_buff! */
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
- rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+ __xdp_return(xdpf->data, &xdpf->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
void xdp_return_buff(struct xdp_buff *xdp)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+ __xdp_return(xdp->data, &xdp->rxq->mem, true);
}
-EXPORT_SYMBOL_GPL(xdp_return_buff);
/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
@@ -492,7 +464,14 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->metasize = metasize;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- xdp_return_buff(xdp);
+ xsk_buff_free(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
+
+/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line)
+{
+ WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
+};
+EXPORT_SYMBOL_GPL(xdp_warn);