Diffstat (limited to 'net/core')
-rw-r--r--  net/core/datagram.c        |  12
-rw-r--r--  net/core/dev.c             | 125
-rw-r--r--  net/core/dev_ioctl.c       |  20
-rw-r--r--  net/core/devlink.c         |  10
-rw-r--r--  net/core/drop_monitor.c    |  11
-rw-r--r--  net/core/ethtool.c         |  12
-rw-r--r--  net/core/fib_rules.c       |   4
-rw-r--r--  net/core/filter.c          | 132
-rw-r--r--  net/core/flow_dissector.c  |  25
-rw-r--r--  net/core/gen_estimator.c   |  11
-rw-r--r--  net/core/link_watch.c      |   5
-rw-r--r--  net/core/lwt_bpf.c         |  15
-rw-r--r--  net/core/lwtunnel.c        |   4
-rw-r--r--  net/core/neighbour.c       |  67
-rw-r--r--  net/core/net-procfs.c      |  38
-rw-r--r--  net/core/net-sysfs.c       | 125
-rw-r--r--  net/core/net_namespace.c   |  48
-rw-r--r--  net/core/netpoll.c         |  22
-rw-r--r--  net/core/page_pool.c       |   6
-rw-r--r--  net/core/pktgen.c          |   2
-rw-r--r--  net/core/rtnetlink.c       |  29
-rw-r--r--  net/core/secure_seq.c      |  16
-rw-r--r--  net/core/skbuff.c          |  55
-rw-r--r--  net/core/skmsg.c           | 111
-rw-r--r--  net/core/sock.c            |  61
-rw-r--r--  net/core/sock_map.c        |   2
-rw-r--r--  net/core/sock_reuseport.c  |   2
-rw-r--r--  net/core/stream.c          |   3
-rw-r--r--  net/core/sysctl_net_core.c |   2
29 files changed, 719 insertions(+), 256 deletions(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 189ad4c73a3f..b0488f30f2c4 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -700,8 +700,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
- return __skb_datagram_iter(skb, offset, to, len, true,
- csum_and_copy_to_iter, csump);
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
+
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
+
+ *csump = csdata.csum;
+ return 0;
}
/**
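
The datagram.c hunk above stops writing the running checksum through the caller's pointer while the copy is still in flight: the sum accumulates in a local struct csum_state and is committed to *csump only once the whole copy succeeded. A self-contained user-space sketch of the same commit-on-success pattern (toy checksum and illustrative names, not the kernel's csum machinery):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct csum_state { uint32_t csum; };

/* Accumulate into local state; publish to the caller only on success,
 * so a failed copy cannot leave *csump half-updated.
 */
static int copy_and_csum(void *dst, size_t avail, const void *src,
                         size_t len, uint32_t *csump)
{
        struct csum_state cs = { .csum = *csump };
        const uint8_t *p = src;
        size_t i;

        if (len > avail)
                return -1;              /* error path: *csump untouched */

        for (i = 0; i < len; i++)
                cs.csum += p[i];        /* toy sum, not RFC 1071 folding */

        memcpy(dst, src, len);
        *csump = cs.csum;               /* commit only after full success */
        return 0;
}
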
diff --git a/net/core/dev.c b/net/core/dev.c
index 20c7fd7b8b4b..a03036456221 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2589,6 +2589,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ dev_qdisc_change_real_num_tx(dev, txq);
+
dev->real_num_tx_queues = txq;
if (disabling) {
@@ -2787,6 +2789,12 @@ static u16 skb_tx_hash(const struct net_device *dev,
qoffset = sb_dev->tc_to_txq[tc].offset;
qcount = sb_dev->tc_to_txq[tc].count;
+ if (unlikely(!qcount)) {
+ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+ sb_dev->name, qoffset, tc);
+ qoffset = 0;
+ qcount = dev->real_num_tx_queues;
+ }
}
if (skb_rx_queue_recorded(skb)) {
@@ -3384,7 +3392,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (q->flags & TCQ_F_NOLOCK) {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- qdisc_run(q);
+ if (likely(!netif_xmit_frozen_or_stopped(txq)))
+ qdisc_run(q);
if (unlikely(to_free))
kfree_skb_list(to_free);
@@ -3480,7 +3489,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
WARN_ON(!skb_dst(skb));
skb_dst_force(skb);
netif_rx_ni(skb);
@@ -3756,7 +3766,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- if (txq->xmit_lock_owner != cpu) {
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
if (dev_xmit_recursion())
goto recursion_alert;
@@ -4515,25 +4528,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
+ rcu_read_lock();
+
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
- if (!(q->flags & TCQ_F_NOLOCK)) {
- root_lock = qdisc_lock(q);
- spin_lock(root_lock);
- }
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ /* There is a synchronize_net() between
+ * STATE_DEACTIVATED flag being set and
+ * qdisc_reset()/some_qdisc_is_busy() in
+ * dev_deactivate(), so we can safely bail out
+ * early here to avoid data race between
+ * qdisc_deactivate() and some_qdisc_is_busy()
+ * for lockless qdisc.
+ */
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ continue;
+ }
+
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
}
+
+ rcu_read_unlock();
}
xfrm_dev_backlog(sd);
@@ -5275,10 +5306,11 @@ static void gro_normal_list(struct napi_struct *napi)
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
list_add_tail(&skb->list, &napi->rx_list);
- if (++napi->rx_count >= gro_normal_batch)
+ napi->rx_count += segs;
+ if (napi->rx_count >= gro_normal_batch)
gro_normal_list(napi);
}
@@ -5317,7 +5349,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
}
out:
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
return NET_RX_SUCCESS;
}
@@ -5394,7 +5426,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
return head;
}
-static void skb_gro_reset_offset(struct sk_buff *skb)
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
@@ -5405,7 +5437,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
pinfo->nr_frags &&
- !PageHighMem(skb_frag_page(frag0))) {
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
@@ -5608,7 +5641,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
{
switch (ret) {
case GRO_NORMAL:
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -5638,7 +5671,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
@@ -5696,7 +5729,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -5731,7 +5764,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
napi->skb = NULL;
skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, hlen);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
@@ -5951,11 +5984,18 @@ EXPORT_SYMBOL(napi_schedule_prep);
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
- * Variant of __napi_schedule() assuming hard irqs are masked
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -8143,6 +8183,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
}
EXPORT_SYMBOL(dev_set_mac_address);
+static DECLARE_RWSEM(dev_addr_sem);
+
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ ret = dev_set_mac_address(dev, sa, extack);
+ up_write(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_get_mac_address);
+
/**
* dev_change_carrier - Change device carrier
* @dev: device
@@ -8692,6 +8774,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
return features;
}
@@ -10073,7 +10160,7 @@ static void __net_exit default_device_exit(struct net *net)
continue;
/* Leave virtual devices for the generic cleanup */
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
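
Among the dev.c hunks above, the skb_tx_hash() change guards against a traffic class whose tc_to_txq entry advertises zero queues; scaling a hash into an empty range would select a nonexistent queue. A minimal sketch of that guard, with reciprocal_scale() spelled out the way the kernel defines it:

#include <stdint.h>

static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
        /* maps val into [0, ep_ro) without a division */
        return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

static uint16_t pick_txq(uint32_t hash, uint16_t qoffset, uint16_t qcount,
                         uint16_t real_num_tx_queues)
{
        if (qcount == 0) {      /* misconfigured tc mapping: fall back */
                qoffset = 0;
                qcount = real_num_tx_queues;
        }
        return qoffset + (uint16_t)reciprocal_scale(hash, qcount);
}

The READ_ONCE(txq->xmit_lock_owner) hunk is the usual lockless-read annotation: other CPUs write that field concurrently, so a single annotated load keeps the comparison free of compiler tearing and re-reads.
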
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 5163d900bb4f..69fb9219d51d 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -122,17 +122,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
ifr->ifr_mtu = dev->mtu;
return 0;
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0,
- sizeof(ifr->ifr_hwaddr.sa_data));
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof(ifr->ifr_hwaddr.sa_data),
- (size_t)dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
case SIOCGIFSLAVE:
err = -EINVAL;
break;
@@ -246,7 +235,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFHWADDR:
if (dev->addr_len > sizeof(struct sockaddr))
return -EINVAL;
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -396,6 +385,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
*/
switch (cmd) {
+ case SIOCGIFHWADDR:
+ dev_load(net, ifr->ifr_name);
+ ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name);
+ if (colon)
+ *colon = ':';
+ return ret;
/*
* These ioctl calls:
* - can be done by all.
@@ -405,7 +400,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
- case SIOCGIFHWADDR:
case SIOCGIFSLAVE:
case SIOCGIFMAP:
case SIOCGIFINDEX:
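
The dev_ioctl.c hunks move SIOCGIFHWADDR out of dev_ifsioc_locked() so the read can take the new dev_addr_sem and stop racing with concurrent address writes. From user space the ioctl is unchanged; a minimal example (the interface name is illustrative):

#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr = { 0 };
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);    /* example name */
        if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0) {
                const unsigned char *m =
                        (const unsigned char *)ifr.ifr_hwaddr.sa_data;
                printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                       m[0], m[1], m[2], m[3], m[4], m[5]);
        }
        close(fd);
        return 0;
}
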
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 26c8993a17ae..0ac02cab3087 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -562,6 +562,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
+ /* Hold rtnl lock while accessing port's netdev attributes. */
+ rtnl_lock();
spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
@@ -588,6 +590,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
goto nla_put_failure_type_locked;
}
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
@@ -596,6 +599,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_failure_type_locked:
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -1144,7 +1148,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
pool_index, &cur, &max);
if (err && err != -EOPNOTSUPP)
- return err;
+ goto sb_occ_get_failure;
if (!err) {
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
goto nla_put_failure;
@@ -1157,8 +1161,10 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
return 0;
nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index af0130039f37..e8e8389ddc96 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -277,13 +277,17 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
rcu_read_lock();
list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ struct net_device *dev;
+
/*
* only add a note to our monitor buffer if:
* 1) this is the dev we received on
* 2) it's after the last_rx delta
* 3) our rx_dropped count has gone up
*/
- if ((new_stat->dev == napi->dev) &&
+ /* Paired with WRITE_ONCE() in dropmon_net_event() */
+ dev = READ_ONCE(new_stat->dev);
+ if ((dev == napi->dev) &&
(time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
(napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
trace_drop_common(NULL, NULL);
@@ -1497,7 +1501,10 @@ static int dropmon_net_event(struct notifier_block *ev_block,
mutex_lock(&net_dm_mutex);
list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
if (new_stat->dev == dev) {
- new_stat->dev = NULL;
+
+ /* Paired with READ_ONCE() in trace_napi_poll_hit() */
+ WRITE_ONCE(new_stat->dev, NULL);
+
if (trace_state == TRACE_OFF) {
list_del_rcu(&new_stat->list);
kfree_rcu(new_stat, rcu);
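
The drop_monitor.c fix pairs a WRITE_ONCE() in the notifier with a READ_ONCE() in the tracepoint, so the lockless RCU reader sees either the old pointer or NULL, never a torn or twice-loaded value. A C11 analogue of the pairing, with relaxed atomics playing the READ_ONCE/WRITE_ONCE role (struct names are illustrative):

#include <stdatomic.h>
#include <stddef.h>

struct dev_stub { int ifindex; };       /* stand-in for net_device */

struct stat_entry {
        _Atomic(struct dev_stub *) dev;
};

/* Reader: exactly one load, then compare; the writer may clear the
 * pointer concurrently without introducing a data race.
 */
static int entry_matches(struct stat_entry *e, const struct dev_stub *rx)
{
        struct dev_stub *dev =
                atomic_load_explicit(&e->dev, memory_order_relaxed);

        return dev == rx;
}

/* Writer: publish NULL before the entry is torn down. */
static void entry_unlink(struct stat_entry *e)
{
        atomic_store_explicit(&e->dev, NULL, memory_order_relaxed);
}
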
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index cd9bc67381b2..cbd1885f2459 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -589,7 +589,7 @@ store_link_ksettings_for_user(void __user *to,
{
struct ethtool_link_usettings link_usettings;
- memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+ memcpy(&link_usettings, from, sizeof(link_usettings));
bitmap_to_arr32(link_usettings.link_modes.supported,
from->link_modes.supported,
__ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -1508,7 +1508,7 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
if (eeprom.offset + eeprom.len > total_len)
return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER);
+ data = kzalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
@@ -1573,7 +1573,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER);
+ data = kzalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
@@ -1764,7 +1764,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
return -EFAULT;
test.len = test_len;
- data = kmalloc_array(test_len, sizeof(u64), GFP_USER);
+ data = kcalloc(test_len, sizeof(u64), GFP_USER);
if (!data)
return -ENOMEM;
@@ -2295,7 +2295,7 @@ static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
ret = ethtool_tunable_valid(&tuna);
if (ret)
return ret;
- data = kmalloc(tuna.len, GFP_USER);
+ data = kzalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
ret = ops->get_tunable(dev, &tuna, data);
@@ -2481,7 +2481,7 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
ret = ethtool_phy_tunable_valid(&tuna);
if (ret)
return ret;
- data = kmalloc(tuna.len, GFP_USER);
+ data = kzalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
mutex_lock(&phydev->lock);
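
All of the ethtool.c hunks are the same one-character hardening: buffers that are later copied to user space get allocated zeroed, so a driver that fills fewer bytes than requested cannot leak stale heap contents. The failure mode in user-space terms, with calloc() standing in for kzalloc():

#include <stdlib.h>
#include <string.h>

static unsigned char *build_report(size_t buf_len, size_t filled)
{
        unsigned char *data = calloc(1, buf_len); /* zeroed, like kzalloc */

        if (!data || filled > buf_len) {
                free(data);
                return NULL;
        }
        memset(data, 0xab, filled);     /* "driver" writes only a prefix */
        /* bytes [filled, buf_len) read back as 0, not old heap data */
        return data;
}
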
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bb11fc87bbae..83299a85480a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -300,7 +300,7 @@ jumped:
else
err = ops->action(rule, fl, flags, arg);
- if (!err && ops->suppress && ops->suppress(rule, arg))
+ if (!err && ops->suppress && ops->suppress(rule, flags, arg))
continue;
if (err != -EAGAIN) {
@@ -1138,7 +1138,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
{
struct net *net;
struct sk_buff *skb;
- int err = -ENOBUFS;
+ int err = -ENOMEM;
net = ops->fro_net;
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
diff --git a/net/core/filter.c b/net/core/filter.c
index c441f9961e91..eba96343c7af 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1475,7 +1475,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
- return ERR_PTR(-EFAULT);
+ return ERR_PTR(-EINVAL);
}
prog->len = fprog->len;
@@ -1668,7 +1668,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
return -EFAULT;
if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
@@ -1703,7 +1703,7 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
{
void *ptr;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
@@ -2516,6 +2516,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
if (unlikely(flags))
return -EINVAL;
+ if (unlikely(len == 0))
+ return 0;
+
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
@@ -2861,8 +2864,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
shinfo->gso_type |= SKB_GSO_TCPV6;
}
- /* Due to IPv6 header, MSS needs to be downgraded. */
- skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
@@ -2902,8 +2903,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
shinfo->gso_type |= SKB_GSO_TCPV4;
}
- /* Due to IPv4 header, MSS can be upgraded. */
- skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
@@ -3146,18 +3145,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
return 0;
}
-static u32 __bpf_skb_max_len(const struct sk_buff *skb)
-{
- return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- SKB_MAX_ALLOC;
-}
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
- u32 len_max = __bpf_skb_max_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
u32 off;
@@ -3237,7 +3232,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 min_len = __bpf_skb_min_len(skb);
int ret;
@@ -3313,7 +3308,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
@@ -3335,6 +3330,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
__skb_push(skb, head_room);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
}
return ret;
@@ -4255,12 +4251,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
switch (optname) {
case SO_RCVBUF:
val = min_t(u32, val, sysctl_rmem_max);
+ val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
WRITE_ONCE(sk->sk_rcvbuf,
max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_SNDBUF:
val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
WRITE_ONCE(sk->sk_sndbuf,
max_t(int, val * 2, SOCK_MIN_SNDBUF));
@@ -4270,7 +4268,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
+ sk->sk_max_pacing_rate = (val == ~0U) ?
+ ~0UL : (unsigned int)val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
@@ -4879,6 +4878,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
{
struct net *net = dev_net(skb->dev);
int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
if (plen < sizeof(*params))
return -EINVAL;
@@ -4886,22 +4886,28 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
return -EINVAL;
+ if (params->tot_len)
+ check_mtu = true;
+
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
break;
#endif
}
- if (!rc) {
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
struct net_device *dev;
+ /* When tot_len isn't provided by user, check skb
+ * against MTU of FIB lookup resulting net_device
+ */
dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
@@ -5307,10 +5313,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
ifindex, proto, netns_id, flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -5344,10 +5361,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -5818,24 +5846,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
if (!th->ack || th->rst || th->syn)
return -ENOENT;
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
if (tcp_synq_no_recent_overflow(sk))
return -ENOENT;
cookie = ntohl(th->ack_seq) - 1;
- switch (sk->sk_family) {
- case AF_INET:
- if (unlikely(iph_len < sizeof(struct iphdr)))
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
return -EINVAL;
ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
break;
#if IS_BUILTIN(CONFIG_IPV6)
- case AF_INET6:
+ case 6:
if (unlikely(iph_len < sizeof(struct ipv6hdr)))
return -EINVAL;
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
break;
#endif /* CONFIG_IPV6 */
@@ -6702,6 +6739,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
+ int field_size;
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
@@ -6713,7 +6751,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case offsetof(struct bpf_sock, family):
case offsetof(struct bpf_sock, type):
case offsetof(struct bpf_sock, protocol):
- case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
@@ -6721,6 +6758,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
}
return size == size_default;
@@ -6909,9 +6954,9 @@ void bpf_warn_invalid_xdp_action(u32 act)
{
const u32 act_max = XDP_REDIRECT;
- WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
- act > act_max ? "Illegal" : "Driver unsupported",
- act);
+ pr_warn_once("%s XDP return value %u, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported",
+ act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
@@ -8355,6 +8400,27 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
si->src_reg, off);
break;
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
+ off = si->off;
+ off -= offsetof(struct __sk_buff, cb[0]);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct sk_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ break;
+
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
target_size);
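
Of note among the filter.c changes, bpf_tcp_check_syncookie() now dispatches on the packet's IP version nibble instead of sk->sk_family, so an AF_INET6 socket handling v4-mapped traffic validates against the right header. Both iphdr and ipv6hdr carry the version in the top four bits of their first byte; a sketch of the extraction:

#include <stdint.h>

/* Returns 4 or 6 for a well-formed IP header: the first byte of an
 * IPv4 header is e.g. 0x45, of an IPv6 header 0x60..0x6f.
 */
static int ip_header_version(const void *iph)
{
        return *(const uint8_t *)iph >> 4;
}

The helper then switches on this value, additionally rejecting a version-4 header on a socket that is IPv6-only, as the hunk shows.
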
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e3bdd859c895..4dac27c98623 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -811,8 +811,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_addrs = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -1023,11 +1025,16 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV4_ADDRS,
target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -1044,9 +1051,6 @@ proto_again:
}
}
- __skb_flow_dissect_ipv4(skb, flow_dissector,
- target_container, data, iph);
-
break;
}
case htons(ETH_P_IPV6): {
@@ -1068,8 +1072,10 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &iph->saddr,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -1143,6 +1149,7 @@ proto_again:
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
key_vlan->vlan_tpid = saved_vlan_tpid;
+ key_vlan->vlan_eth_type = proto;
}
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
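
The flow_dissector.c hunks split single memcpy() calls that spanned two adjacent struct members into per-member copies: reading sizeof(pair) bytes starting at the address of the first member relies on layout and trips the bounds checking of fortified memcpy(). The safe shape, sketched:

#include <stdint.h>
#include <string.h>

struct addr_pair {
        uint32_t src;
        uint32_t dst;
};

static void copy_pair(struct addr_pair *out,
                      const uint32_t *saddr, const uint32_t *daddr)
{
        /* each copy stays within the bounds of a single member */
        memcpy(&out->src, saddr, sizeof(out->src));
        memcpy(&out->dst, daddr, sizeof(out->dst));
}
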
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index bfe7bdd4c340..98c396769be9 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t)
u64 rate, brate;
est_fetch_counters(est, &b);
- brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
- brate -= (est->avbps >> est->ewma_log);
+ brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
- rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
- rate -= (est->avpps >> est->ewma_log);
+ rate = (u64)(b.packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
write_seqcount_begin(&est->seq);
est->avbps += brate;
@@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
est = kzalloc(sizeof(*est), GFP_KERNEL);
if (!est)
return -ENOBUFS;
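
The gen_estimator.c fix reorders the fixed-point EWMA so the interval scaling and the smoothing shift are applied separately; the old combined shift of (10 - ewma_log - intvl_log) misbehaved for large ewma_log, which is also why gen_new_estimator() now rejects ewma_log of 0 or >= 31. The corrected update step as a standalone function (delta is the counter increase over the sampling interval; the kernel derives intvl_log, here assumed <= 10, from the configured period):

#include <stdint.h>

static uint64_t ewma_update(uint64_t avg, uint64_t delta,
                            unsigned int intvl_log, unsigned int ewma_log)
{
        uint64_t rate = delta << (10 - intvl_log);      /* scale first */

        /* then smooth: new sample weighted by 1/2^ewma_log */
        avg += (rate >> ewma_log) - (avg >> ewma_log);
        return avg;
}
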
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index f153e0601838..35b0e39030da 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -150,7 +150,7 @@ static void linkwatch_do_dev(struct net_device *dev)
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
- if (dev->flags & IFF_UP && netif_device_present(dev)) {
+ if (dev->flags & IFF_UP) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
@@ -196,7 +196,8 @@ static void __linkwatch_run_queue(int urgent_only)
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
- if (urgent_only && !linkwatch_urgent_event(dev)) {
+ if (!netif_device_present(dev) ||
+ (urgent_only && !linkwatch_urgent_event(dev))) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 99a6de52b21d..bf270b6a99b4 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
{
int ret;
- /* Preempt disable is needed to protect per-cpu redirect_info between
- * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
- * access to maps strictly require a rcu_read_lock() for protection,
- * mixing with BH RCU lock doesn't work.
+ /* Preempt disable and BH disable are needed to protect per-cpu
+ * redirect_info between BPF prog and skb_do_redirect().
*/
preempt_disable();
+ local_bh_disable();
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -78,6 +77,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
+ local_bh_enable();
preempt_enable();
return ret;
@@ -158,10 +158,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
return dst->lwtstate->orig_output(net, sk, skb);
}
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
- int hh_len = skb_dst(skb)->dev->hard_header_len;
-
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
@@ -273,6 +271,7 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
+ int hh_len = dst->dev->hard_header_len;
__be16 proto = skb->protocol;
int ret;
@@ -290,7 +289,7 @@ static int bpf_xmit(struct sk_buff *skb)
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
- ret = xmit_check_hhlen(skb);
+ ret = xmit_check_hhlen(skb, hh_len);
if (unlikely(ret))
return ret;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 2f9c0de533c7..0b64f015b3b0 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -190,6 +190,10 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla_entype) {
+ if (nla_len(nla_entype) < sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE");
+ return -EINVAL;
+ }
encap_type = nla_get_u16(nla_entype);
if (lwtunnel_valid_encap_type(encap_type,
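
The lwtunnel.c check enforces the general netlink rule: validate an attribute's payload length before reading a fixed-width value out of it. The same rule for a generic TLV, sketched:

#include <stdint.h>
#include <string.h>

struct tlv {
        uint16_t len;           /* payload length in bytes */
        uint16_t type;
        const void *payload;
};

static int tlv_get_u16(const struct tlv *a, uint16_t *out)
{
        if (a->len < sizeof(*out))
                return -1;      /* malformed attribute: would over-read */
        memcpy(out, a->payload, sizeof(*out));
        return 0;
}
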
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 04953e5f2530..8b6140e67e7f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -132,6 +132,9 @@ static void neigh_update_gc_list(struct neighbour *n)
write_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
/* remove from the gc list if new state is permanent or if neighbor
* is externally learned; otherwise entry should be on the gc list
*/
@@ -148,6 +151,7 @@ static void neigh_update_gc_list(struct neighbour *n)
atomic_inc(&n->tbl->gc_entries);
}
+out:
write_unlock(&n->lock);
write_unlock_bh(&n->tbl->lock);
}
@@ -235,6 +239,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock(&n->lock);
if ((n->nud_state == NUD_FAILED) ||
+ (n->nud_state == NUD_NOARP) ||
+ (tbl->is_multicast &&
+ tbl->is_multicast(n->primary_key)) ||
time_after(tref, n->updated))
remove = true;
write_unlock(&n->lock);
@@ -373,7 +380,7 @@ EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev,
- bool exempt_from_gc)
+ u8 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -406,6 +413,7 @@ do_alloc:
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ n->flags = flags;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
timer_setup(&n->timer, neigh_timer_handler, 0);
@@ -569,19 +577,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
}
EXPORT_SYMBOL(neigh_lookup_nodev);
-static struct neighbour *___neigh_create(struct neigh_table *tbl,
- const void *pkey,
- struct net_device *dev,
- bool exempt_from_gc, bool want_ref)
+static struct neighbour *
+___neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, u8 flags,
+ bool exempt_from_gc, bool want_ref)
{
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);
- u32 hash_val;
- unsigned int key_len = tbl->key_len;
- int error;
+ u32 hash_val, key_len = tbl->key_len;
+ struct neighbour *n1, *rc, *n;
struct neigh_hash_table *nht;
+ int error;
+ n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
-
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
@@ -668,7 +675,7 @@ out_neigh_release:
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
- return ___neigh_create(tbl, pkey, dev, false, want_ref);
+ return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
}
EXPORT_SYMBOL(__neigh_create);
@@ -727,11 +734,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
ASSERT_RTNL();
- n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
if (!n)
goto out;
- n->protocol = 0;
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
@@ -1214,7 +1220,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
lladdr instead of overriding it
if it is different.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
-
+ NEIGH_UPDATE_F_USE means that the entry is user triggered.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
@@ -1242,15 +1248,22 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
old = neigh->nud_state;
err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
- (old & (NUD_NOARP | NUD_PERMANENT)))
- goto out;
if (neigh->dead) {
NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
+ new = old;
goto out;
}
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify);
+ if (flags & NEIGH_UPDATE_F_USE) {
+ new = old & ~NUD_PERMANENT;
+ neigh->nud_state = new;
+ err = 0;
+ goto out;
+ }
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
@@ -1376,7 +1389,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
* we can reinject the packet there.
*/
n2 = NULL;
- if (dst) {
+ if (dst && dst->obsolete != DST_OBSOLETE_DEAD) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
@@ -1937,7 +1950,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
ndm->ndm_flags & NTF_EXT_LEARNED;
- neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);
+ neigh = ___neigh_create(tbl, dst, dev,
+ ndm->ndm_flags & NTF_EXT_LEARNED,
+ exempt_from_gc, true);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
@@ -1956,22 +1971,20 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
if (protocol)
neigh->protocol = protocol;
-
if (ndm->ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
-
if (ndm->ndm_flags & NTF_ROUTER)
flags |= NEIGH_UPDATE_F_ISROUTER;
+ if (ndm->ndm_flags & NTF_USE)
+ flags |= NEIGH_UPDATE_F_USE;
- if (ndm->ndm_flags & NTF_USE) {
+ err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid, extack);
+ if (!err && ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
- } else
- err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
- NETLINK_CB(skb).portid, extack);
-
+ }
neigh_release(neigh);
-
out:
return err;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 36347933ec3a..61f5570645e3 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -182,12 +182,23 @@ static const struct seq_operations softnet_seq_ops = {
.show = softnet_seq_show,
};
-static void *ptype_get_idx(loff_t pos)
+static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
{
+ struct list_head *ptype_list = NULL;
struct packet_type *pt = NULL;
+ struct net_device *dev;
loff_t i = 0;
int t;
+ for_each_netdev_rcu(seq_file_net(seq), dev) {
+ ptype_list = &dev->ptype_all;
+ list_for_each_entry_rcu(pt, ptype_list, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+
list_for_each_entry_rcu(pt, &ptype_all, list) {
if (i == pos)
return pt;
@@ -208,22 +219,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
- return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net_device *dev;
struct packet_type *pt;
struct list_head *nxt;
int hash;
++*pos;
if (v == SEQ_START_TOKEN)
- return ptype_get_idx(0);
+ return ptype_get_idx(seq, 0);
pt = v;
nxt = pt->list.next;
+ if (pt->dev) {
+ if (nxt != &pt->dev->ptype_all)
+ goto found;
+
+ dev = pt->dev;
+ for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
+ if (!list_empty(&dev->ptype_all)) {
+ nxt = dev->ptype_all.next;
+ goto found;
+ }
+ }
+
+ nxt = ptype_all.next;
+ goto ptype_all;
+ }
+
if (pt->type == htons(ETH_P_ALL)) {
+ptype_all:
if (nxt != &ptype_all)
goto found;
hash = 0;
@@ -252,7 +281,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Type Device Function\n");
- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
if (pt->type == htons(ETH_P_ALL))
seq_puts(seq, "ALL ");
else
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2ebf9b252779..ad45f13a0370 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -174,6 +174,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier)
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
+ struct net_device *netdev = to_net_dev(dev);
+
+ /* The check is also done in change_carrier; this helps returning early
+ * without hitting the trylock/restart in netdev_store.
+ */
+ if (!netdev->netdev_ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+
return netdev_store(dev, attr, buf, len, change_carrier);
}
@@ -195,10 +203,16 @@ static ssize_t speed_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
if (!rtnl_trylock())
return restart_syscall();
- if (netif_running(netdev)) {
+ if (netif_running(netdev) && netif_device_present(netdev)) {
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
@@ -215,6 +229,12 @@ static ssize_t duplex_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -438,6 +458,14 @@ static ssize_t proto_down_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
+ struct net_device *netdev = to_net_dev(dev);
+
+ /* The check is also done in change_proto_down; this helps returning
+ * early without hitting the trylock/restart in netdev_store.
+ */
+ if (!netdev->netdev_ops->ndo_change_proto_down)
+ return -EOPNOTSUPP;
+
return netdev_store(dev, attr, buf, len, change_proto_down);
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
@@ -448,6 +476,12 @@ static ssize_t phys_port_id_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The check is also done in dev_get_phys_port_id; this helps returning
+ * early without hitting the trylock/restart below.
+ */
+ if (!netdev->netdev_ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -470,6 +504,13 @@ static ssize_t phys_port_name_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The checks are also done in dev_get_phys_port_name; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->netdev_ops->ndo_get_phys_port_name &&
+ !netdev->netdev_ops->ndo_get_devlink_port)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -492,6 +533,14 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The checks are also done in dev_get_port_parent_id; this helps
+ * returning early without hitting the trylock/restart below. This works
+ * because recurse is false when calling dev_get_port_parent_id.
+ */
+ if (!netdev->netdev_ops->ndo_get_port_parent_id &&
+ !netdev->netdev_ops->ndo_get_devlink_port)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -1097,6 +1146,12 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ /* The check is also done later; this helps returning early without
+ * hitting the trylock/restart below.
+ */
+ if (!dev->netdev_ops->ndo_set_tx_maxrate)
+ return -EOPNOTSUPP;
+
err = kstrtou32(buf, 10, &rate);
if (err < 0)
return err;
@@ -1235,8 +1290,8 @@ static const struct attribute_group dql_group = {
static ssize_t xps_cpus_show(struct netdev_queue *queue,
char *buf)
{
+ int cpu, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
- int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
@@ -1246,22 +1301,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
index = get_netdev_queue_index(queue);
+ if (!rtnl_trylock())
+ return restart_syscall();
+
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
- if (num_tc < 0)
- return -EINVAL;
+ if (num_tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
}
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
@@ -1284,9 +1348,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
}
rcu_read_unlock();
+ rtnl_unlock();
+
len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
}
static ssize_t xps_cpus_store(struct netdev_queue *queue,
@@ -1314,7 +1384,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
+ if (!rtnl_trylock()) {
+ free_cpumask_var(mask);
+ return restart_syscall();
+ }
+
err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
free_cpumask_var(mask);
@@ -1326,22 +1402,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
+ int j, len, ret, num_tc = 1, tc = 0;
struct net_device *dev = queue->dev;
struct xps_dev_maps *dev_maps;
unsigned long *mask, index;
- int j, len, num_tc = 1, tc = 0;
index = get_netdev_queue_index(queue);
+ if (!rtnl_trylock())
+ return restart_syscall();
+
if (dev->num_tc) {
num_tc = dev->num_tc;
tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
+ if (tc < 0) {
+ ret = -EINVAL;
+ goto err_rtnl_unlock;
+ }
}
mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
- if (!mask)
- return -ENOMEM;
+ if (!mask) {
+ ret = -ENOMEM;
+ goto err_rtnl_unlock;
+ }
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_rxqs_map);
@@ -1367,10 +1450,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
out_no_maps:
rcu_read_unlock();
+ rtnl_unlock();
+
len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
bitmap_free(mask);
return len < PAGE_SIZE ? len : -EINVAL;
+
+err_rtnl_unlock:
+ rtnl_unlock();
+ return ret;
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1396,10 +1485,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
+ if (!rtnl_trylock()) {
+ bitmap_free(mask);
+ return restart_syscall();
+ }
+
cpus_read_lock();
err = __netif_set_xps_queue(dev, mask, index, true);
cpus_read_unlock();
+ rtnl_unlock();
+
bitmap_free(mask);
return err ? : len;
}
@@ -1565,6 +1661,9 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 39402840025e..62a972f04cef 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -168,8 +168,10 @@ static void ops_exit_list(const struct pernet_operations *ops,
{
struct net *net;
if (ops->exit) {
- list_for_each_entry(net, net_exit_list, exit_list)
+ list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
+ cond_resched();
+ }
}
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
@@ -211,9 +213,9 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
- * is set to true, thus the caller knows that the new id must be notified via
- * rtnl.
+/* Must be called from RCU-critical section or with nsid_lock held. If
+ * a new id is assigned, the bool alloc is set to true, thus the
+ * caller knows that the new id must be notified via rtnl.
*/
static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
@@ -237,7 +239,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
return NETNSA_NSID_NOT_ASSIGNED;
}
-/* should be called with nsid_lock held */
+/* Must be called from RCU-critical section or with nsid_lock held */
static int __peernet2id(struct net *net, struct net *peer)
{
bool no = false;
@@ -281,9 +283,10 @@ int peernet2id(struct net *net, struct net *peer)
{
int id;
- spin_lock_bh(&net->nsid_lock);
+ rcu_read_lock();
id = __peernet2id(net, peer);
- spin_unlock_bh(&net->nsid_lock);
+ rcu_read_unlock();
+
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -479,7 +482,9 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
+#endif
put_user_ns(user_ns);
net_drop_ns(net);
dec_ucounts:
@@ -611,7 +616,9 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
dec_net_namespaces(net->ucounts);
+#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
+#endif
put_user_ns(net->user_ns);
net_drop_ns(net);
}
@@ -643,6 +650,18 @@ void __put_net(struct net *net)
}
EXPORT_SYMBOL_GPL(__put_net);
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Returns the net's common namespace.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ return &get_net(container_of(ns, struct net, ns))->ns;
+}
+EXPORT_SYMBOL_GPL(get_net_ns);
+
struct net *get_net_ns_by_fd(int fd)
{
struct file *file;
@@ -950,6 +969,7 @@ struct rtnl_net_dump_cb {
int s_idx;
};
+/* Runs in RCU-critical section. */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
@@ -1034,19 +1054,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
goto end;
}
- spin_lock_bh(&net_cb.tgt_net->nsid_lock);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
- !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
- err = -EAGAIN;
- goto end;
- }
+ rcu_read_lock();
idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net))
- spin_unlock_bh(&net_cb.ref_net->nsid_lock);
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
+ rcu_read_unlock();
cb->args[0] = net_cb.idx;
end:
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index cb67d36f3adb..78bbb912e502 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -29,6 +29,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/if_vlan.h>
+#include <net/dsa.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/addrconf.h>
@@ -637,15 +638,15 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
- struct net_device *ndev = NULL;
+ struct net_device *ndev = NULL, *dev = NULL;
+ struct net *net = current->nsproxy->net_ns;
struct in_device *in_dev;
int err;
rtnl_lock();
- if (np->dev_name[0]) {
- struct net *net = current->nsproxy->net_ns;
+ if (np->dev_name[0])
ndev = __dev_get_by_name(net, np->dev_name);
- }
+
if (!ndev) {
np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
err = -ENODEV;
@@ -653,6 +654,19 @@ int netpoll_setup(struct netpoll *np)
}
dev_hold(ndev);
+ /* bring DSA management network devices up first */
+ for_each_netdev(net, dev) {
+ if (!netdev_uses_dsa(dev))
+ continue;
+
+ err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
+ if (err < 0) {
+ np_err(np, "%s failed to open %s\n",
+ np->dev_name, dev->name);
+ goto put;
+ }
+ }
+
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
err = -EBUSY;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dfc2501c35d9..335f68eaaa05 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -157,7 +157,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
return NULL;
}
- page->dma_addr = dma;
+ page_pool_set_dma_addr(page, dma);
skip_dma_map:
/* Track how many pages are held 'in-flight' */
@@ -216,12 +216,12 @@ static void __page_pool_clean_page(struct page_pool *pool,
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
goto skip_dma_unmap;
- dma = page->dma_addr;
+ dma = page_pool_get_dma_addr(page);
/* DMA unmap */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
- page->dma_addr = 0;
+ page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
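
The page_pool.c hunks replace raw page->dma_addr accesses with accessor helpers. The motivation is 32-bit systems with a 64-bit dma_addr_t, where the address no longer fits in a single unsigned long slot of struct page; approximately what the upstream helpers look like (kernel context, not standalone, shown for illustration):

static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
{
        dma_addr_t ret = page->dma_addr[0];

        if (sizeof(dma_addr_t) > sizeof(unsigned long))
                ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
        return ret;
}

static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
{
        page->dma_addr[0] = addr;
        if (sizeof(dma_addr_t) > sizeof(unsigned long))
                page->dma_addr[1] = upper_32_bits(addr);
}

Going through the accessors lets the storage layout change without touching any caller, which is exactly what the two hunks above rely on.
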
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index cb3b565ff5ad..1d20dd70879b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3465,7 +3465,7 @@ static int pktgen_thread_worker(void *arg)
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- BUG_ON(smp_processor_id() != cpu);
+ WARN_ON(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b0c06a063776..dbc9b2f53649 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2414,6 +2414,7 @@ static int do_setlink(const struct sk_buff *skb,
return err;
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
+ const char *pat = ifname && ifname[0] ? ifname : NULL;
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
if (IS_ERR(net)) {
@@ -2421,7 +2422,7 @@ static int do_setlink(const struct sk_buff *skb,
goto errout;
}
- err = dev_change_net_namespace(dev, net, ifname);
+ err = dev_change_net_namespace(dev, net, pat);
put_net(net);
if (err)
goto errout;
@@ -2471,7 +2472,7 @@ static int do_setlink(const struct sk_buff *skb,
sa->sa_family = dev->type;
memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
dev->addr_len);
- err = dev_set_mac_address(dev, sa, extack);
+ err = dev_set_mac_address_user(dev, sa, extack);
kfree(sa);
if (err)
goto errout;
@@ -3021,8 +3022,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
unsigned char name_assign_type = NET_NAME_USER;
struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
- const struct rtnl_link_ops *m_ops = NULL;
- struct net_device *master_dev = NULL;
+ const struct rtnl_link_ops *m_ops;
+ struct net_device *master_dev;
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
struct nlattr *tb[IFLA_MAX + 1];
@@ -3062,6 +3063,8 @@ replay:
dev = NULL;
}
+ master_dev = NULL;
+ m_ops = NULL;
if (dev) {
master_dev = netdev_master_upper_dev_get(dev);
if (master_dev)
@@ -3728,7 +3731,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -3839,7 +3842,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -4065,13 +4068,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
br_dev = netdev_master_upper_dev_get(dev);
cops = br_dev->netdev_ops;
}
} else {
if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ !netif_is_bridge_port(dev))
continue;
if (br_dev != netdev_master_upper_dev_get(dev) &&
@@ -4083,7 +4086,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
if (cops && cops->ndo_fdb_dump) {
err = cops->ndo_fdb_dump(skb, cb,
br_dev, dev,
@@ -4233,7 +4236,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (dev) {
if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
- if (!(dev->priv_flags & IFF_BRIDGE_PORT)) {
+ if (!netif_is_bridge_port(dev)) {
NL_SET_ERR_MSG(extack, "Device is not a bridge port");
return -EINVAL;
}
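The conversions throughout this file replace an open-coded priv_flags test with the netif_is_bridge_port() helper, which reads approximately as follows (from include/linux/netdevice.h):

static inline bool netif_is_bridge_port(const struct net_device *dev)
{
	return dev->priv_flags & IFF_BRIDGE_PORT;
}

Behavior is unchanged; the helper just keeps the flag check in one place.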
@@ -4535,6 +4538,10 @@ static int rtnl_bridge_notify(struct net_device *dev)
if (err < 0)
goto errout;
+ /* Notification info is only filled for bridge ports, not the bridge
+ * device itself. Therefore, a zero notification length is valid and
+ * should not result in an error.
+ */
if (!skb->len)
goto errout;
@@ -4945,7 +4952,7 @@ nla_put_failure:
static size_t if_nlmsg_stats_size(const struct net_device *dev,
u32 filter_mask)
{
- size_t size = 0;
+ size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7b6b1d2c3d10..a1867c65ac63 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -23,6 +23,8 @@
static siphash_key_t net_secret __read_mostly;
static siphash_key_t ts_secret __read_mostly;
+#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
+
static __always_inline void net_secret_init(void)
{
net_get_random_once(&net_secret, sizeof(net_secret));
@@ -95,17 +97,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
}
EXPORT_SYMBOL(secure_tcpv6_seq);
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
const struct {
struct in6_addr saddr;
struct in6_addr daddr;
+ unsigned int timeseed;
__be16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.daddr = *(struct in6_addr *)daddr,
- .dport = dport
+ .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ .dport = dport,
};
net_secret_init();
return siphash(&combined, offsetofend(typeof(combined), dport),
@@ -143,11 +147,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
}
EXPORT_SYMBOL_GPL(secure_tcp_seq);
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
net_secret_init();
- return siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u16)dport, &net_secret);
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport,
+ jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
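Folding jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD into the hash means the ephemeral-port sequence for a given destination reshuffles every 10 seconds while staying stable inside each bucket. A stand-alone toy of the bucketing idea; the FNV-style mix below stands in for siphash_4u32() and is not the real primitive:

#include <stdint.h>
#include <stdio.h>

#define HZ 100
#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)

static uint64_t toy_port_hash(uint32_t saddr, uint32_t daddr,
			      uint16_t dport, uint64_t jiffies)
{
	uint32_t timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD;
	uint64_t h = 0xcbf29ce484222325ULL;

	h = (h ^ saddr) * 0x100000001b3ULL;
	h = (h ^ daddr) * 0x100000001b3ULL;
	h = (h ^ dport) * 0x100000001b3ULL;
	h = (h ^ timeseed) * 0x100000001b3ULL;	/* constant within a bucket */
	return h;
}

int main(void)
{
	/* Same bucket -> same hash; next bucket -> fresh shuffle. */
	printf("%d\n", toy_port_hash(1, 2, 443, 0) ==
		       toy_port_hash(1, 2, 443, 999));	/* prints 1 */
	printf("%d\n", toy_port_hash(1, 2, 443, 0) ==
		       toy_port_hash(1, 2, 443, 1000));	/* prints 0 */
	return 0;
}

The widening of the return type to u64, presumably, gives callers more hash bits to split between the initial port offset and the probe increment.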
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 08d9915d50c0..5bdb3cd20d61 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -431,7 +431,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
len += NET_SKB_PAD;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -496,13 +500,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
void *data;
len += NET_SKB_PAD + NET_IP_ALIGN;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -510,6 +518,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
goto skb_success;
}
+ nc = this_cpu_ptr(&napi_alloc_cache);
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
len = SKB_DATA_ALIGN(len);
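Both allocators now bypass the per-CPU page-fragment cache for small requests as well as oversized ones, so a tiny skb comes from a kmalloc slab instead of pinning a much larger shared page fragment beyond what its truesize reflects; note also that the this_cpu_ptr() lookup is deferred until the fragment path is actually taken. The decision, restated as a hedged helper (name invented, thresholds copied from the hunks):

static bool wants_kmalloc_head(unsigned int len, gfp_t gfp_mask)
{
	return len <= SKB_WITH_OVERHEAD(1024) ||
	       len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	       (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA));
}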
@@ -759,7 +768,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
if (dev)
- printk("%sdev name=%s feat=0x%pNF\n",
+ printk("%sdev name=%s feat=%pNF\n",
level, dev->name, &dev->features);
if (sk)
printk("%ssk family=%hu type=%u proto=%u\n",
@@ -2017,6 +2026,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
skb->csum = csum_block_sub(skb->csum,
skb_checksum(skb, len, delta, 0),
len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
}
return __pskb_trim(skb, len);
}
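For CHECKSUM_PARTIAL skbs the device writes a 16-bit checksum at csum_start + csum_offset; if trimming leaves that slot beyond the retained header bytes, the write would land out of bounds, so the trim is refused. A hedged restatement of the bound check (helper name invented, numbers for illustration):

static bool csum_slot_in_header(unsigned int hdlen,
				unsigned int csum_start,
				unsigned int csum_offset)
{
	/* e.g. slot at 34 + 16 = bytes 50..51: trimming to hdlen 48
	 * gives 50 + 2 > 48, so the caller returns -EINVAL.
	 */
	return csum_start + csum_offset + sizeof(__sum16) <= hdlen;
}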
@@ -2124,7 +2139,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -2906,8 +2921,11 @@ skb_zerocopy_headlen(const struct sk_buff *from)
if (!from->head_frag ||
skb_headlen(from) < L1_CACHE_BYTES ||
- skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
if (skb_has_frag_list(from))
hlen = from->len;
@@ -3275,7 +3293,19 @@ EXPORT_SYMBOL(skb_split);
*/
static int skb_prepare_for_shift(struct sk_buff *skb)
{
- return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ int ret = 0;
+
+ if (skb_cloned(skb)) {
+ /* Save and restore truesize: pskb_expand_head() may reallocate
+ * memory where two kmalloc(S) calls can return buffers with
+ * different ksize(), but we cannot change truesize at this point.
+ */
+ unsigned int save_truesize = skb->truesize;
+
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ skb->truesize = save_truesize;
+ }
+ return ret;
}
/**
@@ -4452,7 +4482,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
if (icmp_next)
- sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
}
spin_unlock_irqrestore(&q->lock, flags);
@@ -5515,7 +5545,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
lse->label_stack_entry = mpls_lse;
skb_postpush_rcsum(skb, lse, MPLS_HLEN);
- if (ethernet)
+ if (ethernet && mac_len >= ETH_HLEN)
skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
skb->protocol = mpls_proto;
@@ -5555,7 +5585,7 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
- if (ethernet) {
+ if (ethernet && mac_len >= ETH_HLEN) {
struct ethhdr *hdr;
/* use mpls_hdr() to get ethertype to account for VLANs. */
@@ -5618,6 +5648,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb)
if (unlikely(!eth_p_mpls(skb->protocol)))
return -EINVAL;
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
if (!--ttl)
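mpls_hdr() dereferences the linear area directly, but after GRO or forwarding the label stack entry can sit in a page fragment; pskb_may_pull() linearizes just enough bytes first. The general pattern, condensed from the hunk above:

	/* Make the 4-byte LSE readable from the linear area before
	 * touching it; fail softly if memory is unavailable.
	 */
	if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
		return -ENOMEM;
	lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);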
@@ -5813,7 +5846,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb,
/* Free pulled out fragments. */
while ((list = shinfo->frag_list) != insp) {
shinfo->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 118cf1ace43a..a606ad8e8be2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -27,6 +27,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
int elem_first_coalesce)
{
struct page_frag *pfrag = sk_page_frag(sk);
+ u32 osize = msg->sg.size;
int ret = 0;
len -= msg->sg.size;
@@ -35,13 +36,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
u32 orig_offset;
int use, i;
- if (!sk_page_frag_refill(sk, pfrag))
- return -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
orig_offset = pfrag->offset;
use = min_t(int, len, pfrag->size - orig_offset);
- if (!sk_wmem_schedule(sk, use))
- return -ENOMEM;
+ if (!sk_wmem_schedule(sk, use)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
i = msg->sg.end;
sk_msg_iter_var_prev(i);
@@ -71,6 +76,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
}
return ret;
+
+msg_trim:
+ sk_msg_trim(sk, msg, osize);
+ return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_alloc);
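The new msg_trim label makes sk_msg_alloc() transactional: on a mid-loop failure the message is trimmed back to its size on entry, so no partially charged scatterlist elements survive into the caller's error handling. The rollback shape, condensed:

	u32 osize = msg->sg.size;		/* snapshot on entry */

	if (!sk_page_frag_refill(sk, pfrag)) {
		sk_msg_trim(sk, msg, osize);	/* undo partial growth */
		return -ENOMEM;
	}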
@@ -170,10 +179,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
struct scatterlist *sge = sk_msg_elem(msg, i);
u32 len = sge->length;
- if (charge)
- sk_mem_uncharge(sk, len);
- if (!msg->skb)
+ /* When the skb owns the memory we free it from consume_skb path. */
+ if (!msg->skb) {
+ if (charge)
+ sk_mem_uncharge(sk, len);
put_page(sg_page(sge));
+ }
memset(sge, 0, sizeof(*sge));
return len;
}
@@ -397,28 +408,38 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+ struct sk_buff *skb)
{
- struct sock *sk = psock->sk;
- int copied = 0, num_sge;
struct sk_msg *msg;
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return NULL;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return NULL;
+
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
if (unlikely(!msg))
- return -EAGAIN;
- if (!sk_rmem_schedule(sk, skb, skb->len)) {
- kfree(msg);
- return -EAGAIN;
- }
+ return NULL;
sk_msg_init(msg);
- num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+ return msg;
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+ struct sk_psock *psock,
+ struct sock *sk,
+ struct sk_msg *msg)
+{
+ int num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+ int copied;
+
if (unlikely(num_sge < 0)) {
kfree(msg);
return num_sge;
}
- sk_mem_charge(sk, skb->len);
copied = skb->len;
msg->sg.start = 0;
msg->sg.size = copied;
@@ -430,6 +451,40 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
return copied;
}
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sk;
+ struct sk_msg *msg;
+
+ msg = sk_psock_create_ingress_msg(sk, skb);
+ if (!msg)
+ return -EAGAIN;
+
+ /* This transitions ownership of the data from the socket where the
+ * BPF program that initiated the redirect was run to the socket we
+ * will eventually receive this data on. The data is released by
+ * consume_skb() in __tcp_bpf_recvmsg() after it has been copied into
+ * user buffers.
+ */
+ skb_set_owner_r(skb, sk);
+ return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or call
+ * skb_set_owner_r() because the skb is already accounted to this socket.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ struct sock *sk = psock->sk;
+
+ if (unlikely(!msg))
+ return -EAGAIN;
+ sk_msg_init(msg);
+ return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+}
+
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
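For redirected skbs, ownership must move to the receiving socket so that memory accounting and the eventual consume_skb() uncharge line up. skb_set_owner_r() does approximately the following (sketch of the core helper; treat details as assumptions):

static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);		/* detach from any previous owner */
	skb->sk = sk;
	skb->destructor = sock_rfree;	/* uncharges rmem on free */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
}

The _self variant skips both the limit checks and this call because the skb already belongs to psock->sk.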
@@ -753,7 +808,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
+ struct tcp_skb_cb *tcp;
struct sock *sk_other;
+ int err = -EIO;
switch (verdict) {
case __SK_PASS:
@@ -762,16 +819,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
goto out_free;
}
- if (atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf) {
- struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
+ tcp = TCP_SKB_CB(skb);
+ tcp->bpf.flags |= BPF_F_INGRESS;
+
+ /* If the queue is empty we can submit directly into the
+ * msg queue. If it is not empty we have to queue work,
+ * otherwise we may get out-of-order data. Any error from
+ * sk_psock_skb_ingress_self() is handled by queueing the
+ * skb and retrying later from the workqueue.
+ */
+ if (skb_queue_empty(&psock->ingress_skb)) {
+ err = sk_psock_skb_ingress_self(psock, skb);
+ }
+ if (err < 0) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
- break;
}
- goto out_free;
+ break;
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
break;
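The direct-submit fast path in __SK_PASS is safe only when nothing is queued; otherwise this skb would overtake already-queued ones and the application would see out-of-order bytes. In outline:

	err = -EIO;
	if (skb_queue_empty(&psock->ingress_skb))
		err = sk_psock_skb_ingress_self(psock, skb);
	if (err < 0) {
		/* Either the queue was non-empty or ingress failed:
		 * preserve ordering and retry from the workqueue.
		 */
		skb_queue_tail(&psock->ingress_skb, skb);
		schedule_work(&psock->work);
	}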
diff --git a/net/core/sock.c b/net/core/sock.c
index 919f1a1739e9..c84f68bff7f5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -923,13 +923,10 @@ set_rcvbuf:
} else {
sock_reset_flag(sk, SOCK_RCVTSTAMP);
sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
}
break;
case SO_TIMESTAMPING_NEW:
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- /* fall through */
case SO_TIMESTAMPING_OLD:
if (val & ~SOF_TIMESTAMPING_MASK) {
ret = -EINVAL;
@@ -958,16 +955,14 @@ set_rcvbuf:
}
sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
- else {
- if (optname == SO_TIMESTAMPING_NEW)
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
+ else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- }
break;
case SO_RCVLOWAT:
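SOCK_TSTAMP_NEW now simply tracks which sockopt flavor was used, via the set/reset helper. sock_valbool_flag() is, approximately (from include/net/sock.h):

static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
				     int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

which replaces the asymmetric set-then-maybe-reset dance the deleted lines performed.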
@@ -1103,14 +1098,14 @@ set_rcvbuf:
if (val < 0)
ret = -EINVAL;
else
- sk->sk_ll_usec = val;
+ WRITE_ONCE(sk->sk_ll_usec, val);
}
break;
#endif
case SO_MAX_PACING_RATE:
{
- unsigned long ulval = (val == ~0U) ? ~0UL : val;
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
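Two hardening tweaks here: sk_ll_usec gains WRITE_ONCE() because busy-poll readers load it without the socket lock, and the pacing-rate cast keeps a negative int from sign-extending into a huge unsigned long. The annotation pattern for the former, as a sketch (function names invented):

static void set_busy_poll_usec(struct sock *sk, unsigned int val)
{
	WRITE_ONCE(sk->sk_ll_usec, val);	/* writer, under lock_sock() */
}

static unsigned int get_busy_poll_usec(const struct sock *sk)
{
	return READ_ONCE(sk->sk_ll_usec);	/* lockless reader */
}

Paired ONCE accessors stop the compiler from tearing, fusing, or re-reading the shared field.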
@@ -1186,6 +1181,16 @@ set_rcvbuf:
}
EXPORT_SYMBOL(sock_setsockopt);
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
+
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
+}
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
struct ucred *ucred)
@@ -1360,7 +1365,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct ucred peercred;
if (len > sizeof(peercred))
len = sizeof(peercred);
+
+ spin_lock(&sk->sk_peer_lock);
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+ spin_unlock(&sk->sk_peer_lock);
+
if (copy_to_user(optval, &peercred, len))
return -EFAULT;
goto lenout;
@@ -1368,20 +1377,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_PEERGROUPS:
{
+ const struct cred *cred;
int ret, n;
- if (!sk->sk_peer_cred)
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
return -ENODATA;
- n = sk->sk_peer_cred->group_info->ngroups;
+ n = cred->group_info->ngroups;
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
+ put_cred(cred);
return put_user(len, optlen) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval,
- sk->sk_peer_cred->group_info);
+ ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+ put_cred(cred);
if (ret)
return ret;
goto lenout;
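The getter side now follows a take-reference-then-use discipline, so a concurrent updater (af_unix connect() refreshing peer credentials, for instance) can swap sk->sk_peer_cred underneath without a use-after-free. The usage shape, condensed from the hunk above:

	const struct cred *cred = sk_get_peer_cred(sk);	/* +1 ref, under lock */

	if (!cred)
		return -ENODATA;
	/* ... read cred->group_info ... */
	put_cred(cred);					/* -1 ref */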
@@ -1719,9 +1731,10 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_frag.page = NULL;
}
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ /* We do not need to acquire sk->sk_peer_lock; we are the last user. */
+ put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
+
if (likely(sk->sk_net_refcnt))
put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
@@ -2031,16 +2044,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (can_skb_orphan_partial(skb)) {
- struct sock *sk = skb->sk;
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
- skb_orphan(skb);
- }
+ skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
@@ -2348,8 +2355,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
@@ -2926,6 +2931,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
+
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index df52061f99f7..2646e8f98f67 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (err)
goto free_stab;
- stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+ stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
if (stab->sks)
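max_entries is a u32, so without the cast the multiplication happens in 32 bits and can wrap before being widened, yielding an undersized allocation. A stand-alone demonstration of the wrap:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t max_entries = 0x20000001;	/* > UINT32_MAX / 8 */
	uint32_t wrapped = max_entries * (uint32_t)sizeof(void *);
	uint64_t widened = (uint64_t)max_entries * sizeof(void *);

	/* prints: wrapped=8 widened=4294967304 (on LP64) */
	printf("wrapped=%u widened=%llu\n",
	       wrapped, (unsigned long long)widened);
	return 0;
}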
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 40829111fe00..273c921751e4 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -302,7 +302,7 @@ select_by_hash:
i = j = reciprocal_scale(hash, socks);
while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
i++;
- if (i >= reuse->num_socks)
+ if (i >= socks)
i = 0;
if (i == j)
goto out;
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..a166a32b411f 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -195,9 +195,6 @@ void sk_stream_kill_queues(struct sock *sk)
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
- /* Next, the error queue. */
- __skb_queue_purge(&sk->sk_error_queue);
-
/* Next, the write queue. */
WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 669cbe1609d9..48041f50ecfb 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -424,7 +424,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0600,
.proc_handler = proc_dolongvec_minmax_bpf_restricted,
.extra1 = &long_one,
- .extra2 = &long_max,
+ .extra2 = &bpf_jit_limit_max,
},
#endif
{