aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-02-21 11:59:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-02-21 11:59:51 -0800
commit3dc55dba67231fc22352483f5ca737df96cdc1e6 (patch)
tree5eec1f5ce42822cb744313ec394b761fab0f7ed6 /kernel
parentMerge branch 'akpm' (patches from Andrew) (diff)
parentMerge branch 'bnxt_en-shutdown-and-kexec-kdump-related-fixes' (diff)
downloadwireguard-linux-3dc55dba67231fc22352483f5ca737df96cdc1e6.tar.xz
wireguard-linux-3dc55dba67231fc22352483f5ca737df96cdc1e6.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from David Miller: 1) Limit xt_hashlimit hash table size to avoid OOM or hung tasks, from Cong Wang. 2) Fix deadlock in xsk by publishing global consumer pointers when NAPI is finished, from Magnus Karlsson. 3) Set table field properly to RT_TABLE_COMPAT when necessary, from Jethro Beekman. 4) NLA_STRING attributes are not necessary NULL terminated, deal wiht that in IFLA_ALT_IFNAME. From Eric Dumazet. 5) Fix checksum handling in atlantic driver, from Dmitry Bezrukov. 6) Handle mtu==0 devices properly in wireguard, from Jason A. Donenfeld. 7) Fix several lockdep warnings in bonding, from Taehee Yoo. 8) Fix cls_flower port blocking, from Jason Baron. 9) Sanitize internal map names in libbpf, from Toke Høiland-Jørgensen. 10) Fix RDMA race in qede driver, from Michal Kalderon. 11) Fix several false lockdep warnings by adding conditions to list_for_each_entry_rcu(), from Madhuparna Bhowmik. 12) Fix sleep in atomic in mlx5 driver, from Huy Nguyen. 13) Fix potential deadlock in bpf_map_do_batch(), from Yonghong Song. 14) Hey, variables declared in switch statement before any case statements are not initialized. I learn something every day. Get rids of this stuff in several parts of the networking, from Kees Cook. * git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (99 commits) bnxt_en: Issue PCIe FLR in kdump kernel to cleanup pending DMAs. bnxt_en: Improve device shutdown method. net: netlink: cap max groups which will be considered in netlink_bind() net: thunderx: workaround BGX TX Underflow issue ionic: fix fw_status read net: disable BRIDGE_NETFILTER by default net: macb: Properly handle phylink on at91rm9200 s390/qeth: fix off-by-one in RX copybreak check s390/qeth: don't warn for napi with 0 budget s390/qeth: vnicc Fix EOPNOTSUPP precedence openvswitch: Distribute switch variables for initialization net: ip6_gre: Distribute switch variables for initialization net: core: Distribute switch variables for initialization udp: rehash on disconnect net/tls: Fix to avoid gettig invalid tls record bpf: Fix a potential deadlock with bpf_map_do_batch bpf: Do not grab the bucket spinlock by default on htab batch ops ice: Wait for VF to be reset/ready before configuration ice: Don't tell the OS that link is going down ice: Don't reject odd values of usecs set by user ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/btf.c6
-rw-r--r--kernel/bpf/hashtab.c58
-rw-r--r--kernel/bpf/offload.c2
3 files changed, 57 insertions, 9 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 805c43b083e9..787140095e58 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
* EFAULT - verifier bug
* 0 - 99% match. The last 1% is validated by the verifier.
*/
-int btf_check_func_type_match(struct bpf_verifier_log *log,
- struct btf *btf1, const struct btf_type *t1,
- struct btf *btf2, const struct btf_type *t2)
+static int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
{
const struct btf_param *args1, *args2;
const char *fn1, *fn2, *s1, *s2;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2d182c4ee9d9..a1468e3f5af2 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -56,6 +56,7 @@ struct htab_elem {
union {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
+ struct htab_elem *batch_flink;
};
};
};
@@ -126,6 +127,17 @@ free_elems:
bpf_map_area_free(htab->elems);
}
+/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
+ * (bucket_lock). If both locks need to be acquired together, the lock
+ * order is always lru_lock -> bucket_lock and this only happens in
+ * bpf_lru_list.c logic. For example, certain code path of
+ * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
+ * will acquire lru_lock first followed by acquiring bucket_lock.
+ *
+ * In hashtab.c, to avoid deadlock, lock acquisition of
+ * bucket_lock followed by lru_lock is not allowed. In such cases,
+ * bucket_lock needs to be released first before acquiring lru_lock.
+ */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
u32 hash)
{
@@ -1256,10 +1268,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size;
+ struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
- unsigned long flags;
+ unsigned long flags = 0;
+ bool locked = false;
struct htab_elem *l;
struct bucket *b;
int ret = 0;
@@ -1319,15 +1333,25 @@ again_nocopy:
dst_val = values;
b = &htab->buckets[batch];
head = &b->head;
- raw_spin_lock_irqsave(&b->lock, flags);
+ /* do not grab the lock unless need it (bucket_cnt > 0). */
+ if (locked)
+ raw_spin_lock_irqsave(&b->lock, flags);
bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
bucket_cnt++;
+ if (bucket_cnt && !locked) {
+ locked = true;
+ goto again_nocopy;
+ }
+
if (bucket_cnt > (max_count - total)) {
if (total == 0)
ret = -ENOSPC;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
@@ -1337,6 +1361,9 @@ again_nocopy:
if (bucket_cnt > bucket_size) {
bucket_size = bucket_cnt;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
@@ -1346,6 +1373,10 @@ again_nocopy:
goto alloc;
}
+ /* Next block is only safe to run if you have grabbed the lock */
+ if (!locked)
+ goto next_batch;
+
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
memcpy(dst_key, l->key, key_size);
@@ -1370,16 +1401,33 @@ again_nocopy:
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
- if (is_lru_map)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
- else
+
+ /* bpf_lru_push_free() will acquire lru_lock, which
+ * may cause deadlock. See comments in function
+ * prealloc_lru_pop(). Let us do bpf_lru_push_free()
+ * after releasing the bucket lock.
+ */
+ if (is_lru_map) {
+ l->batch_flink = node_to_free;
+ node_to_free = l;
+ } else {
free_htab_elem(htab, l);
+ }
}
dst_key += key_size;
dst_val += value_size;
}
raw_spin_unlock_irqrestore(&b->lock, flags);
+ locked = false;
+
+ while (node_to_free) {
+ l = node_to_free;
+ node_to_free = node_to_free->batch_flink;
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ }
+
+next_batch:
/* If we are not copying data, we can go to next bucket and avoid
* unlocking the rcu.
*/
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2c5dc6541ece..bd09290e3648 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
ulen = info->jited_prog_len;
info->jited_prog_len = aux->offload->jited_len;
- if (info->jited_prog_len & ulen) {
+ if (info->jited_prog_len && ulen) {
uinsns = u64_to_user_ptr(info->jited_prog_insns);
ulen = min_t(u32, info->jited_prog_len, ulen);
if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {