From d3dcf8eb615537526bd42ff27a081d46d337816e Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 4 Mar 2018 17:29:48 +0200 Subject: rhashtable: Fix rhlist duplicates insertion When inserting duplicate objects (those with the same key), current rhlist implementation messes up the chain pointers by updating the bucket pointer instead of prev next pointer to the newly inserted node. This causes missing elements on removal and travesal. Fix that by properly updating pprev pointer to point to the correct rhash_head next pointer. Issue: 1241076 Change-Id: I86b2c140bcb4aeb10b70a72a267ff590bb2b17e7 Fixes: ca26893f05e8 ('rhashtable: Add rhlist interface') Signed-off-by: Paul Blakey Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3825c30aaa36..47de025b6245 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -506,8 +506,10 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head)))) + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; continue; + } if (!ht->rhlist) return rht_obj(ht, head); -- cgit v1.2.3-59-g8ed1b From 499ac3b60f657dae82055fc81c7b01e6242ac9bc Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 4 Mar 2018 17:29:49 +0200 Subject: test_rhashtable: add test case for rhltable with duplicate objects Tries to insert duplicates in the middle of bucket's chain: bucket 1: [[val 21 (tid=1)]] -> [[ val 1 (tid=2), val 1 (tid=0) ]] Reuses tid to distinguish the elements insertion order. Signed-off-by: Paul Blakey Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) (limited to 'lib') diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 76d3667fdea2..f4000c137dbe 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -79,6 +79,21 @@ struct thread_data { struct test_obj *objs; }; +static u32 my_hashfn(const void *data, u32 len, u32 seed) +{ + const struct test_obj_rhl *obj = data; + + return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE; +} + +static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct test_obj_rhl *test_obj = obj; + const struct test_obj_val *val = arg->key; + + return test_obj->value.id - val->id; +} + static struct rhashtable_params test_rht_params = { .head_offset = offsetof(struct test_obj, node), .key_offset = offsetof(struct test_obj, value), @@ -87,6 +102,17 @@ static struct rhashtable_params test_rht_params = { .nulls_base = (3U << RHT_BASE_SHIFT), }; +static struct rhashtable_params test_rht_params_dup = { + .head_offset = offsetof(struct test_obj_rhl, list_node), + .key_offset = offsetof(struct test_obj_rhl, value), + .key_len = sizeof(struct test_obj_val), + .hashfn = jhash, + .obj_hashfn = my_hashfn, + .obj_cmpfn = my_cmpfn, + .nelem_hint = 128, + .automatic_shrinking = false, +}; + static struct semaphore prestart_sem; static struct semaphore startup_sem = __SEMAPHORE_INITIALIZER(startup_sem, 0); @@ -465,6 +491,112 @@ static int __init test_rhashtable_max(struct test_obj *array, return err; } +static unsigned int __init print_ht(struct rhltable *rhlt) +{ + struct rhashtable *ht; + const struct bucket_table *tbl; + char buff[512] = ""; + unsigned int i, cnt = 0; + + ht = &rhlt->ht; + tbl = rht_dereference(ht->tbl, ht); + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + struct test_obj_rhl *p; + + pos = rht_dereference(tbl->buckets[i], ht); + next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; + + if (!rht_is_a_nulls(pos)) { + sprintf(buff, "%s\nbucket[%d] -> ", buff, i); + } + + while (!rht_is_a_nulls(pos)) { + struct rhlist_head *list = container_of(pos, struct rhlist_head, rhead); + sprintf(buff, "%s[[", buff); + do { + pos = &list->rhead; + list = rht_dereference(list->next, ht); + p = rht_obj(ht, pos); + + sprintf(buff, "%s val %d (tid=%d)%s", buff, p->value.id, p->value.tid, + list? ", " : " "); + cnt++; + } while (list); + + pos = next, + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + + sprintf(buff, "%s]]%s", buff, !rht_is_a_nulls(pos) ? " -> " : ""); + } + } + printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff); + + return cnt; +} + +static int __init test_insert_dup(struct test_obj_rhl *rhl_test_objects, + int cnt, bool slow) +{ + struct rhltable rhlt; + unsigned int i, ret; + const char *key; + int err = 0; + + err = rhltable_init(&rhlt, &test_rht_params_dup); + if (WARN_ON(err)) + return err; + + for (i = 0; i < cnt; i++) { + rhl_test_objects[i].value.tid = i; + key = rht_obj(&rhlt.ht, &rhl_test_objects[i].list_node.rhead); + key += test_rht_params_dup.key_offset; + + if (slow) { + err = PTR_ERR(rhashtable_insert_slow(&rhlt.ht, key, + &rhl_test_objects[i].list_node.rhead)); + if (err == -EAGAIN) + err = 0; + } else + err = rhltable_insert(&rhlt, + &rhl_test_objects[i].list_node, + test_rht_params_dup); + if (WARN(err, "error %d on element %d/%d (%s)\n", err, i, cnt, slow? "slow" : "fast")) + goto skip_print; + } + + ret = print_ht(&rhlt); + WARN(ret != cnt, "missing rhltable elements (%d != %d, %s)\n", ret, cnt, slow? "slow" : "fast"); + +skip_print: + rhltable_destroy(&rhlt); + + return 0; +} + +static int __init test_insert_duplicates_run(void) +{ + struct test_obj_rhl rhl_test_objects[3] = {}; + + pr_info("test inserting duplicates\n"); + + /* two different values that map to same bucket */ + rhl_test_objects[0].value.id = 1; + rhl_test_objects[1].value.id = 21; + + /* and another duplicate with same as [0] value + * which will be second on the bucket list */ + rhl_test_objects[2].value.id = rhl_test_objects[0].value.id; + + test_insert_dup(rhl_test_objects, 2, false); + test_insert_dup(rhl_test_objects, 3, false); + test_insert_dup(rhl_test_objects, 2, true); + test_insert_dup(rhl_test_objects, 3, true); + + return 0; +} + static int thread_lookup_test(struct thread_data *tdata) { unsigned int entries = tdata->entries; @@ -613,6 +745,8 @@ static int __init test_rht_init(void) do_div(total_time, runs); pr_info("Average test time: %llu\n", total_time); + test_insert_duplicates_run(); + if (!tcount) return 0; -- cgit v1.2.3-59-g8ed1b From 8df3aaaf9b5f8bdfc4036695fa22f35b45b4d92f Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 13 Mar 2018 11:36:49 -0700 Subject: btree: avoid variable-length allocations geo->keylen cannot be larger than 4. So we might as well make fixed-size allocations. Given the one remaining user, geo->keylen cannot even be larger than 1. Logfs used to have 64bit and 128bit keys, tcm_qla2xxx only has 32bit keys. But let's not break the code if we don't have to. Signed-off-by: Joern Engel Signed-off-by: Linus Torvalds --- lib/btree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/btree.c b/lib/btree.c index f93a945274af..590facba2c50 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -3,7 +3,7 @@ * * As should be obvious for Linux kernel code, license is GPLv2 * - * Copyright (c) 2007-2008 Joern Engel + * Copyright (c) 2007-2008 Joern Engel * Bits and pieces stolen from Peter Zijlstra's code, which is * Copyright 2007, Red Hat Inc. Peter Zijlstra * GPLv2 @@ -76,6 +76,8 @@ struct btree_geo btree_geo128 = { }; EXPORT_SYMBOL_GPL(btree_geo128); +#define MAX_KEYLEN (2 * LONG_PER_U64) + static struct kmem_cache *btree_cachep; void *btree_alloc(gfp_t gfp_mask, void *pool_data) @@ -313,7 +315,7 @@ void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, { int i, height; unsigned long *node, *oldnode; - unsigned long *retry_key = NULL, key[geo->keylen]; + unsigned long *retry_key = NULL, key[MAX_KEYLEN]; if (keyzero(geo, __key)) return NULL; @@ -639,8 +641,8 @@ EXPORT_SYMBOL_GPL(btree_remove); int btree_merge(struct btree_head *target, struct btree_head *victim, struct btree_geo *geo, gfp_t gfp) { - unsigned long key[geo->keylen]; - unsigned long dup[geo->keylen]; + unsigned long key[MAX_KEYLEN]; + unsigned long dup[MAX_KEYLEN]; void *val; int err; -- cgit v1.2.3-59-g8ed1b From b3a5d111994450909158929560906f2c1c6c1d85 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 12:45:12 -0700 Subject: percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods percpu_ref internally uses sched-RCU to implement the percpu -> atomic mode switching and the documentation suggested that this could be depended upon. This doesn't seem like a good idea. * percpu_ref uses sched-RCU which has different grace periods regular RCU. Users may combine percpu_ref with regular RCU usage and incorrectly believe that regular RCU grace periods are performed by percpu_ref. This can lead to, for example, use-after-free due to premature freeing. * percpu_ref has a grace period when switching from percpu to atomic mode. It doesn't have one between the last put and release. This distinction is subtle and can lead to surprising bugs. * percpu_ref allows starting in and switching to atomic mode manually for debugging and other purposes. This means that there may not be any grace periods from kill to release. This patch makes it clear that the grace periods are percpu_ref's internal implementation detail and can't be depended upon by the users. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Linus Torvalds Signed-off-by: Tejun Heo --- include/linux/percpu-refcount.h | 18 ++++++++++++------ lib/percpu-refcount.c | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 864d167a1073..009cdf3d65b6 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -30,10 +30,14 @@ * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it - * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove - * the kioctx from the proccess's list of kioctxs - after that, there can't be - * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop - * the initial ref with percpu_ref_put(). + * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. + * After that, there can't be any new users of the kioctx (from lookup_ioctx()) + * and it's then safe to drop the initial ref with percpu_ref_put(). + * + * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() + * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't + * imply RCU grace periods of any kind and if a user wants to combine percpu_ref + * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped @@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref); * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * - * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the - * percpu counters and dropping the initial ref. + * Switches @ref into atomic mode before gathering up the percpu counters + * and dropping the initial ref. + * + * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 30e7dd88148b..9f96fa7bc000 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the * process of switching to atomic mode by percpu_ref_switch_to_atomic(). + * + * There are no implied RCU grace periods between kill and release. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) -- cgit v1.2.3-59-g8ed1b From 52fda36d63bfc8c8e8ae5eda8eb5ac6f52cd67ed Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 20 Mar 2018 09:58:51 -0300 Subject: test_bpf: Fix testing with CONFIG_BPF_JIT_ALWAYS_ON=y on other arches Function bpf_fill_maxinsns11 is designed to not be able to be JITed on x86_64. So, it fails when CONFIG_BPF_JIT_ALWAYS_ON=y, and commit 09584b406742 ("bpf: fix selftests/bpf test_kmod.sh failure when CONFIG_BPF_JIT_ALWAYS_ON=y") makes sure that failure is detected on that case. However, it does not fail on other architectures, which have a different JIT compiler design. So, test_bpf has started to fail to load on those. After this fix, test_bpf loads fine on both x86_64 and ppc64el. Fixes: 09584b406742 ("bpf: fix selftests/bpf test_kmod.sh failure when CONFIG_BPF_JIT_ALWAYS_ON=y") Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Yonghong Song Signed-off-by: Daniel Borkmann --- lib/test_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 2efb213716fa..3e9335493fe4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5467,7 +5467,7 @@ static struct bpf_test tests[] = { { "BPF_MAXINSNS: Jump, gap, jump, ...", { }, -#ifdef CONFIG_BPF_JIT_ALWAYS_ON +#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_X86) CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, #else CLASSIC | FLAG_NO_DATA, -- cgit v1.2.3-59-g8ed1b From b6bdb7517c3d3f41f20e5c2948d6bc3f8897394e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 22 Mar 2018 16:17:20 -0700 Subject: mm/vmalloc: add interfaces to free unmapped page table On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may create pud/pmd mappings. A kernel panic was observed on arm64 systems with Cortex-A75 in the following steps as described by Hanjun Guo. 1. ioremap a 4K size, valid page table will build, 2. iounmap it, pte0 will set to 0; 3. ioremap the same address with 2M size, pgd/pmd is unchanged, then set the a new value for pmd; 4. pte0 is leaked; 5. CPU may meet exception because the old pmd is still in TLB, which will lead to kernel panic. This panic is not reproducible on x86. INVLPG, called from iounmap, purges all levels of entries associated with purged address on x86. x86 still has memory leak. The patch changes the ioremap path to free unmapped page table(s) since doing so in the unmap path has the following issues: - The iounmap() path is shared with vunmap(). Since vmap() only supports pte mappings, making vunmap() to free a pte page is an overhead for regular vmap users as they do not need a pte page freed up. - Checking if all entries in a pte page are cleared in the unmap path is racy, and serializing this check is expensive. - The unmap path calls free_vmap_area_noflush() to do lazy TLB purges. Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB purge. Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which clear a given pud/pmd entry and free up a page for the lower level entries. This patch implements their stub functions on x86 and arm64, which work as workaround. [akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub] Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings") Reported-by: Lei Li Signed-off-by: Toshi Kani Cc: Catalin Marinas Cc: Wang Xuefeng Cc: Will Deacon Cc: Hanjun Guo Cc: Michal Hocko Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Matthew Wilcox Cc: Chintan Pandya Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 10 ++++++++++ arch/x86/mm/pgtable.c | 24 ++++++++++++++++++++++++ include/asm-generic/pgtable.h | 10 ++++++++++ lib/ioremap.c | 6 ++++-- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8c704f1e53c2..2dbb2c9f1ec1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } + +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 004abf9ebf12..1eed7ed518e6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -702,4 +702,28 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } + +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * + * Context: The pud range has been unmaped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * + * Context: The pmd range has been unmaped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2cfa3075d148..bfbb44a5ad38 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); +int pud_free_pmd_page(pud_t *pud); +int pmd_free_pte_page(pmd_t *pmd); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { @@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } +static inline int pud_free_pmd_page(pud_t *pud) +{ + return 0; +} +static inline int pmd_free_pte_page(pmd_t *pmd) +{ + return 0; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE diff --git a/lib/ioremap.c b/lib/ioremap.c index b808a390e4c3..54e5bbaa3200 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, if (ioremap_pmd_enabled() && ((next - addr) == PMD_SIZE) && - IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PMD_SIZE) && + pmd_free_pte_page(pmd)) { if (pmd_set_huge(pmd, phys_addr + addr, prot)) continue; } @@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, if (ioremap_pud_enabled() && ((next - addr) == PUD_SIZE) && - IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PUD_SIZE) && + pud_free_pmd_page(pud)) { if (pud_set_huge(pud, phys_addr + addr, prot)) continue; } -- cgit v1.2.3-59-g8ed1b