From ee1a84fdfeedfd7362e9a8a8f15fedc3482ade2d Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 30 Nov 2018 16:24:08 +0100 Subject: selinux: overhaul sidtab to fix bug and improve performance Before this patch, during a policy reload the sidtab would become frozen and trying to map a new context to SID would be unable to add a new entry to sidtab and fail with -ENOMEM. Such failures are usually propagated into userspace, which has no way of distignuishing them from actual allocation failures and thus doesn't handle them gracefully. Such situation can be triggered e.g. by the following reproducer: while true; do load_policy; echo -n .; sleep 0.1; done & for (( i = 0; i < 1024; i++ )); do runcon -l s0:c$i echo -n x || break # or: # chcon -l s0:c$i || break done This patch overhauls the sidtab so it doesn't need to be frozen during policy reload, thus solving the above problem. The new SID table leverages the fact that SIDs are allocated sequentially and are never invalidated and stores them in linear buckets indexed by a tree structure. This brings several advantages: 1. Fast SID -> context lookup - this lookup can now be done in logarithmic time complexity (usually in less than 4 array lookups) and can still be done safely without locking. 2. No need to re-search the whole table on reverse lookup miss - after acquiring the spinlock only the newly added entries need to be searched, which means that reverse lookups that end up inserting a new entry are now about twice as fast. 3. No need to freeze sidtab during policy reload - it is now possible to handle insertion of new entries even during sidtab conversion. The tree structure of the new sidtab is able to grow automatically to up to about 2^31 entries (at which point it should not have more than about 4 tree levels). The old sidtab had a theoretical capacity of almost 2^32 entries, but half of that is still more than enough since by that point the reverse table lookups would become unusably slow anyway... The number of entries per tree node is selected automatically so that each node fits into a single page, which should be the easiest size for kmalloc() to handle. Note that the cache for reverse lookup is preserved with equivalent logic. The only difference is that instead of storing pointers to the hash table nodes it stores just the indices of the cached entries. The new cache ensures that the indices are loaded/stored atomically, but it still has the drawback that concurrent cache updates may mess up the contents of the cache. Such situation however only reduces its effectivity, not the correctness of lookups. Tested by selinux-testsuite and thoroughly tortured by this simple stress test: ``` function rand_cat() { echo $(( $RANDOM % 1024 )) } function do_work() { while true; do echo -n "system_u:system_r:kernel_t:s0:c$(rand_cat),c$(rand_cat)" \ >/sys/fs/selinux/context 2>/dev/null || true done } do_work >/dev/null & do_work >/dev/null & do_work >/dev/null & while load_policy; do echo -n .; sleep 0.1; done kill %1 kill %2 kill %3 ``` Link: https://github.com/SELinuxProject/selinux-kernel/issues/38 Reported-by: Orion Poplawski Reported-by: Li Kun Signed-off-by: Ondrej Mosnacek Reviewed-by: Stephen Smalley [PM: most of sidtab.c merged by hand due to conflicts] [PM: checkpatch fixes in mls.c, services.c, sidtab.c] Signed-off-by: Paul Moore --- security/selinux/ss/sidtab.c | 563 ++++++++++++++++++++++++++----------------- 1 file changed, 346 insertions(+), 217 deletions(-) (limited to 'security/selinux/ss/sidtab.c') diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index e44e7cec630c..e63a90ff2728 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -2,88 +2,41 @@ /* * Implementation of the SID table type. * - * Author : Stephen Smalley, + * Original author: Stephen Smalley, + * Author: Ondrej Mosnacek, + * + * Copyright (C) 2018 Red Hat, Inc. */ +#include #include #include +#include #include -#include +#include #include "flask.h" #include "security.h" #include "sidtab.h" -#define SIDTAB_HASH(sid) \ -(sid & SIDTAB_HASH_MASK) - int sidtab_init(struct sidtab *s) { - int i; + u32 i; - s->htable = kmalloc_array(SIDTAB_SIZE, sizeof(*s->htable), GFP_ATOMIC); - if (!s->htable) - return -ENOMEM; + memset(s->roots, 0, sizeof(s->roots)); + + for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) + atomic_set(&s->rcache[i], -1); for (i = 0; i < SECINITSID_NUM; i++) s->isids[i].set = 0; - for (i = 0; i < SIDTAB_SIZE; i++) - s->htable[i] = NULL; + atomic_set(&s->count, 0); - for (i = 0; i < SIDTAB_CACHE_LEN; i++) - s->cache[i] = NULL; + s->convert = NULL; - s->nel = 0; - s->next_sid = 0; - s->shutdown = 0; spin_lock_init(&s->lock); return 0; } -static int sidtab_insert(struct sidtab *s, u32 sid, struct context *context) -{ - int hvalue; - struct sidtab_node *prev, *cur, *newnode; - - if (!s) - return -ENOMEM; - - hvalue = SIDTAB_HASH(sid); - prev = NULL; - cur = s->htable[hvalue]; - while (cur && sid > cur->sid) { - prev = cur; - cur = cur->next; - } - - if (cur && sid == cur->sid) - return -EEXIST; - - newnode = kmalloc(sizeof(*newnode), GFP_ATOMIC); - if (!newnode) - return -ENOMEM; - - newnode->sid = sid; - if (context_cpy(&newnode->context, context)) { - kfree(newnode); - return -ENOMEM; - } - - if (prev) { - newnode->next = prev->next; - wmb(); - prev->next = newnode; - } else { - newnode->next = s->htable[hvalue]; - wmb(); - s->htable[hvalue] = newnode; - } - - s->nel++; - if (sid >= s->next_sid) - s->next_sid = sid + 1; - return 0; -} - int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context) { struct sidtab_isid_entry *entry; @@ -102,20 +55,90 @@ int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context) return 0; } -static struct context *sidtab_lookup(struct sidtab *s, u32 sid) +static u32 sidtab_level_from_count(u32 count) { - int hvalue; - struct sidtab_node *cur; + u32 capacity = SIDTAB_LEAF_ENTRIES; + u32 level = 0; + + while (count > capacity) { + capacity <<= SIDTAB_INNER_SHIFT; + ++level; + } + return level; +} + +static int sidtab_alloc_roots(struct sidtab *s, u32 level) +{ + u32 l; + + if (!s->roots[0].ptr_leaf) { + s->roots[0].ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_ATOMIC); + if (!s->roots[0].ptr_leaf) + return -ENOMEM; + } + for (l = 1; l <= level; ++l) + if (!s->roots[l].ptr_inner) { + s->roots[l].ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_ATOMIC); + if (!s->roots[l].ptr_inner) + return -ENOMEM; + s->roots[l].ptr_inner->entries[0] = s->roots[l - 1]; + } + return 0; +} + +static struct context *sidtab_do_lookup(struct sidtab *s, u32 index, int alloc) +{ + union sidtab_entry_inner *entry; + u32 level, capacity_shift, leaf_index = index / SIDTAB_LEAF_ENTRIES; + + /* find the level of the subtree we need */ + level = sidtab_level_from_count(index + 1); + capacity_shift = level * SIDTAB_INNER_SHIFT; + + /* allocate roots if needed */ + if (alloc && sidtab_alloc_roots(s, level) != 0) + return NULL; + + /* lookup inside the subtree */ + entry = &s->roots[level]; + while (level != 0) { + capacity_shift -= SIDTAB_INNER_SHIFT; + --level; + + entry = &entry->ptr_inner->entries[leaf_index >> capacity_shift]; + leaf_index &= ((u32)1 << capacity_shift) - 1; + + if (!entry->ptr_inner) { + if (alloc) + entry->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_ATOMIC); + if (!entry->ptr_inner) + return NULL; + } + } + if (!entry->ptr_leaf) { + if (alloc) + entry->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_ATOMIC); + if (!entry->ptr_leaf) + return NULL; + } + return &entry->ptr_leaf->entries[index % SIDTAB_LEAF_ENTRIES].context; +} - hvalue = SIDTAB_HASH(sid); - cur = s->htable[hvalue]; - while (cur && sid > cur->sid) - cur = cur->next; +static struct context *sidtab_lookup(struct sidtab *s, u32 index) +{ + u32 count = (u32)atomic_read(&s->count); - if (!cur || sid != cur->sid) + if (index >= count) return NULL; - return &cur->context; + /* read entries after reading count */ + smp_rmb(); + + return sidtab_do_lookup(s, index, 0); } static struct context *sidtab_lookup_initial(struct sidtab *s, u32 sid) @@ -127,9 +150,6 @@ static struct context *sidtab_search_core(struct sidtab *s, u32 sid, int force) { struct context *context; - if (!s) - return NULL; - if (sid != 0) { if (sid > SECINITSID_NUM) context = sidtab_lookup(s, sid - (SECINITSID_NUM + 1)); @@ -152,102 +172,69 @@ struct context *sidtab_search_force(struct sidtab *s, u32 sid) return sidtab_search_core(s, sid, 1); } -static int sidtab_map(struct sidtab *s, - int (*apply)(u32 sid, - struct context *context, - void *args), - void *args) +static int sidtab_find_context(union sidtab_entry_inner entry, + u32 *pos, u32 count, u32 level, + struct context *context, u32 *index) { - int i, rc = 0; - struct sidtab_node *cur; + int rc; + u32 i; - if (!s) - goto out; + if (level != 0) { + struct sidtab_node_inner *node = entry.ptr_inner; - for (i = 0; i < SIDTAB_SIZE; i++) { - cur = s->htable[i]; - while (cur) { - rc = apply(cur->sid, &cur->context, args); - if (rc) - goto out; - cur = cur->next; + i = 0; + while (i < SIDTAB_INNER_ENTRIES && *pos < count) { + rc = sidtab_find_context(node->entries[i], + pos, count, level - 1, + context, index); + if (rc == 0) + return 0; + i++; + } + } else { + struct sidtab_node_leaf *node = entry.ptr_leaf; + + i = 0; + while (i < SIDTAB_LEAF_ENTRIES && *pos < count) { + if (context_cmp(&node->entries[i].context, context)) { + *index = *pos; + return 0; + } + (*pos)++; + i++; } } -out: - return rc; + return -ENOENT; } -/* Clone the SID into the new SID table. */ -static int clone_sid(u32 sid, struct context *context, void *arg) +static void sidtab_rcache_update(struct sidtab *s, u32 index, u32 pos) { - struct sidtab *s = arg; - return sidtab_insert(s, sid, context); + while (pos > 0) { + atomic_set(&s->rcache[pos], atomic_read(&s->rcache[pos - 1])); + --pos; + } + atomic_set(&s->rcache[0], (int)index); } -int sidtab_convert(struct sidtab *s, struct sidtab *news, - int (*convert)(u32 sid, - struct context *context, - void *args), - void *args) +static void sidtab_rcache_push(struct sidtab *s, u32 index) { - unsigned long flags; - int rc; - - spin_lock_irqsave(&s->lock, flags); - s->shutdown = 1; - spin_unlock_irqrestore(&s->lock, flags); - - rc = sidtab_map(s, clone_sid, news); - if (rc) - return rc; - - return sidtab_map(news, convert, args); + sidtab_rcache_update(s, index, SIDTAB_RCACHE_SIZE - 1); } -static void sidtab_update_cache(struct sidtab *s, struct sidtab_node *n, int loc) +static int sidtab_rcache_search(struct sidtab *s, struct context *context, + u32 *index) { - BUG_ON(loc >= SIDTAB_CACHE_LEN); + u32 i; - while (loc > 0) { - s->cache[loc] = s->cache[loc - 1]; - loc--; - } - s->cache[0] = n; -} + for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) { + int v = atomic_read(&s->rcache[i]); -static inline int sidtab_search_context(struct sidtab *s, - struct context *context, u32 *sid) -{ - int i; - struct sidtab_node *cur; - - for (i = 0; i < SIDTAB_SIZE; i++) { - cur = s->htable[i]; - while (cur) { - if (context_cmp(&cur->context, context)) { - sidtab_update_cache(s, cur, SIDTAB_CACHE_LEN - 1); - *sid = cur->sid; - return 0; - } - cur = cur->next; - } - } - return -ENOENT; -} + if (v < 0) + continue; -static inline int sidtab_search_cache(struct sidtab *s, struct context *context, - u32 *sid) -{ - int i; - struct sidtab_node *node; - - for (i = 0; i < SIDTAB_CACHE_LEN; i++) { - node = s->cache[i]; - if (unlikely(!node)) - return -ENOENT; - if (context_cmp(&node->context, context)) { - sidtab_update_cache(s, node, i); - *sid = node->sid; + if (context_cmp(sidtab_do_lookup(s, (u32)v, 0), context)) { + sidtab_rcache_update(s, (u32)v, i); + *index = (u32)v; return 0; } } @@ -255,38 +242,98 @@ static inline int sidtab_search_cache(struct sidtab *s, struct context *context, } static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, - u32 *sid) + u32 *index) { - int ret; unsigned long flags; + u32 count = (u32)atomic_read(&s->count); + u32 count_locked, level, pos; + struct sidtab_convert_params *convert; + struct context *dst, *dst_convert; + int rc; - ret = sidtab_search_cache(s, context, sid); - if (ret) - ret = sidtab_search_context(s, context, sid); - if (ret) { - spin_lock_irqsave(&s->lock, flags); - /* Rescan now that we hold the lock. */ - ret = sidtab_search_context(s, context, sid); - if (!ret) - goto unlock_out; - /* No SID exists for the context. Allocate a new one. */ - if (s->next_sid == (UINT_MAX - SECINITSID_NUM - 1) || - s->shutdown) { - ret = -ENOMEM; - goto unlock_out; + rc = sidtab_rcache_search(s, context, index); + if (rc == 0) + return 0; + + level = sidtab_level_from_count(count); + + /* read entries after reading count */ + smp_rmb(); + + pos = 0; + rc = sidtab_find_context(s->roots[level], &pos, count, level, + context, index); + if (rc == 0) { + sidtab_rcache_push(s, *index); + return 0; + } + + /* lock-free search failed: lock, re-search, and insert if not found */ + spin_lock_irqsave(&s->lock, flags); + + convert = s->convert; + count_locked = (u32)atomic_read(&s->count); + level = sidtab_level_from_count(count_locked); + + /* if count has changed before we acquired the lock, then catch up */ + while (count < count_locked) { + if (context_cmp(sidtab_do_lookup(s, count, 0), context)) { + sidtab_rcache_push(s, count); + *index = count; + rc = 0; + goto out_unlock; } - *sid = s->next_sid++; - if (context->len) - pr_info("SELinux: Context %s is not valid (left unmapped).\n", - context->str); - ret = sidtab_insert(s, *sid, context); - if (ret) - s->next_sid--; -unlock_out: - spin_unlock_irqrestore(&s->lock, flags); + ++count; + } + + /* insert context into new entry */ + rc = -ENOMEM; + dst = sidtab_do_lookup(s, count, 1); + if (!dst) + goto out_unlock; + + rc = context_cpy(dst, context); + if (rc) + goto out_unlock; + + /* + * if we are building a new sidtab, we need to convert the context + * and insert it there as well + */ + if (convert) { + rc = -ENOMEM; + dst_convert = sidtab_do_lookup(convert->target, count, 1); + if (!dst_convert) { + context_destroy(dst); + goto out_unlock; + } + + rc = convert->func(context, dst_convert, convert->args); + if (rc) { + context_destroy(dst); + goto out_unlock; + } + + /* at this point we know the insert won't fail */ + atomic_set(&convert->target->count, count + 1); } - return ret; + if (context->len) + pr_info("SELinux: Context %s is not valid (left unmapped).\n", + context->str); + + sidtab_rcache_push(s, count); + *index = count; + + /* write entries before writing new count */ + smp_wmb(); + + atomic_set(&s->count, count + 1); + + rc = 0; +out_unlock: + spin_unlock_irqrestore(&s->lock, flags); + return rc; } int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid) @@ -310,57 +357,139 @@ int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid) return 0; } -void sidtab_hash_eval(struct sidtab *h, char *tag) +static int sidtab_convert_tree(union sidtab_entry_inner *edst, + union sidtab_entry_inner *esrc, + u32 *pos, u32 count, u32 level, + struct sidtab_convert_params *convert) { - int i, chain_len, slots_used, max_chain_len; - struct sidtab_node *cur; - - slots_used = 0; - max_chain_len = 0; - for (i = 0; i < SIDTAB_SIZE; i++) { - cur = h->htable[i]; - if (cur) { - slots_used++; - chain_len = 0; - while (cur) { - chain_len++; - cur = cur->next; - } + int rc; + u32 i; - if (chain_len > max_chain_len) - max_chain_len = chain_len; + if (level != 0) { + if (!edst->ptr_inner) { + edst->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_KERNEL); + if (!edst->ptr_inner) + return -ENOMEM; } + i = 0; + while (i < SIDTAB_INNER_ENTRIES && *pos < count) { + rc = sidtab_convert_tree(&edst->ptr_inner->entries[i], + &esrc->ptr_inner->entries[i], + pos, count, level - 1, + convert); + if (rc) + return rc; + i++; + } + } else { + if (!edst->ptr_leaf) { + edst->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, + GFP_KERNEL); + if (!edst->ptr_leaf) + return -ENOMEM; + } + i = 0; + while (i < SIDTAB_LEAF_ENTRIES && *pos < count) { + rc = convert->func(&esrc->ptr_leaf->entries[i].context, + &edst->ptr_leaf->entries[i].context, + convert->args); + if (rc) + return rc; + (*pos)++; + i++; + } + cond_resched(); + } + return 0; +} + +int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) +{ + unsigned long flags; + u32 count, level, pos; + int rc; + + spin_lock_irqsave(&s->lock, flags); + + /* concurrent policy loads are not allowed */ + if (s->convert) { + spin_unlock_irqrestore(&s->lock, flags); + return -EBUSY; } - pr_debug("%s: %d entries and %d/%d buckets used, longest " - "chain length %d\n", tag, h->nel, slots_used, SIDTAB_SIZE, - max_chain_len); + count = (u32)atomic_read(&s->count); + level = sidtab_level_from_count(count); + + /* allocate last leaf in the new sidtab (to avoid race with + * live convert) + */ + rc = sidtab_do_lookup(params->target, count - 1, 1) ? 0 : -ENOMEM; + if (rc) { + spin_unlock_irqrestore(&s->lock, flags); + return rc; + } + + /* set count in case no new entries are added during conversion */ + atomic_set(¶ms->target->count, count); + + /* enable live convert of new entries */ + s->convert = params; + + /* we can safely do the rest of the conversion outside the lock */ + spin_unlock_irqrestore(&s->lock, flags); + + pr_info("SELinux: Converting %u SID table entries...\n", count); + + /* convert all entries not covered by live convert */ + pos = 0; + rc = sidtab_convert_tree(¶ms->target->roots[level], + &s->roots[level], &pos, count, level, params); + if (rc) { + /* we need to keep the old table - disable live convert */ + spin_lock_irqsave(&s->lock, flags); + s->convert = NULL; + spin_unlock_irqrestore(&s->lock, flags); + } + return rc; } -void sidtab_destroy(struct sidtab *s) +static void sidtab_destroy_tree(union sidtab_entry_inner entry, u32 level) { - int i; - struct sidtab_node *cur, *temp; + u32 i; + + if (level != 0) { + struct sidtab_node_inner *node = entry.ptr_inner; + + if (!node) + return; + + for (i = 0; i < SIDTAB_INNER_ENTRIES; i++) + sidtab_destroy_tree(node->entries[i], level - 1); + kfree(node); + } else { + struct sidtab_node_leaf *node = entry.ptr_leaf; - if (!s) - return; + if (!node) + return; + + for (i = 0; i < SIDTAB_LEAF_ENTRIES; i++) + context_destroy(&node->entries[i].context); + kfree(node); + } +} + +void sidtab_destroy(struct sidtab *s) +{ + u32 i, level; for (i = 0; i < SECINITSID_NUM; i++) if (s->isids[i].set) context_destroy(&s->isids[i].context); - for (i = 0; i < SIDTAB_SIZE; i++) { - cur = s->htable[i]; - while (cur) { - temp = cur; - cur = cur->next; - context_destroy(&temp->context); - kfree(temp); - } - s->htable[i] = NULL; - } - kfree(s->htable); - s->htable = NULL; - s->nel = 0; - s->next_sid = 1; + level = SIDTAB_MAX_LEVEL; + while (level && !s->roots[level].ptr_inner) + --level; + + sidtab_destroy_tree(s->roots[level], level); } -- cgit v1.2.3-59-g8ed1b