aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2018-02-21 08:21:41 -0800
committerJames Morris <jmorris@namei.org>2018-02-21 08:21:41 -0800
commita02633e9b13dcb9b1a656b08f81bc8ba2d4d2294 (patch)
tree3d5ef56eee3117cd61812759a255fa293d8044b8 /kernel
parentSync to v4.15-rc3 for security subsystem developers to work against. (diff)
parentLinux 4.16-rc2 (diff)
downloadlinux-dev-a02633e9b13dcb9b1a656b08f81bc8ba2d4d2294.tar.xz
linux-dev-a02633e9b13dcb9b1a656b08f81bc8ba2d4d2294.zip
Merge tag 'v4.16-rc2' into next-general
Sync to Linux 4.16-rc2 for developers to work against.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/async.c20
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c104
-rw-r--r--kernel/bpf/cgroup.c15
-rw-r--r--kernel/bpf/core.c454
-rw-r--r--kernel/bpf/cpumap.c31
-rw-r--r--kernel/bpf/devmap.c8
-rw-r--r--kernel/bpf/disasm.c63
-rw-r--r--kernel/bpf/disasm.h29
-rw-r--r--kernel/bpf/hashtab.c105
-rw-r--r--kernel/bpf/inode.c90
-rw-r--r--kernel/bpf/lpm_trie.c98
-rw-r--r--kernel/bpf/offload.c430
-rw-r--r--kernel/bpf/sockmap.c208
-rw-r--r--kernel/bpf/stackmap.c34
-rw-r--r--kernel/bpf/syscall.c216
-rw-r--r--kernel/bpf/verifier.c1823
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c21
-rw-r--r--kernel/cgroup/cpuset.c4
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/stat.c8
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/configs/nopm.config15
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/cpu.c12
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c10
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/delayacct.c42
-rw-r--r--kernel/events/callchain.c15
-rw-r--r--kernel/events/core.c116
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/events/uprobes.c12
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/fail_function.c349
-rw-r--r--kernel/fork.c540
-rw-r--r--kernel/futex.c112
-rw-r--r--kernel/groups.c5
-rw-r--r--kernel/irq/Kconfig10
-rw-r--r--kernel/irq/affinity.c30
-rw-r--r--kernel/irq/autoprobe.c2
-rw-r--r--kernel/irq/chip.c6
-rw-r--r--kernel/irq/debug.h19
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/generic-chip.c11
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/irqdomain.c149
-rw-r--r--kernel/irq/matrix.c20
-rw-r--r--kernel/irq/msi.c64
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/jump_label.c12
-rw-r--r--kernel/kallsyms.c46
-rw-r--r--kernel/kcov.c8
-rw-r--r--kernel/kprobes.c178
-rw-r--r--kernel/livepatch/core.c76
-rw-r--r--kernel/livepatch/transition.c116
-rw-r--r--kernel/livepatch/transition.h2
-rw-r--r--kernel/locking/lockdep.c741
-rw-r--r--kernel/locking/locktorture.c108
-rw-r--r--kernel/locking/qspinlock.c25
-rw-r--r--kernel/locking/rtmutex.c26
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/locking/spinlock.c13
-rw-r--r--kernel/memremap.c174
-rw-r--r--kernel/module.c30
-rw-r--r--kernel/padata.c1
-rw-r--r--kernel/pid.c35
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/snapshot.c6
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/printk/printk.c225
-rw-r--r--kernel/ptrace.c36
-rw-r--r--kernel/rcu/rcu.h27
-rw-r--r--kernel/rcu/rcuperf.c6
-rw-r--r--kernel/rcu/rcutorture.c12
-rw-r--r--kernel/rcu/srcutree.c109
-rw-r--r--kernel/rcu/tree.c355
-rw-r--r--kernel/rcu/tree.h5
-rw-r--r--kernel/rcu/tree_plugin.h13
-rw-r--r--kernel/rcu/update.c2
-rw-r--r--kernel/relay.c9
-rw-r--r--kernel/resource.c39
-rw-r--r--kernel/sched/autogroup.c5
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c195
-rw-r--r--kernel/sched/cpufreq_schedutil.c97
-rw-r--r--kernel/sched/deadline.c149
-rw-r--r--kernel/sched/fair.c144
-rw-r--r--kernel/sched/membarrier.c179
-rw-r--r--kernel/sched/rt.c40
-rw-r--r--kernel/sched/sched.h114
-rw-r--r--kernel/sched/stats.h6
-rw-r--r--kernel/sched/topology.c13
-rw-r--r--kernel/seccomp.c108
-rw-r--r--kernel/signal.c354
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c40
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/hrtimer.c655
-rw-r--r--kernel/time/posix-clock.c8
-rw-r--r--kernel/time/posix-cpu-timers.c25
-rw-r--r--kernel/time/posix-timers.c31
-rw-r--r--kernel/time/tick-internal.h13
-rw-r--r--kernel/time/tick-sched.c34
-rw-r--r--kernel/time/timer.c127
-rw-r--r--kernel/torture.c39
-rw-r--r--kernel/trace/Kconfig12
-rw-r--r--kernel/trace/bpf_trace.c78
-rw-r--r--kernel/trace/ftrace.c32
-rw-r--r--kernel/trace/ring_buffer.c85
-rw-r--r--kernel/trace/trace.c123
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_events.c18
-rw-r--r--kernel/trace/trace_events_filter.c9
-rw-r--r--kernel/trace/trace_events_trigger.c13
-rw-r--r--kernel/trace/trace_functions.c49
-rw-r--r--kernel/trace/trace_kprobe.c61
-rw-r--r--kernel/trace/trace_probe.h12
-rw-r--r--kernel/trace/trace_selftest_dynamic.c5
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/tracepoint.c9
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/workqueue.c113
133 files changed, 7099 insertions, 3744 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
diff --git a/kernel/async.c b/kernel/async.c
index 2cbd3dd5940d..a893d6170944 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -84,20 +84,24 @@ static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct list_head *pending;
+ struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
- if (domain)
- pending = &domain->pending;
- else
- pending = &async_global_pending;
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
- if (!list_empty(pending))
- ret = list_first_entry(pending, struct async_entry,
- domain_list)->cookie;
+ if (first)
+ ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0b3bab..a713fd23ec88 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
+ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
endif
+endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..b1f66480135b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,34 +49,64 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
}
/* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_array *array;
- u64 array_size;
- u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 ||
attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
- return ERR_PTR(-E2BIG);
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
+ struct bpf_array *array;
+ u64 array_size, mask64;
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,14 +116,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
- array->map.map_type = attr->map_type;
- array->map.key_size = attr->key_size;
- array->map.value_size = attr->value_size;
- array->map.max_entries = attr->max_entries;
- array->map.map_flags = attr->map_flags;
- array->map.numa_node = numa_node;
+ bpf_map_init_from_attr(&array->map, attr);
array->elem_size = elem_size;
if (!percpu)
@@ -121,12 +148,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -135,7 +163,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +190,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +210,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
@@ -225,10 +258,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- memcpy(this_cpu_ptr(array->pptrs[index]),
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
else
- memcpy(array->value + array->elem_size * index,
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
value, map->value_size);
return 0;
}
@@ -262,7 +296,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
@@ -296,6 +330,7 @@ static void array_map_free(struct bpf_map *map)
}
const struct bpf_map_ops array_map_ops = {
+ .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -306,6 +341,7 @@ const struct bpf_map_ops array_map_ops = {
};
const struct bpf_map_ops percpu_array_map_ops = {
+ .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -314,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
{
/* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
- return ERR_PTR(-EINVAL);
- return array_map_alloc(attr);
+ return -EINVAL;
+ return array_map_alloc_check(attr);
}
static void fd_array_map_free(struct bpf_map *map)
@@ -443,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
}
const struct bpf_map_ops prog_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -530,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
}
const struct bpf_map_ops perf_event_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
}
const struct bpf_map_ops cgroup_array_map_ops = {
- .map_alloc = fd_array_map_alloc,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
@@ -579,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
if (IS_ERR(inner_map_meta))
return inner_map_meta;
- map = fd_array_map_alloc(attr);
+ map = array_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
@@ -613,6 +652,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
static u32 array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -621,7 +661,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
@@ -636,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
}
const struct bpf_map_ops array_of_maps_map_ops = {
+ .map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
.map_get_next_key = array_map_get_next_key,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b789ab78d28f..c1c0b60d3f2f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
+ const int size_default = sizeof(__u32);
+
if (type == BPF_WRITE)
return false;
@@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size,
/* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
- if (size != sizeof(__u32))
- return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ break;
+ default:
+ if (size != size_default)
+ return false;
+ }
return true;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 86b50aa26ee8..29ca9208dcfa 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->prog = fp;
+ fp->jit_requested = ebpf_jit_enabled();
INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
@@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
-{
- return BPF_CLASS(insn->code) == BPF_JMP &&
- /* Call and Exit are both special jumps with no
- * target inside the BPF instruction image.
- */
- BPF_OP(insn->code) != BPF_CALL &&
- BPF_OP(insn->code) != BPF_EXIT;
-}
-
static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
{
struct bpf_insn *insn = prog->insnsi;
u32 i, insn_cnt = prog->len;
+ bool pseudo_call;
+ u8 code;
+ int off;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (!bpf_is_jmp_and_has_target(insn))
+ code = insn->code;
+ if (BPF_CLASS(code) != BPF_JMP)
+ continue;
+ if (BPF_OP(code) == BPF_EXIT)
continue;
+ if (BPF_OP(code) == BPF_CALL) {
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ pseudo_call = true;
+ else
+ continue;
+ } else {
+ pseudo_call = false;
+ }
+ off = pseudo_call ? insn->imm : insn->off;
/* Adjust offset of jmps if we cross boundaries. */
- if (i < pos && i + insn->off + 1 > pos)
- insn->off += delta;
- else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
- insn->off -= delta;
+ if (i < pos && i + off + 1 > pos)
+ off += delta;
+ else if (i > pos + delta && i + off + 1 <= pos + delta)
+ off -= delta;
+
+ if (pseudo_call)
+ insn->imm = off;
+ else
+ insn->off = off;
}
}
@@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
}
#ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
static __always_inline void
bpf_get_prog_addr_region(const struct bpf_prog *prog,
unsigned long *symbol_start,
@@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
static LIST_HEAD(bpf_kallsyms);
static struct latch_tree_root bpf_tree __cacheline_aligned;
-int bpf_jit_kallsyms __read_mostly;
-
static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
{
WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
bpf_prog_unlock_free(fp);
}
-int bpf_jit_harden __read_mostly;
-
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
struct bpf_insn *to_buff)
@@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled())
+ if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
i += insn_delta;
}
+ clone->blinded = 1;
return clone;
}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
* therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
+ * anyway later on, so do not let the compiler omit it. This also needs
+ * to go into kallsyms for correlation from e.g. bpftool, so naming
+ * must not change.
*/
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
@@ -767,6 +782,138 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3) \
+ /* 32 bit ALU operations. */ \
+ /* Register based. */ \
+ INSN_3(ALU, ADD, X), \
+ INSN_3(ALU, SUB, X), \
+ INSN_3(ALU, AND, X), \
+ INSN_3(ALU, OR, X), \
+ INSN_3(ALU, LSH, X), \
+ INSN_3(ALU, RSH, X), \
+ INSN_3(ALU, XOR, X), \
+ INSN_3(ALU, MUL, X), \
+ INSN_3(ALU, MOV, X), \
+ INSN_3(ALU, DIV, X), \
+ INSN_3(ALU, MOD, X), \
+ INSN_2(ALU, NEG), \
+ INSN_3(ALU, END, TO_BE), \
+ INSN_3(ALU, END, TO_LE), \
+ /* Immediate based. */ \
+ INSN_3(ALU, ADD, K), \
+ INSN_3(ALU, SUB, K), \
+ INSN_3(ALU, AND, K), \
+ INSN_3(ALU, OR, K), \
+ INSN_3(ALU, LSH, K), \
+ INSN_3(ALU, RSH, K), \
+ INSN_3(ALU, XOR, K), \
+ INSN_3(ALU, MUL, K), \
+ INSN_3(ALU, MOV, K), \
+ INSN_3(ALU, DIV, K), \
+ INSN_3(ALU, MOD, K), \
+ /* 64 bit ALU operations. */ \
+ /* Register based. */ \
+ INSN_3(ALU64, ADD, X), \
+ INSN_3(ALU64, SUB, X), \
+ INSN_3(ALU64, AND, X), \
+ INSN_3(ALU64, OR, X), \
+ INSN_3(ALU64, LSH, X), \
+ INSN_3(ALU64, RSH, X), \
+ INSN_3(ALU64, XOR, X), \
+ INSN_3(ALU64, MUL, X), \
+ INSN_3(ALU64, MOV, X), \
+ INSN_3(ALU64, ARSH, X), \
+ INSN_3(ALU64, DIV, X), \
+ INSN_3(ALU64, MOD, X), \
+ INSN_2(ALU64, NEG), \
+ /* Immediate based. */ \
+ INSN_3(ALU64, ADD, K), \
+ INSN_3(ALU64, SUB, K), \
+ INSN_3(ALU64, AND, K), \
+ INSN_3(ALU64, OR, K), \
+ INSN_3(ALU64, LSH, K), \
+ INSN_3(ALU64, RSH, K), \
+ INSN_3(ALU64, XOR, K), \
+ INSN_3(ALU64, MUL, K), \
+ INSN_3(ALU64, MOV, K), \
+ INSN_3(ALU64, ARSH, K), \
+ INSN_3(ALU64, DIV, K), \
+ INSN_3(ALU64, MOD, K), \
+ /* Call instruction. */ \
+ INSN_2(JMP, CALL), \
+ /* Exit instruction. */ \
+ INSN_2(JMP, EXIT), \
+ /* Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP, JEQ, X), \
+ INSN_3(JMP, JNE, X), \
+ INSN_3(JMP, JGT, X), \
+ INSN_3(JMP, JLT, X), \
+ INSN_3(JMP, JGE, X), \
+ INSN_3(JMP, JLE, X), \
+ INSN_3(JMP, JSGT, X), \
+ INSN_3(JMP, JSLT, X), \
+ INSN_3(JMP, JSGE, X), \
+ INSN_3(JMP, JSLE, X), \
+ INSN_3(JMP, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP, JEQ, K), \
+ INSN_3(JMP, JNE, K), \
+ INSN_3(JMP, JGT, K), \
+ INSN_3(JMP, JLT, K), \
+ INSN_3(JMP, JGE, K), \
+ INSN_3(JMP, JLE, K), \
+ INSN_3(JMP, JSGT, K), \
+ INSN_3(JMP, JSLT, K), \
+ INSN_3(JMP, JSGE, K), \
+ INSN_3(JMP, JSLE, K), \
+ INSN_3(JMP, JSET, K), \
+ INSN_2(JMP, JA), \
+ /* Store instructions. */ \
+ /* Register based. */ \
+ INSN_3(STX, MEM, B), \
+ INSN_3(STX, MEM, H), \
+ INSN_3(STX, MEM, W), \
+ INSN_3(STX, MEM, DW), \
+ INSN_3(STX, XADD, W), \
+ INSN_3(STX, XADD, DW), \
+ /* Immediate based. */ \
+ INSN_3(ST, MEM, B), \
+ INSN_3(ST, MEM, H), \
+ INSN_3(ST, MEM, W), \
+ INSN_3(ST, MEM, DW), \
+ /* Load instructions. */ \
+ /* Register based. */ \
+ INSN_3(LDX, MEM, B), \
+ INSN_3(LDX, MEM, H), \
+ INSN_3(LDX, MEM, W), \
+ INSN_3(LDX, MEM, DW), \
+ /* Immediate based. */ \
+ INSN_3(LD, IMM, DW), \
+ /* Misc (old cBPF carry-over). */ \
+ INSN_3(LD, ABS, B), \
+ INSN_3(LD, ABS, H), \
+ INSN_3(LD, ABS, W), \
+ INSN_3(LD, IND, B), \
+ INSN_3(LD, IND, H), \
+ INSN_3(LD, IND, W)
+
+bool bpf_opcode_in_insntable(u8 code)
+{
+#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
+#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
+ static const bool public_insntable[256] = {
+ [0 ... 255] = false,
+ /* Now overwrite non-defaults ... */
+ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+ };
+#undef BPF_INSN_3_TBL
+#undef BPF_INSN_2_TBL
+ return public_insntable[code];
+}
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -774,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
- u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
u64 tmp;
+#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
+#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
- /* 32 bit ALU operations */
- [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
- [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
- [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
- [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
- [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
- [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
- [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
- [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
- [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
- [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
- [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
- [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
- [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
- [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
- [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
- [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
- [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
- [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
- [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
- [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
- [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
- [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
- [BPF_ALU | BPF_NEG] = &&ALU_NEG,
- [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
- [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
- /* 64 bit ALU operations */
- [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
- [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
- [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
- [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
- [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
- [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
- [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
- [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
- [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
- [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
- [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
- [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
- [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
- [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
- [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
- [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
- [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
- [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
- [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
- [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
- [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
- [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
- [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
- [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
- [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
- /* Call instruction */
- [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
+ /* Non-UAPI available opcodes. */
+ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
- /* Jumps */
- [BPF_JMP | BPF_JA] = &&JMP_JA,
- [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
- [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
- [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
- [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
- [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
- [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
- [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
- [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
- [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
- [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
- [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
- [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
- [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
- [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
- [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
- [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
- [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
- [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
- [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
- [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
- [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
- [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
- /* Program return */
- [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
- /* Store instructions */
- [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
- [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
- [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
- [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
- [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
- [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
- [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
- [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
- [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
- [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
- /* Load instructions */
- [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
- [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
- [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
- [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
- [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
- [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
- [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
- [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
- [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
- [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
- [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
+#undef BPF_INSN_3_LBL
+#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;
void *ptr;
int off;
@@ -949,14 +999,10 @@ select_insn:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
div64_u64_rem(DST, SRC, &tmp);
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
CONT;
@@ -969,13 +1015,9 @@ select_insn:
DST = do_div(tmp, (u32) IMM);
CONT;
ALU64_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
DST = (u32) tmp;
@@ -1025,6 +1067,13 @@ select_insn:
BPF_R4, BPF_R5);
CONT;
+ JMP_CALL_ARGS:
+ BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
+ BPF_R3, BPF_R4,
+ BPF_R5,
+ insn + insn->off + 1);
+ CONT;
+
JMP_TAIL_CALL: {
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -1279,8 +1328,14 @@ load_byte:
goto load_byte;
default_label:
- /* If we ever reach this, we have a bug somewhere. */
- WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ /* If we ever reach this, we have a bug somewhere. Die hard here
+ * instead of just returning 0; we could be somewhere in a subprog,
+ * so execution could continue otherwise which we do /not/ want.
+ *
+ * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
+ */
+ pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+ BUG_ON(1);
return 0;
}
STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
@@ -1297,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
return ___bpf_prog_run(regs, insn, stack); \
}
+#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
+#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
+static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
+ const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_REG]; \
+\
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ BPF_R1 = r1; \
+ BPF_R2 = r2; \
+ BPF_R3 = r3; \
+ BPF_R4 = r4; \
+ BPF_R5 = r5; \
+ return ___bpf_prog_run(regs, insn, stack); \
+}
+
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
@@ -1308,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
+
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
static unsigned int (*interpreters[])(const void *ctx,
@@ -1316,10 +1392,43 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
+#undef PROG_NAME_LIST
+#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
+static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+#undef PROG_NAME_LIST
+
+void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
+{
+ stack_depth = max_t(u32, stack_depth, 1);
+ insn->off = (s16) insn->imm;
+ insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
+ __bpf_call_base_args;
+ insn->code = BPF_JMP | BPF_CALL_ARGS;
+}
+
+#else
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+ * is not working properly, so warn about it!
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+#endif
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
+ if (fp->kprobe_override)
+ return false;
+
if (!array->owner_prog_type) {
/* There's no owner yet where we could check for
* compatibility.
@@ -1364,9 +1473,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+ fp->bpf_func = __bpf_prog_ret0_warn;
+#endif
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1376,6 +1489,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
+#endif
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
@@ -1457,23 +1576,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
__u32 __user *prog_ids, u32 cnt)
{
struct bpf_prog **prog;
- u32 i = 0, id;
-
+ unsigned long err = 0;
+ u32 i = 0, *ids;
+ bool nospc;
+
+ /* users of this function are doing:
+ * cnt = bpf_prog_array_length();
+ * if (cnt > 0)
+ * bpf_prog_array_copy_to_user(..., cnt);
+ * so below kcalloc doesn't need extra cnt > 0 check, but
+ * bpf_prog_array_length() releases rcu lock and
+ * prog array could have been swapped with empty or larger array,
+ * so always copy 'cnt' prog_ids to the user.
+ * In a rare race the user will see zero prog_ids
+ */
+ ids = kcalloc(cnt, sizeof(u32), GFP_USER);
+ if (!ids)
+ return -ENOMEM;
rcu_read_lock();
prog = rcu_dereference(progs)->progs;
for (; *prog; prog++) {
- id = (*prog)->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
- rcu_read_unlock();
- return -EFAULT;
- }
+ if (*prog == &dummy_bpf_prog.prog)
+ continue;
+ ids[i] = (*prog)->aux->id;
if (++i == cnt) {
prog++;
break;
}
}
+ nospc = !!(*prog);
rcu_read_unlock();
- if (*prog)
+ err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
+ kfree(ids);
+ if (err)
+ return -EFAULT;
+ if (nospc)
return -ENOSPC;
return 0;
}
@@ -1545,14 +1682,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
return 0;
}
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+ __u32 __user *prog_ids, u32 request_cnt,
+ __u32 __user *prog_cnt)
+{
+ u32 cnt = 0;
+
+ if (array)
+ cnt = bpf_prog_array_length(array);
+
+ if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+
+ /* return early if user requested only program count or nothing to copy */
+ if (!request_cnt || !cnt)
+ return 0;
+
+ return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
+ int i;
aux = container_of(work, struct bpf_prog_aux, work);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
- bpf_jit_free(aux->prog);
+ for (i = 0; i < aux->func_cnt; i++)
+ bpf_jit_free(aux->func[i]);
+ if (aux->func_cnt) {
+ kfree(aux->func);
+ bpf_prog_unlock_free(aux->prog);
+ } else {
+ bpf_jit_free(aux->prog);
+ }
}
/* Free internal BPF program */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
if (!cmap)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- cmap->map.map_type = attr->map_type;
- cmap->map.key_size = attr->key_size;
- cmap->map.value_size = attr->value_size;
- cmap->map.max_entries = attr->max_entries;
- cmap->map.map_flags = attr->map_flags;
- cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&cmap->map, attr);
/* Pre-limit array size based on NR_CPUS, not final CPU check */
if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
return ERR_PTR(err);
}
-void __cpu_map_queue_destructor(void *ptr)
+static void __cpu_map_queue_destructor(void *ptr)
{
/* The tear-down procedure should have made sure that queue is
* empty. See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
return xdp_pkt;
}
-struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_pkt *xdp_pkt)
+static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
{
unsigned int frame_size;
void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
+ int map_id)
{
gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
return NULL;
}
-void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct rcu_head *rcu)
{
struct bpf_cpu_map_entry *rcpu;
int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
* cpu_map_kthread_stop, which waits for an RCU graze period before
* stopping kthread, emptying the queue.
*/
-void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
- u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
struct bpf_cpu_map_entry *old_rcpu;
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static int cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
-void cpu_map_free(struct bpf_map *map)
+static void cpu_map_free(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- dtab->map.map_type = attr->map_type;
- dtab->map.key_size = attr->key_size;
- dtab->map.value_size = attr->value_size;
- dtab->map.max_entries = attr->max_entries;
- dtab->map.map_flags = attr->map_flags;
- dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&dtab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index e682850c9715..8740406df2cd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -21,10 +21,39 @@ static const char * const func_id_str[] = {
};
#undef __BPF_FUNC_STR_FN
-const char *func_id_name(int id)
+static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ char *buff, size_t len)
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+ if (insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
+ func_id_str[insn->imm])
+ return func_id_str[insn->imm];
+
+ if (cbs && cbs->cb_call)
+ return cbs->cb_call(cbs->private_data, insn);
+
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ snprintf(buff, len, "%+d", insn->imm);
+
+ return buff;
+}
+
+static const char *__func_imm_name(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ u64 full_imm, char *buff, size_t len)
+{
+ if (cbs && cbs->cb_imm)
+ return cbs->cb_imm(cbs->private_data, insn, full_imm);
+
+ snprintf(buff, len, "0x%llx", (unsigned long long)full_imm);
+ return buff;
+}
+
+const char *func_id_name(int id)
+{
if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
return func_id_str[id];
else
@@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+static void print_bpf_end_insn(bpf_insn_print_t verbose,
struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
@@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose,
insn->imm, insn->dst_reg);
}
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
- const struct bpf_insn *insn, bool allow_ptr_leaks)
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ bool allow_ptr_leaks)
{
+ const bpf_insn_print_t verbose = cbs->cb_print;
u8 class = BPF_CLASS(insn->code);
if (class == BPF_ALU || class == BPF_ALU64) {
@@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
*/
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ char tmp[64];
if (map_ptr && !allow_ptr_leaks)
imm = 0;
- verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
+ verbose(env, "(%02x) r%d = %s\n",
+ insn->code, insn->dst_reg,
+ __func_imm_name(cbs, insn, imm,
+ tmp, sizeof(tmp)));
} else {
verbose(env, "BUG_ld_%02x\n", insn->code);
return;
@@ -189,8 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
- verbose(env, "(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
+ char tmp[64];
+
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
+ verbose(env, "(%02x) call pc%s\n",
+ insn->code,
+ __func_get_name(cbs, insn,
+ tmp, sizeof(tmp)));
+ } else {
+ strcpy(tmp, "unknown");
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ __func_get_name(cbs, insn,
+ tmp, sizeof(tmp)),
+ insn->imm);
+ }
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(env, "(%02x) goto pc%+d\n",
insn->code, insn->off);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 8de977e420b6..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -17,16 +17,35 @@
#include <linux/bpf.h>
#include <linux/kernel.h>
#include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+struct bpf_verifier_env;
extern const char *const bpf_alu_string[16];
extern const char *const bpf_class_string[8];
const char *func_id_name(int id);
-struct bpf_verifier_env;
-typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
- const char *, ...);
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
- const struct bpf_insn *insn, bool allow_ptr_leaks);
+typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+ const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+ const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+ const struct bpf_insn *insn,
+ __u64 full_imm);
+
+struct bpf_insn_cbs {
+ bpf_insn_print_t cb_print;
+ bpf_insn_revmap_call_t cb_call;
+ bpf_insn_print_imm_t cb_imm;
+ void *private_data;
+};
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ bool allow_ptr_leaks);
#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
htab->map.key_size);
free_percpu(pptr);
+ cond_resched();
}
free_elems:
bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
pptr);
+ cond_resched();
}
skip_percpu_elems:
@@ -225,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
}
/* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -239,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_htab *htab;
- int err, i;
- u64 cost;
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -252,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
*/
- return ERR_PTR(-EPERM);
+ return -EPERM;
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (!lru && percpu_lru)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (lru && !prealloc)
- return ERR_PTR(-ENOTSUPP);
+ return -ENOTSUPP;
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ if (attr->max_entries == 0 || attr->key_size == 0 ||
+ attr->value_size == 0)
+ return -EINVAL;
+
+ if (attr->key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ return -E2BIG;
+
+ if (attr->value_size >= KMALLOC_MAX_SIZE -
+ MAX_BPF_STACK - sizeof(struct htab_elem))
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements via bpf syscall. This check also makes
+ * sure that the elem_size doesn't overflow and it's
+ * kmalloc-able later in htab_map_update_elem()
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ struct bpf_htab *htab;
+ int err, i;
+ u64 cost;
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- htab->map.map_type = attr->map_type;
- htab->map.key_size = attr->key_size;
- htab->map.value_size = attr->value_size;
- htab->map.max_entries = attr->max_entries;
- htab->map.map_flags = attr->map_flags;
- htab->map.numa_node = numa_node;
-
- /* check sanity of attributes.
- * value_size == 0 may be allowed in the future to use map as a set
- */
- err = -EINVAL;
- if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
- htab->map.value_size == 0)
- goto free_htab;
+ bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
/* ensure each CPU's lru list has >=1 elements.
@@ -302,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- err = -E2BIG;
- if (htab->map.key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- goto free_htab;
-
- if (htab->map.value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct htab_elem))
- /* if value_size is bigger, the user space won't be able to
- * access the elements via bpf syscall. This check also makes
- * sure that the elem_size doesn't overflow and it's
- * kmalloc-able later in htab_map_update_elem()
- */
- goto free_htab;
-
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)
@@ -325,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
+ err = -E2BIG;
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1141,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1151,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
};
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1234,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
}
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1243,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -1251,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
if (attr->value_size != sizeof(u32))
- return ERR_PTR(-EINVAL);
- return htab_map_alloc(attr);
+ return -EINVAL;
+ return htab_map_alloc_check(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1326,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
if (IS_ERR(inner_map_meta))
return inner_map_meta;
- map = fd_htab_map_alloc(attr);
+ map = htab_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
@@ -1370,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
}
const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc_check = fd_htab_map_alloc_check,
.map_alloc = htab_of_map_alloc,
.map_free = htab_of_map_free,
.map_get_next_key = htab_map_get_next_key,
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..81e2f6995adb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
-static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
- umode_t mode, const struct inode_operations *iops)
+static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
+ const struct inode_operations *iops)
{
- struct inode *inode;
-
- inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = iops;
- inode->i_private = dentry->d_fsdata;
+ inode->i_private = raw;
bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
-static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t devt)
+static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
{
- enum bpf_type type = MINOR(devt);
-
- if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
- dentry->d_fsdata == NULL)
- return -EPERM;
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+}
- switch (type) {
- case BPF_TYPE_PROG:
- return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
- case BPF_TYPE_MAP:
- return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
- default:
- return -EPERM;
- }
+static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
+{
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
}
static struct dentry *
@@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry,
static const struct inode_operations bpf_dir_iops = {
.lookup = bpf_lookup,
- .mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.symlink = bpf_symlink,
.rmdir = simple_rmdir,
@@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
struct inode *dir;
struct path path;
umode_t mode;
- dev_t devt;
int ret;
dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
@@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
return PTR_ERR(dentry);
mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
- devt = MKDEV(UNNAMED_MAJOR, type);
- ret = security_path_mknod(&path, dentry, mode, devt);
+ ret = security_path_mknod(&path, dentry, mode, 0);
if (ret)
goto out;
@@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
goto out;
}
- dentry->d_fsdata = raw;
- ret = vfs_mknod(dir, dentry, mode, devt);
- dentry->d_fsdata = NULL;
+ switch (type) {
+ case BPF_TYPE_PROG:
+ ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
+ break;
+ case BPF_TYPE_MAP:
+ ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
+ break;
+ default:
+ ret = -EPERM;
+ }
out:
done_path_create(&path, dentry);
return ret;
@@ -368,7 +362,45 @@ out:
putname(pname);
return ret;
}
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!bpf_prog_get_ok(prog, &type, false))
+ return ERR_PTR(-EINVAL);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..7b469d10d0e9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
- trie->map.map_type = attr->map_type;
- trie->map.key_size = attr->key_size;
- trie->map.value_size = attr->value_size;
- trie->map.max_entries = attr->max_entries;
- trie->map.map_flags = attr->map_flags;
- trie->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&trie->map, attr);
trie->data_size = attr->key_size -
offsetof(struct bpf_lpm_trie_key, data);
trie->max_prefixlen = trie->data_size * 8;
@@ -596,9 +591,96 @@ unlock:
raw_spin_unlock(&trie->lock);
}
-static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
- return -ENOTSUPP;
+ struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+ struct lpm_trie_node **node_stack = NULL;
+ int err = 0, stack_ptr = -1;
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /* The get_next_key follows postorder. For the 4 node example in
+ * the top of this file, the trie_get_next_key() returns the following
+ * one after another:
+ * 192.168.0.0/24
+ * 192.168.1.0/24
+ * 192.168.128.0/24
+ * 192.168.0.0/16
+ *
+ * The idea is to return more specific keys before less specific ones.
+ */
+
+ /* Empty trie */
+ search_root = rcu_dereference(trie->root);
+ if (!search_root)
+ return -ENOENT;
+
+ /* For invalid key, find the leftmost node in the trie */
+ if (!key || key->prefixlen > trie->max_prefixlen)
+ goto find_leftmost;
+
+ node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!node_stack)
+ return -ENOMEM;
+
+ /* Try to find the exact node for the given key */
+ for (node = search_root; node;) {
+ node_stack[++stack_ptr] = node;
+ matchlen = longest_prefix_match(trie, node, key);
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM))
+ goto find_leftmost;
+
+ /* The node with the exactly-matching key has been found,
+ * find the first node in postorder after the matched node.
+ */
+ node = node_stack[stack_ptr];
+ while (stack_ptr > 0) {
+ parent = node_stack[stack_ptr - 1];
+ if (rcu_dereference(parent->child[0]) == node) {
+ search_root = rcu_dereference(parent->child[1]);
+ if (search_root)
+ goto find_leftmost;
+ }
+ if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+ next_node = parent;
+ goto do_copy;
+ }
+
+ node = parent;
+ stack_ptr--;
+ }
+
+ /* did not find anything */
+ err = -ENOENT;
+ goto free_stack;
+
+find_leftmost:
+ /* Find the leftmost non-intermediate node, all intermediate nodes
+ * have exact two children, so this function will never return NULL.
+ */
+ for (node = search_root; node;) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ next_node = node;
+ node = rcu_dereference(node->child[0]);
+ }
+do_copy:
+ next_key->prefixlen = next_node->prefixlen;
+ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+ next_node->data, trie->data_size);
+free_stack:
+ kfree(node_stack);
+ return err;
}
const struct bpf_map_ops trie_map_ops = {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8455b89d1bbf..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,18 +16,35 @@
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/bug.h>
+#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
+#include <linux/proc_ns.h>
#include <linux/rtnetlink.h>
+#include <linux/rwsem.h>
-/* protected by RTNL */
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
+ * RTNL lock cannot be taken when holding this lock.
+ */
+static DECLARE_RWSEM(bpf_devs_lock);
static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
+
+static int bpf_dev_offload_check(struct net_device *netdev)
+{
+ if (!netdev)
+ return -EINVAL;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+ return 0;
+}
int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
{
- struct net *net = current->nsproxy->net_ns;
- struct bpf_dev_offload *offload;
+ struct bpf_prog_offload *offload;
+ int err;
if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -41,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
return -ENOMEM;
offload->prog = prog;
- init_waitqueue_head(&offload->verifier_done);
- rtnl_lock();
- offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
- if (!offload->netdev) {
- rtnl_unlock();
- kfree(offload);
- return -EINVAL;
- }
+ offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
+ attr->prog_ifindex);
+ err = bpf_dev_offload_check(offload->netdev);
+ if (err)
+ goto err_maybe_put;
+ down_write(&bpf_devs_lock);
+ if (offload->netdev->reg_state != NETREG_REGISTERED) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
- rtnl_unlock();
+ dev_put(offload->netdev);
+ up_write(&bpf_devs_lock);
return 0;
+err_unlock:
+ up_write(&bpf_devs_lock);
+err_maybe_put:
+ if (offload->netdev)
+ dev_put(offload->netdev);
+ kfree(offload);
+ return err;
}
static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
struct netdev_bpf *data)
{
- struct net_device *netdev = prog->aux->offload->netdev;
+ struct bpf_prog_offload *offload = prog->aux->offload;
+ struct net_device *netdev;
ASSERT_RTNL();
- if (!netdev)
+ if (!offload)
return -ENODEV;
- if (!netdev->netdev_ops->ndo_bpf)
- return -EOPNOTSUPP;
+ netdev = offload->netdev;
data->command = cmd;
@@ -87,62 +114,63 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
if (err)
goto exit_unlock;
- env->dev_ops = data.verifier.ops;
-
+ env->prog->aux->offload->dev_ops = data.verifier.ops;
env->prog->aux->offload->dev_state = true;
- env->prog->aux->offload->verifier_running = true;
exit_unlock:
rtnl_unlock();
return err;
}
+int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload)
+ ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ struct bpf_prog_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
- /* Caution - if netdev is destroyed before the program, this function
- * will be called twice.
- */
-
data.offload.prog = prog;
- if (offload->verifier_running)
- wait_event(offload->verifier_done, !offload->verifier_running);
-
if (offload->dev_state)
WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
- offload->dev_state = false;
+ /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+ bpf_prog_free_id(prog, true);
+
list_del_init(&offload->offloads);
- offload->netdev = NULL;
+ kfree(offload);
+ prog->aux->offload = NULL;
}
void bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
-
- offload->verifier_running = false;
- wake_up(&offload->verifier_done);
-
rtnl_lock();
- __bpf_prog_offload_destroy(prog);
+ down_write(&bpf_devs_lock);
+ if (prog->aux->offload)
+ __bpf_prog_offload_destroy(prog);
+ up_write(&bpf_devs_lock);
rtnl_unlock();
-
- kfree(offload);
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
int ret;
data.offload.prog = prog;
- offload->verifier_running = false;
- wake_up(&offload->verifier_done);
-
rtnl_lock();
ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
rtnl_unlock();
@@ -164,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
return bpf_prog_offload_translate(prog);
}
+struct ns_get_path_bpf_prog_args {
+ struct bpf_prog *prog;
+ struct bpf_prog_info *info;
+};
+
+static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
+{
+ struct ns_get_path_bpf_prog_args *args = private_data;
+ struct bpf_prog_aux *aux = args->prog->aux;
+ struct ns_common *ns;
+ struct net *net;
+
+ rtnl_lock();
+ down_read(&bpf_devs_lock);
+
+ if (aux->offload) {
+ args->info->ifindex = aux->offload->netdev->ifindex;
+ net = dev_net(aux->offload->netdev);
+ get_net(net);
+ ns = &net->ns;
+ } else {
+ args->info->ifindex = 0;
+ ns = NULL;
+ }
+
+ up_read(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return ns;
+}
+
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+ struct bpf_prog *prog)
+{
+ struct ns_get_path_bpf_prog_args args = {
+ .prog = prog,
+ .info = info,
+ };
+ struct bpf_prog_aux *aux = prog->aux;
+ struct inode *ns_inode;
+ struct path ns_path;
+ char __user *uinsns;
+ void *res;
+ u32 ulen;
+
+ res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
+ if (IS_ERR(res)) {
+ if (!info->ifindex)
+ return -ENODEV;
+ return PTR_ERR(res);
+ }
+
+ down_read(&bpf_devs_lock);
+
+ if (!aux->offload) {
+ up_read(&bpf_devs_lock);
+ return -ENODEV;
+ }
+
+ ulen = info->jited_prog_len;
+ info->jited_prog_len = aux->offload->jited_len;
+ if (info->jited_prog_len & ulen) {
+ uinsns = u64_to_user_ptr(info->jited_prog_insns);
+ ulen = min_t(u32, info->jited_prog_len, ulen);
+ if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
+ up_read(&bpf_devs_lock);
+ return -EFAULT;
+ }
+ }
+
+ up_read(&bpf_devs_lock);
+
+ ns_inode = ns_path.dentry->d_inode;
+ info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ info->netns_ino = ns_inode->i_ino;
+ path_put(&ns_path);
+
+ return 0;
+}
+
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_offloaded_map *offmap;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_type != BPF_MAP_TYPE_HASH)
+ return ERR_PTR(-EINVAL);
+
+ offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ if (!offmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&offmap->map, attr);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+ err = bpf_dev_offload_check(offmap->netdev);
+ if (err)
+ goto err_unlock;
+
+ err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return &offmap->map;
+
+err_unlock:
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+ kfree(offmap);
+ return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map, true);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (offmap->netdev)
+ __bpf_map_offload_destroy(offmap);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+ flags);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_delete_elem(offmap, key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+struct ns_get_path_bpf_map_args {
+ struct bpf_offloaded_map *offmap;
+ struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+ struct ns_get_path_bpf_map_args *args = private_data;
+ struct ns_common *ns;
+ struct net *net;
+
+ rtnl_lock();
+ down_read(&bpf_devs_lock);
+
+ if (args->offmap->netdev) {
+ args->info->ifindex = args->offmap->netdev->ifindex;
+ net = dev_net(args->offmap->netdev);
+ get_net(net);
+ ns = &net->ns;
+ } else {
+ args->info->ifindex = 0;
+ ns = NULL;
+ }
+
+ up_read(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct ns_get_path_bpf_map_args args = {
+ .offmap = map_to_offmap(map),
+ .info = info,
+ };
+ struct inode *ns_inode;
+ struct path ns_path;
+ void *res;
+
+ res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+ if (IS_ERR(res)) {
+ if (!info->ifindex)
+ return -ENODEV;
+ return PTR_ERR(res);
+ }
+
+ ns_inode = ns_path.dentry->d_inode;
+ info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ info->netns_ino = ns_inode->i_ino;
+ path_put(&ns_path);
+
+ return 0;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap;
+ struct bpf_prog_offload *offload;
+ bool ret;
+
+ if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ offmap = map_to_offmap(map);
+
+ ret = offload && offload->netdev == offmap->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+ struct bpf_prog_offload *offload, *tmp;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+ struct bpf_offloaded_map *offmap, *tmp;
+
+ list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+ if (offmap->netdev == netdev)
+ __bpf_map_offload_destroy(offmap);
+}
+
static int bpf_offload_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
- struct bpf_dev_offload *offload, *tmp;
ASSERT_RTNL();
@@ -181,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier,
if (netdev->reg_state != NETREG_UNREGISTERING)
break;
- list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
- offloads) {
- if (offload->netdev == netdev)
- __bpf_prog_offload_destroy(offload->prog);
- }
+ down_write(&bpf_devs_lock);
+ bpf_offload_orphan_all_progs(netdev);
+ bpf_offload_orphan_all_maps(netdev);
+ up_write(&bpf_devs_lock);
break;
default:
break;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..48c33417d13c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@ struct smap_psock {
struct work_struct tx_work;
struct work_struct gc_work;
+ struct proto *sk_proto;
+ void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
- void (*save_state_change)(struct sock *sk);
};
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,78 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
return rcu_dereference_sk_user_data(sk);
}
-/* compute the linear packet data range [data, data_end) for skb when
- * sk_skb type programs are in use.
- */
-static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+static struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (unlikely(psock->sk_proto)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+
+ psock->save_close = sk->sk_prot->close;
+ psock->sk_proto = sk->sk_prot;
+ sk->sk_prot = &tcp_bpf_proto;
+ rcu_read_unlock();
+ return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+
+ if (likely(psock)) {
+ sk->sk_prot = psock->sk_proto;
+ psock->sk_proto = NULL;
+ }
+ rcu_read_unlock();
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
{
- TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+ void (*close_fun)(struct sock *sk, long timeout);
+ struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock *psock;
+ struct sock *osk;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ /* The psock may be destroyed anytime after exiting the RCU critial
+ * section so by the time we use close_fun the psock may no longer
+ * be valid. However, bpf_tcp_close is called with the sock lock
+ * held so the close hook and sk are still valid.
+ */
+ close_fun = psock->save_close;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ rcu_read_unlock();
+ close_fun(sk, timeout);
}
enum __sk_action {
@@ -110,6 +177,22 @@ enum __sk_action {
__SK_REDIRECT,
};
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+ .name = "bpf_tcp",
+ .uid = TCP_ULP_BPF,
+ .user_visible = false,
+ .owner = NULL,
+ .init = bpf_tcp_init,
+ .release = bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+ tcp_bpf_proto = tcp_prot;
+ tcp_bpf_proto.close = bpf_tcp_close;
+ return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -174,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
sk->sk_error_report(sk);
}
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
- struct smap_psock_map_entry *e, *tmp;
- struct smap_psock *psock;
- struct socket_wq *wq;
- struct sock *osk;
-
- rcu_read_lock();
-
- /* Allowing transitions into an established syn_recv states allows
- * for early binding sockets to a smap object before the connection
- * is established.
- */
- switch (sk->sk_state) {
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- case TCP_ESTABLISHED:
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- case TCP_LAST_ACK:
- case TCP_FIN_WAIT1:
- case TCP_FIN_WAIT2:
- case TCP_LISTEN:
- break;
- case TCP_CLOSE:
- /* Only release if the map entry is in fact the sock in
- * question. There is a case where the operator deletes
- * the sock from the map, but the TCP sock is closed before
- * the psock is detached. Use cmpxchg to verify correct
- * sock is removed.
- */
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- break;
- write_lock_bh(&sk->sk_callback_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- osk = cmpxchg(e->entry, sk, NULL);
- if (osk == sk) {
- list_del(&e->list);
- smap_release_sock(psock, sk);
- }
- }
- write_unlock_bh(&sk->sk_callback_lock);
- break;
- default:
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- break;
- smap_report_sk_error(psock, EPIPE);
- break;
- }
-
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
- wake_up_interruptible_all(&wq->wait);
- rcu_read_unlock();
-}
-
static void smap_read_sock_strparser(struct strparser *strp,
struct sk_buff *skb)
{
@@ -330,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
return;
sk->sk_data_ready = psock->save_data_ready;
sk->sk_write_space = psock->save_write_space;
- sk->sk_state_change = psock->save_state_change;
psock->save_data_ready = NULL;
psock->save_write_space = NULL;
- psock->save_state_change = NULL;
strp_stop(&psock->strp);
psock->strp_enabled = false;
}
@@ -358,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
if (psock->refcnt)
return;
+ tcp_cleanup_ulp(sock);
smap_stop_sock(psock, sock);
clear_bit(SMAP_TX_RUNNING, &psock->state);
rcu_assign_sk_user_data(sock, NULL);
@@ -435,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
return;
psock->save_data_ready = sk->sk_data_ready;
psock->save_write_space = sk->sk_write_space;
- psock->save_state_change = sk->sk_state_change;
sk->sk_data_ready = smap_data_ready;
sk->sk_write_space = smap_write_space;
- sk->sk_state_change = smap_state_change;
psock->strp_enabled = true;
}
@@ -517,17 +535,15 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (attr->value_size > KMALLOC_MAX_SIZE)
return ERR_PTR(-E2BIG);
+ err = bpf_tcp_ulp_register();
+ if (err && err != -EEXIST)
+ return ERR_PTR(err);
+
stab = kzalloc(sizeof(*stab), GFP_USER);
if (!stab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- stab->map.map_type = attr->map_type;
- stab->map.key_size = attr->key_size;
- stab->map.value_size = attr->value_size;
- stab->map.max_entries = attr->max_entries;
- stab->map.map_flags = attr->map_flags;
- stab->map.numa_node = bpf_map_attr_numa_node(attr);
+ bpf_map_init_from_attr(&stab->map, attr);
/* make sure page count doesn't overflow */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -591,17 +607,19 @@ static void sock_map_free(struct bpf_map *map)
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
- smap_list_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ }
write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
- if (stab->bpf_verdict)
- bpf_prog_put(stab->bpf_verdict);
- if (stab->bpf_parse)
- bpf_prog_put(stab->bpf_parse);
-
sock_map_remove_complete(stab);
}
@@ -761,6 +779,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto out_progs;
}
+ err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+ if (err)
+ goto out_progs;
+
set_bit(SMAP_TX_RUNNING, &psock->state);
}
@@ -873,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map,
return err;
}
+static void sock_map_release(struct bpf_map *map, struct file *map_file)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *orig;
+
+ orig = xchg(&stab->bpf_parse, NULL);
+ if (orig)
+ bpf_prog_put(orig);
+ orig = xchg(&stab->bpf_verdict, NULL);
+ if (orig)
+ bpf_prog_put(orig);
+}
+
const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
@@ -880,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_get_next_key = sock_map_get_next_key,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
+ .map_release = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc636cc98..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
goto free_smap;
- smap->map.map_type = attr->map_type;
- smap->map.key_size = attr->key_size;
+ bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
- smap->map.max_entries = attr->max_entries;
- smap->map.map_flags = attr->map_flags;
smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- smap->map.numa_node = bpf_map_attr_numa_node(attr);
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
@@ -226,9 +222,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
return 0;
}
-static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int stack_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
{
- return -EINVAL;
+ struct bpf_stack_map *smap = container_of(map,
+ struct bpf_stack_map, map);
+ u32 id;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!key) {
+ id = 0;
+ } else {
+ id = *(u32 *)key;
+ if (id >= smap->n_buckets || !smap->buckets[id])
+ id = 0;
+ else
+ id++;
+ }
+
+ while (id < smap->n_buckets && !smap->buckets[id])
+ id++;
+
+ if (id >= smap->n_buckets)
+ return -ENOENT;
+
+ *(u32 *)next_key = id;
+ return 0;
}
static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..e24aa3241387 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
return 0;
}
+const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_alloc = bpf_map_offload_map_alloc,
+ .map_free = bpf_map_offload_map_free,
+};
+
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
struct bpf_map *map;
+ int err;
- if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
- !bpf_map_types[attr->map_type])
+ if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+ return ERR_PTR(-EINVAL);
+ ops = bpf_map_types[attr->map_type];
+ if (!ops)
return ERR_PTR(-EINVAL);
- map = bpf_map_types[attr->map_type]->map_alloc(attr);
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return ERR_PTR(err);
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
- map->ops = bpf_map_types[attr->map_type];
+ map->ops = ops;
map->map_type = attr->map_type;
return map;
}
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
kvfree(area);
}
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
+{
+ map->map_type = attr->map_type;
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ map->map_flags = attr->map_flags;
+ map->numa_node = bpf_map_attr_numa_node(attr);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
unsigned long flags;
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
if (do_idr_lock)
spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
+ map->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
goto free_value;
/* Need to create a kthread, thus must support schedule */
- if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+ goto out;
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
}
@@ -669,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
- } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
- map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
+ } else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
@@ -731,6 +768,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ goto out;
+ }
+
preempt_disable();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock();
@@ -738,7 +780,7 @@ static int map_delete_elem(union bpf_attr *attr)
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
@@ -788,9 +830,15 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_get_next_key(map, key, next_key);
+ goto out;
+ }
+
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
+out:
if (err)
goto free_next_key;
@@ -905,9 +953,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
- /* cBPF to eBPF migrations are currently not in the idr store. */
+ /* cBPF to eBPF migrations are currently not in the idr store.
+ * Offloaded programs are removed from the store when their device
+ * disappears - even if someone grabs an fd to them they are unusable,
+ * simply waiting for refcnt to drop to be freed.
+ */
if (!prog->aux->id)
return;
@@ -917,6 +969,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
__acquire(&prog_idr_lock);
idr_remove(&prog_idr, prog->aux->id);
+ prog->aux->id = 0;
if (do_idr_lock)
spin_unlock_bh(&prog_idr_lock);
@@ -937,10 +990,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ int i;
+
trace_bpf_prog_put_rcu(prog);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
+
+ for (i = 0; i < prog->aux->func_cnt; i++)
+ bpf_prog_kallsyms_del(prog->aux->func[i]);
bpf_prog_kallsyms_del(prog);
+
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}
@@ -1057,7 +1116,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static bool bpf_prog_get_ok(struct bpf_prog *prog,
+bool bpf_prog_get_ok(struct bpf_prog *prog,
enum bpf_prog_type *attach_type, bool attach_drv)
{
/* not an attachment, just a refcount inc, always allow */
@@ -1151,6 +1210,8 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
+ prog->aux->offload_requested = !!attr->prog_ifindex;
+
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
@@ -1172,7 +1233,7 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
- if (attr->prog_ifindex) {
+ if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
@@ -1194,7 +1255,8 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_used_maps;
/* eBPF program is ready to be JITed */
- prog = bpf_prog_select_runtime(prog, &err);
+ if (!prog->bpf_func)
+ prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
@@ -1439,6 +1501,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -1551,6 +1615,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
+static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
+ unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++)
+ if (prog->aux->used_maps[i] == (void *)addr)
+ return prog->aux->used_maps[i];
+ return NULL;
+}
+
+static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
+{
+ const struct bpf_map *map;
+ struct bpf_insn *insns;
+ u64 imm;
+ int i;
+
+ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
+ GFP_USER);
+ if (!insns)
+ return insns;
+
+ for (i = 0; i < prog->len; i++) {
+ if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
+ insns[i].code = BPF_JMP | BPF_CALL;
+ insns[i].imm = BPF_FUNC_tail_call;
+ /* fall-through */
+ }
+ if (insns[i].code == (BPF_JMP | BPF_CALL) ||
+ insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
+ if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
+ insns[i].code = BPF_JMP | BPF_CALL;
+ if (!bpf_dump_raw_ok())
+ insns[i].imm = 0;
+ continue;
+ }
+
+ if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+
+ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
+ map = bpf_map_from_imm(prog, imm);
+ if (map) {
+ insns[i].src_reg = BPF_PSEUDO_MAP_FD;
+ insns[i].imm = map->id;
+ insns[i + 1].imm = 0;
+ continue;
+ }
+
+ if (!bpf_dump_raw_ok() &&
+ imm == (unsigned long)prog->aux) {
+ insns[i].imm = 0;
+ insns[i + 1].imm = 0;
+ continue;
+ }
+ }
+
+ return insns;
+}
+
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1598,24 +1723,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
goto done;
}
- ulen = info.jited_prog_len;
- info.jited_prog_len = prog->jited_len;
- if (info.jited_prog_len && ulen) {
- uinsns = u64_to_user_ptr(info.jited_prog_insns);
- ulen = min_t(u32, info.jited_prog_len, ulen);
- if (copy_to_user(uinsns, prog->bpf_func, ulen))
- return -EFAULT;
- }
-
ulen = info.xlated_prog_len;
info.xlated_prog_len = bpf_prog_insn_size(prog);
if (info.xlated_prog_len && ulen) {
+ struct bpf_insn *insns_sanitized;
+ bool fault;
+
+ if (prog->blinded && !bpf_dump_raw_ok()) {
+ info.xlated_prog_insns = 0;
+ goto done;
+ }
+ insns_sanitized = bpf_insn_prepare_dump(prog);
+ if (!insns_sanitized)
+ return -ENOMEM;
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
- if (copy_to_user(uinsns, prog->insnsi, ulen))
+ fault = copy_to_user(uinsns, insns_sanitized, ulen);
+ kfree(insns_sanitized);
+ if (fault)
return -EFAULT;
}
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ err = bpf_prog_offload_info_fill(&info, prog);
+ if (err)
+ return err;
+ goto done;
+ }
+
+ /* NOTE: the following code is supposed to be skipped for offload.
+ * bpf_prog_offload_info_fill() is the place to fill similar fields
+ * for offload.
+ */
+ ulen = info.jited_prog_len;
+ info.jited_prog_len = prog->jited_len;
+ if (info.jited_prog_len && ulen) {
+ if (bpf_dump_raw_ok()) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ } else {
+ info.jited_prog_insns = 0;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1646,6 +1798,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.map_flags = map->map_flags;
memcpy(info.name, map->name, sizeof(map->name));
+ if (bpf_map_is_dev_bound(map)) {
+ err = bpf_map_offload_info_fill(&info, map);
+ if (err)
+ return err;
+ }
+
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20,6 +20,8 @@
#include <linux/file.h>
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include "disasm.h"
@@ -167,11 +169,11 @@ struct bpf_call_arg_meta {
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
- * verbose() is used to dump the verification trace to the log, so the user
- * can figure out what's wrong with the program
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
*/
-static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
- const char *fmt, ...)
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
struct bpf_verifer_log *log = &env->log;
unsigned int n;
@@ -195,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
else
log->ubuf = NULL;
}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+/* Historically bpf_verifier_log_write was called verbose, but the name was too
+ * generic for symbol export. The function was renamed, but not the calls in
+ * the verifier to avoid complicating backports. Hence the alias below.
+ */
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+ __attribute__((alias("bpf_verifier_log_write")));
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
@@ -216,23 +226,48 @@ static const char * const reg_type_str[] = {
[PTR_TO_PACKET_END] = "pkt_end",
};
+static void print_liveness(struct bpf_verifier_env *env,
+ enum bpf_reg_liveness live)
+{
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
+ verbose(env, "_");
+ if (live & REG_LIVE_READ)
+ verbose(env, "r");
+ if (live & REG_LIVE_WRITTEN)
+ verbose(env, "w");
+}
+
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
static void print_verifier_state(struct bpf_verifier_env *env,
- struct bpf_verifier_state *state)
+ const struct bpf_func_state *state)
{
- struct bpf_reg_state *reg;
+ const struct bpf_reg_state *reg;
enum bpf_reg_type t;
int i;
+ if (state->frameno)
+ verbose(env, " frame%d:", state->frameno);
for (i = 0; i < MAX_BPF_REG; i++) {
reg = &state->regs[i];
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(env, " R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d", i);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
+ if (t == PTR_TO_STACK)
+ verbose(env, ",call_%d", func(env, reg)->callsite);
} else {
verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
@@ -277,16 +312,21 @@ static void print_verifier_state(struct bpf_verifier_env *env,
}
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] == STACK_SPILL)
- verbose(env, " fp%d=%s",
- -MAX_BPF_STACK + i * BPF_REG_SIZE,
+ if (state->stack[i].slot_type[0] == STACK_SPILL) {
+ verbose(env, " fp%d",
+ (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, state->stack[i].spilled_ptr.live);
+ verbose(env, "=%s",
reg_type_str[state->stack[i].spilled_ptr.type]);
+ }
+ if (state->stack[i].slot_type[0] == STACK_ZERO)
+ verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
}
verbose(env, "\n");
}
-static int copy_stack_state(struct bpf_verifier_state *dst,
- const struct bpf_verifier_state *src)
+static int copy_stack_state(struct bpf_func_state *dst,
+ const struct bpf_func_state *src)
{
if (!src->stack)
return 0;
@@ -302,13 +342,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst,
/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
* make it consume minimal amount of memory. check_stack_write() access from
- * the program calls into realloc_verifier_state() to grow the stack size.
+ * the program calls into realloc_func_state() to grow the stack size.
* Note there is a non-zero 'parent' pointer inside bpf_verifier_state
* which this function copies over. It points to previous bpf_verifier_state
* which is never reallocated
*/
-static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
- bool copy_old)
+static int realloc_func_state(struct bpf_func_state *state, int size,
+ bool copy_old)
{
u32 old_size = state->allocated_stack;
struct bpf_stack_state *new_stack;
@@ -341,10 +381,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
return 0;
}
+static void free_func_state(struct bpf_func_state *state)
+{
+ if (!state)
+ return;
+ kfree(state->stack);
+ kfree(state);
+}
+
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
- kfree(state->stack);
+ int i;
+
+ for (i = 0; i <= state->curframe; i++) {
+ free_func_state(state->frame[i]);
+ state->frame[i] = NULL;
+ }
if (free_self)
kfree(state);
}
@@ -352,18 +405,46 @@ static void free_verifier_state(struct bpf_verifier_state *state,
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
-static int copy_verifier_state(struct bpf_verifier_state *dst,
- const struct bpf_verifier_state *src)
+static int copy_func_state(struct bpf_func_state *dst,
+ const struct bpf_func_state *src)
{
int err;
- err = realloc_verifier_state(dst, src->allocated_stack, false);
+ err = realloc_func_state(dst, src->allocated_stack, false);
if (err)
return err;
- memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
return copy_stack_state(dst, src);
}
+static int copy_verifier_state(struct bpf_verifier_state *dst_state,
+ const struct bpf_verifier_state *src)
+{
+ struct bpf_func_state *dst;
+ int i, err;
+
+ /* if dst has more stack frames then src frame, free them */
+ for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
+ free_func_state(dst_state->frame[i]);
+ dst_state->frame[i] = NULL;
+ }
+ dst_state->curframe = src->curframe;
+ dst_state->parent = src->parent;
+ for (i = 0; i <= src->curframe; i++) {
+ dst = dst_state->frame[i];
+ if (!dst) {
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+ dst_state->frame[i] = dst;
+ }
+ err = copy_func_state(dst, src->frame[i]);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
int *insn_idx)
{
@@ -416,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
}
return &elem->st;
err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
/* pop all elements and return */
while (!pop_stack(env, NULL, NULL));
return NULL;
@@ -425,6 +508,10 @@ err:
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
+#define CALLEE_SAVED_REGS 5
+static const int callee_saved[CALLEE_SAVED_REGS] = {
+ BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
+};
static void __mark_reg_not_init(struct bpf_reg_state *reg);
@@ -449,6 +536,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
+static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
+ reg->off = 0;
+ reg->type = SCALAR_VALUE;
+}
+
static void mark_reg_known_zero(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
@@ -560,6 +654,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
reg->id = 0;
reg->off = 0;
reg->var_off = tnum_unknown;
+ reg->frameno = 0;
__mark_reg_unbounded(reg);
}
@@ -568,8 +663,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs */
- for (regno = 0; regno < MAX_BPF_REG; regno++)
+ /* Something bad happened, let's kill all regs except FP */
+ for (regno = 0; regno < BPF_REG_FP; regno++)
__mark_reg_not_init(regs + regno);
return;
}
@@ -587,8 +682,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
- /* Something bad happened, let's kill all regs */
- for (regno = 0; regno < MAX_BPF_REG; regno++)
+ /* Something bad happened, let's kill all regs except FP */
+ for (regno = 0; regno < BPF_REG_FP; regno++)
__mark_reg_not_init(regs + regno);
return;
}
@@ -596,8 +691,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
}
static void init_reg_state(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs)
+ struct bpf_func_state *state)
{
+ struct bpf_reg_state *regs = state->regs;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
@@ -608,41 +704,218 @@ static void init_reg_state(struct bpf_verifier_env *env,
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
mark_reg_known_zero(env, regs, BPF_REG_FP);
+ regs[BPF_REG_FP].frameno = state->frameno;
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
}
+#define BPF_MAIN_FUNC (-1)
+static void init_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
+ int callsite, int frameno, int subprogno)
+{
+ state->callsite = callsite;
+ state->frameno = frameno;
+ state->subprogno = subprogno;
+ init_reg_state(env, state);
+}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+static int cmp_subprogs(const void *a, const void *b)
{
- struct bpf_verifier_state *parent = state->parent;
+ return *(int *)a - *(int *)b;
+}
+
+static int find_subprog(struct bpf_verifier_env *env, int off)
+{
+ u32 *p;
+
+ p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
+ sizeof(env->subprog_starts[0]), cmp_subprogs);
+ if (!p)
+ return -ENOENT;
+ return p - env->subprog_starts;
+
+}
+
+static int add_subprog(struct bpf_verifier_env *env, int off)
+{
+ int insn_cnt = env->prog->len;
+ int ret;
+
+ if (off >= insn_cnt || off < 0) {
+ verbose(env, "call to invalid destination\n");
+ return -EINVAL;
+ }
+ ret = find_subprog(env, off);
+ if (ret >= 0)
+ return 0;
+ if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+ verbose(env, "too many subprograms\n");
+ return -E2BIG;
+ }
+ env->subprog_starts[env->subprog_cnt++] = off;
+ sort(env->subprog_starts, env->subprog_cnt,
+ sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
+ /* determine subprog starts. The end is one before the next starts */
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn[i].code != (BPF_JMP | BPF_CALL))
+ continue;
+ if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ continue;
+ if (!env->allow_ptr_leaks) {
+ verbose(env, "function calls to other bpf functions are allowed for root only\n");
+ return -EPERM;
+ }
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ verbose(env, "function calls in offloaded programs are not supported yet\n");
+ return -EINVAL;
+ }
+ ret = add_subprog(env, i + insn[i].imm + 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (env->log.level > 1)
+ for (i = 0; i < env->subprog_cnt; i++)
+ verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+
+ /* now check that all jumps are within the same subprog */
+ subprog_start = 0;
+ if (env->subprog_cnt == cur_subprog)
+ subprog_end = insn_cnt;
+ else
+ subprog_end = env->subprog_starts[cur_subprog++];
+ for (i = 0; i < insn_cnt; i++) {
+ u8 code = insn[i].code;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ goto next;
+ if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+ goto next;
+ off = i + insn[i].off + 1;
+ if (off < subprog_start || off >= subprog_end) {
+ verbose(env, "jump out of range from insn %d to %d\n", i, off);
+ return -EINVAL;
+ }
+next:
+ if (i == subprog_end - 1) {
+ /* to avoid fall-through from one subprog into another
+ * the last insn of the subprog should be either exit
+ * or unconditional jump back
+ */
+ if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP | BPF_JA)) {
+ verbose(env, "last insn is not an exit or jmp\n");
+ return -EINVAL;
+ }
+ subprog_start = subprog_end;
+ if (env->subprog_cnt == cur_subprog)
+ subprog_end = insn_cnt;
+ else
+ subprog_end = env->subprog_starts[cur_subprog++];
+ }
+ }
+ return 0;
+}
+
+static
+struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent,
+ u32 regno)
+{
+ struct bpf_verifier_state *tmp = NULL;
+
+ /* 'parent' could be a state of caller and
+ * 'state' could be a state of callee. In such case
+ * parent->curframe < state->curframe
+ * and it's ok for r1 - r5 registers
+ *
+ * 'parent' could be a callee's state after it bpf_exit-ed.
+ * In such case parent->curframe > state->curframe
+ * and it's ok for r0 only
+ */
+ if (parent->curframe == state->curframe ||
+ (parent->curframe < state->curframe &&
+ regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
+ (parent->curframe > state->curframe &&
+ regno == BPF_REG_0))
+ return parent;
+
+ if (parent->curframe > state->curframe &&
+ regno >= BPF_REG_6) {
+ /* for callee saved regs we have to skip the whole chain
+ * of states that belong to callee and mark as LIVE_READ
+ * the registers before the call
+ */
+ tmp = parent;
+ while (tmp && tmp->curframe != state->curframe) {
+ tmp = tmp->parent;
+ }
+ if (!tmp)
+ goto bug;
+ parent = tmp;
+ } else {
+ goto bug;
+ }
+ return parent;
+bug:
+ verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
+ verbose(env, "regno %d parent frame %d current frame %d\n",
+ regno, parent->curframe, state->curframe);
+ return NULL;
+}
+
+static int mark_reg_read(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent,
+ u32 regno)
+{
+ bool writes = parent == state->parent; /* Observe write marks */
if (regno == BPF_REG_FP)
/* We don't need to worry about FP liveness because it's read-only */
- return;
+ return 0;
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (state->regs[regno].live & REG_LIVE_WRITTEN)
+ if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
break;
+ parent = skip_callee(env, state, parent, regno);
+ if (!parent)
+ return -EFAULT;
/* ... then we depend on parent's value */
- parent->regs[regno].live |= REG_LIVE_READ;
+ parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
+ writes = true;
}
+ return 0;
}
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
- struct bpf_reg_state *regs = env->cur_state->regs;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs;
if (regno >= MAX_BPF_REG) {
verbose(env, "R%d is invalid\n", regno);
@@ -655,7 +928,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- mark_reg_read(env->cur_state, regno);
+ return mark_reg_read(env, vstate, vstate->parent, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
@@ -686,17 +959,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
}
}
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
static int check_stack_write(struct bpf_verifier_env *env,
- struct bpf_verifier_state *state, int off,
- int size, int value_regno)
+ struct bpf_func_state *state, /* func where register points to */
+ int off, int size, int value_regno)
{
+ struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+ enum bpf_reg_type type;
- err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
- true);
+ err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
if (err)
return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -709,8 +990,9 @@ static int check_stack_write(struct bpf_verifier_env *env,
return -EACCES;
}
+ cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0 &&
- is_spillable_regtype(state->regs[value_regno].type)) {
+ is_spillable_regtype((type = cur->regs[value_regno].type))) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -718,51 +1000,116 @@ static int check_stack_write(struct bpf_verifier_env *env,
return -EACCES;
}
+ if (state != cur && type == PTR_TO_STACK) {
+ verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+ return -EINVAL;
+ }
+
/* save register state */
- state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr = cur->regs[value_regno];
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack[spi].slot_type[i] = STACK_SPILL;
} else {
+ u8 type = STACK_MISC;
+
/* regular write of data into stack */
state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
+ /* only mark the slot as written if all 8 bytes were written
+ * otherwise read propagation may incorrectly stop too soon
+ * when stack slots are partially written.
+ * This heuristic means that read propagation will be
+ * conservative, since it will add reg_live_read marks
+ * to stack slots all the way to first state when programs
+ * writes+reads less than 8 bytes
+ */
+ if (size == BPF_REG_SIZE)
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ /* when we zero initialize stack slots mark them as such */
+ if (value_regno >= 0 &&
+ register_is_null(&cur->regs[value_regno]))
+ type = STACK_ZERO;
+
for (i = 0; i < size; i++)
state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
- STACK_MISC;
+ type;
}
return 0;
}
-static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+/* registers of every function are unique and mark_reg_read() propagates
+ * the liveness in the following cases:
+ * - from callee into caller for R1 - R5 that were used as arguments
+ * - from caller into callee for R0 that used as result of the call
+ * - from caller to the same caller skipping states of the callee for R6 - R9,
+ * since R6 - R9 are callee saved by implicit function prologue and
+ * caller's R6 != callee's R6, so when we propagate liveness up to
+ * parent states we need to skip callee states for R6 - R9.
+ *
+ * stack slot marking is different, since stacks of caller and callee are
+ * accessible in both (since caller can pass a pointer to caller's stack to
+ * callee which can pass it to another function), hence mark_stack_slot_read()
+ * has to propagate the stack liveness to all parent states at given frame number.
+ * Consider code:
+ * f1() {
+ * ptr = fp - 8;
+ * *ptr = ctx;
+ * call f2 {
+ * .. = *ptr;
+ * }
+ * .. = *ptr;
+ * }
+ * First *ptr is reading from f1's stack and mark_stack_slot_read() has
+ * to mark liveness at the f1's frame and not f2's frame.
+ * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
+ * to propagate liveness to f2 states at f1's frame level and further into
+ * f1 states at f1's frame level until write into that stack slot
+ */
+static void mark_stack_slot_read(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *state,
+ struct bpf_verifier_state *parent,
+ int slot, int frameno)
{
- struct bpf_verifier_state *parent = state->parent;
+ bool writes = parent == state->parent; /* Observe write marks */
while (parent) {
+ if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
+ /* since LIVE_WRITTEN mark is only done for full 8-byte
+ * write the read marks are conservative and parent
+ * state may not even have the stack allocated. In such case
+ * end the propagation, since the loop reached beginning
+ * of the function
+ */
+ break;
/* if read wasn't screened by an earlier write ... */
- if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
+ if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
break;
/* ... then we depend on parent's value */
- parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
+ parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
+ writes = true;
}
}
static int check_stack_read(struct bpf_verifier_env *env,
- struct bpf_verifier_state *state, int off, int size,
- int value_regno)
+ struct bpf_func_state *reg_state /* func where register points to */,
+ int off, int size, int value_regno)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
u8 *stype;
- if (state->allocated_stack <= slot) {
+ if (reg_state->allocated_stack <= slot) {
verbose(env, "invalid read from stack off %d+0 size %d\n",
off, size);
return -EACCES;
}
- stype = state->stack[spi].slot_type;
+ stype = reg_state->stack[spi].slot_type;
if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
@@ -778,21 +1125,44 @@ static int check_stack_read(struct bpf_verifier_env *env,
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = state->stack[spi].spilled_ptr;
- mark_stack_slot_read(state, spi);
+ state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
+ /* mark reg as written since spilled pointer state likely
+ * has its liveness marks cleared by is_state_visited()
+ * which resets stack/reg liveness for state transitions
+ */
+ state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
+ mark_stack_slot_read(env, vstate, vstate->parent, spi,
+ reg_state->frameno);
return 0;
} else {
+ int zeros = 0;
+
for (i = 0; i < size; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
- verbose(env, "invalid read from stack off %d+%d size %d\n",
- off, i, size);
- return -EACCES;
+ if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+ if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
+ zeros++;
+ continue;
}
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ mark_stack_slot_read(env, vstate, vstate->parent, spi,
+ reg_state->frameno);
+ if (value_regno >= 0) {
+ if (zeros == size) {
+ /* any size read into register is zero extended,
+ * so the whole register == const_zero
+ */
+ __mark_reg_const_zero(&state->regs[value_regno]);
+ } else {
+ /* have read misc data from the stack */
+ mark_reg_unknown(env, state->regs, value_regno);
+ }
+ state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- if (value_regno >= 0)
- /* have read misc data from the stack */
- mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
@@ -817,7 +1187,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
int off, int size, bool zero_size_allowed)
{
- struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
int err;
@@ -978,6 +1349,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+ return reg->type == PTR_TO_CTX;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -1059,6 +1437,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_STACK:
pointer_desc = "stack ";
+ /* The stack spill tracking logic in check_stack_write()
+ * and check_stack_read() relies on stack accesses being
+ * aligned.
+ */
+ strict = true;
break;
default:
break;
@@ -1067,6 +1450,126 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+static int update_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_func_state *func,
+ int off)
+{
+ u16 stack = env->subprog_stack_depth[func->subprogno];
+
+ if (stack >= -off)
+ return 0;
+
+ /* update known max for given subprogram */
+ env->subprog_stack_depth[func->subprogno] = -off;
+ return 0;
+}
+
+/* starting from main bpf function walk all instructions of the function
+ * and recursively walk all callees that given function can call.
+ * Ignore jump and exit insns.
+ * Since recursion is prevented by check_cfg() this algorithm
+ * only needs a local stack of MAX_CALL_FRAMES to remember callsites
+ */
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret_insn[MAX_CALL_FRAMES];
+ int ret_prog[MAX_CALL_FRAMES];
+
+process_func:
+ /* round up to 32-bytes, since this is granularity
+ * of interpreter stack size
+ */
+ depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
+continue_func:
+ if (env->subprog_cnt == subprog)
+ subprog_end = insn_cnt;
+ else
+ subprog_end = env->subprog_starts[subprog];
+ for (; i < subprog_end; i++) {
+ if (insn[i].code != (BPF_JMP | BPF_CALL))
+ continue;
+ if (insn[i].src_reg != BPF_PSEUDO_CALL)
+ continue;
+ /* remember insn and function to return to */
+ ret_insn[frame] = i + 1;
+ ret_prog[frame] = subprog;
+
+ /* find the callee */
+ i = i + insn[i].imm + 1;
+ subprog = find_subprog(env, i);
+ if (subprog < 0) {
+ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+ i);
+ return -EFAULT;
+ }
+ subprog++;
+ frame++;
+ if (frame >= MAX_CALL_FRAMES) {
+ WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
+ return -EFAULT;
+ }
+ goto process_func;
+ }
+ /* end of for() loop means the last insn of the 'subprog'
+ * was reached. Doesn't matter whether it was JA or EXIT
+ */
+ if (frame == 0)
+ return 0;
+ depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ frame--;
+ i = ret_insn[frame];
+ subprog = ret_prog[frame];
+ goto continue_func;
+}
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int idx)
+{
+ int start = idx + insn->imm + 1, subprog;
+
+ subprog = find_subprog(env, start);
+ if (subprog < 0) {
+ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+ start);
+ return -EFAULT;
+ }
+ subprog++;
+ return env->subprog_stack_depth[subprog];
+}
+#endif
+
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+ u64 mask;
+
+ /* clear high bits in bit representation */
+ reg->var_off = tnum_cast(reg->var_off, size);
+
+ /* fix arithmetic bounds */
+ mask = ((u64)1 << (size * 8)) - 1;
+ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+ reg->umin_value &= mask;
+ reg->umax_value &= mask;
+ } else {
+ reg->umin_value = 0;
+ reg->umax_value = mask;
+ }
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -1077,9 +1580,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
+ struct bpf_func_state *state;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -1168,8 +1671,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- if (env->prog->aux->stack_depth < -off)
- env->prog->aux->stack_depth = -off;
+ state = func(env, reg);
+ err = update_stack_depth(env, state, off);
+ if (err)
+ return err;
if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
@@ -1200,9 +1705,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- regs[value_regno].var_off =
- tnum_cast(regs[value_regno].var_off, size);
- __update_reg_bounds(&regs[value_regno]);
+ coerce_reg_to_size(&regs[value_regno], size);
}
return err;
}
@@ -1232,6 +1735,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return -EACCES;
}
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1243,12 +1752,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
-/* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state reg)
-{
- return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
-}
-
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized.
@@ -1259,31 +1762,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = env->cur_state;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *reg = cur_regs(env) + regno;
+ struct bpf_func_state *state = func(env, reg);
int off, i, slot, spi;
- if (regs[regno].type != PTR_TO_STACK) {
+ if (reg->type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
- register_is_null(regs[regno]))
+ register_is_null(reg))
return 0;
verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[regs[regno].type],
+ reg_type_str[reg->type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
}
/* Only allow fixed-offset stack reads */
- if (!tnum_is_const(regs[regno].var_off)) {
+ if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];
- tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
+ return -EACCES;
}
- off = regs[regno].off + regs[regno].var_off.value;
+ off = reg->off + reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
@@ -1291,9 +1795,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- if (env->prog->aux->stack_depth < -off)
- env->prog->aux->stack_depth = -off;
-
if (meta && meta->raw_mode) {
meta->access_size = access_size;
meta->regno = regno;
@@ -1301,17 +1802,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
+ u8 *stype;
+
slot = -(off + i) - 1;
spi = slot / BPF_REG_SIZE;
- if (state->allocated_stack <= slot ||
- state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
- STACK_MISC) {
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
- off, i, access_size);
- return -EACCES;
+ if (state->allocated_stack <= slot)
+ goto err;
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ if (*stype == STACK_MISC)
+ goto mark;
+ if (*stype == STACK_ZERO) {
+ /* helper can write anything into the stack */
+ *stype = STACK_MISC;
+ goto mark;
}
+err:
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+mark:
+ /* reading any byte out of 8-byte 'spill_slot' will cause
+ * the whole slot to be marked as 'read'
+ */
+ mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
+ spi, state->frameno);
}
- return 0;
+ return update_stack_depth(env, state, off);
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -1334,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_MEM ||
+ type == ARG_PTR_TO_MEM_OR_NULL ||
+ type == ARG_PTR_TO_UNINIT_MEM;
+}
+
+static bool arg_type_is_mem_size(enum bpf_arg_type type)
+{
+ return type == ARG_CONST_SIZE ||
+ type == ARG_CONST_SIZE_OR_ZERO;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
@@ -1383,15 +1912,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_CTX;
if (type != expected_type)
goto err_type;
- } else if (arg_type == ARG_PTR_TO_MEM ||
- arg_type == ARG_PTR_TO_MEM_OR_NULL ||
- arg_type == ARG_PTR_TO_UNINIT_MEM) {
+ } else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (register_is_null(*reg) &&
+ if (register_is_null(reg) &&
arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
@@ -1446,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
false, NULL);
- } else if (arg_type == ARG_CONST_SIZE ||
- arg_type == ARG_CONST_SIZE_OR_ZERO) {
+ } else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
- /* bpf_xxx(..., buf, len) call will access 'len' bytes
- * from stack pointer 'buf'. Check it
- * note: regno == len, regno - 1 == buf
- */
- if (regno == 0) {
- /* kernel subsystem misconfigured verifier */
- verbose(env,
- "ARG_CONST_SIZE cannot be first argument\n");
- return -EACCES;
- }
-
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
*/
-
if (!tnum_is_const(reg->var_off))
/* For unprivileged variable accesses, disable raw
* mode so that the program is required to
@@ -1564,6 +2078,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
+ if (env->subprog_cnt) {
+ verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
@@ -1604,7 +2122,7 @@ error:
return -EINVAL;
}
-static int check_raw_mode(const struct bpf_func_proto *fn)
+static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
int count = 0;
@@ -1619,15 +2137,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- return count > 1 ? -EINVAL : 0;
+ /* We only support one arg being in raw mode at the moment,
+ * which is sufficient for the helper functions we have
+ * right now.
+ */
+ return count <= 1;
+}
+
+static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
+ enum bpf_arg_type arg_next)
+{
+ return (arg_type_is_mem_ptr(arg_curr) &&
+ !arg_type_is_mem_size(arg_next)) ||
+ (!arg_type_is_mem_ptr(arg_curr) &&
+ arg_type_is_mem_size(arg_next));
+}
+
+static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
+{
+ /* bpf_xxx(..., buf, len) call will access 'len'
+ * bytes from memory 'buf'. Both arg types need
+ * to be paired, so make sure there's no buggy
+ * helper function specification.
+ */
+ if (arg_type_is_mem_size(fn->arg1_type) ||
+ arg_type_is_mem_ptr(fn->arg5_type) ||
+ check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
+ check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
+ check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
+ check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ return false;
+
+ return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn)
+{
+ return check_raw_mode_ok(fn) &&
+ check_arg_pair_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
*/
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
+static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
+ struct bpf_func_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
@@ -1644,7 +2199,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
}
}
-static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ int i;
+
+ for (i = 0; i <= vstate->curframe; i++)
+ __clear_all_pkt_pointers(env, vstate->frame[i]);
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller, *callee;
+ int i, subprog, target_insn;
+
+ if (state->curframe + 1 >= MAX_CALL_FRAMES) {
+ verbose(env, "the call stack of %d frames is too deep\n",
+ state->curframe + 2);
+ return -E2BIG;
+ }
+
+ target_insn = *insn_idx + insn->imm;
+ subprog = find_subprog(env, target_insn + 1);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n",
+ target_insn + 1);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ if (state->frame[state->curframe + 1]) {
+ verbose(env, "verifier bug. Frame %d already allocated\n",
+ state->curframe + 1);
+ return -EFAULT;
+ }
+
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ *insn_idx /* callsite */,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog + 1 /* subprog number within this prog */);
+
+ /* copy r1 - r5 args that callee can access */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+
+ /* after the call regsiters r0 - r5 were scratched */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, caller->regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ }
+
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
+
+ /* and go analyze first insn of the callee */
+ *insn_idx = target_insn;
+
+ if (env->log.level) {
+ verbose(env, "caller:\n");
+ print_verifier_state(env, caller);
+ verbose(env, "callee:\n");
+ print_verifier_state(env, callee);
+ }
+ return 0;
+}
+
+static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller, *callee;
+ struct bpf_reg_state *r0;
+
+ callee = state->frame[state->curframe];
+ r0 = &callee->regs[BPF_REG_0];
+ if (r0->type == PTR_TO_STACK) {
+ /* technically it's ok to return caller's stack pointer
+ * (or caller's caller's pointer) back to the caller,
+ * since these pointers are valid. Only current stack
+ * pointer will be invalid as soon as function exits,
+ * but let's be conservative
+ */
+ verbose(env, "cannot return stack pointer to the caller\n");
+ return -EINVAL;
+ }
+
+ state->curframe--;
+ caller = state->frame[state->curframe];
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+
+ *insn_idx = callee->callsite + 1;
+ if (env->log.level) {
+ verbose(env, "returning from callee:\n");
+ print_verifier_state(env, callee);
+ verbose(env, "to caller at %d:\n", *insn_idx);
+ print_verifier_state(env, caller);
+ }
+ /* clear everything in the callee */
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return 0;
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
struct bpf_reg_state *regs;
@@ -1661,7 +2330,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
if (env->ops->get_func_proto)
fn = env->ops->get_func_proto(func_id);
-
if (!fn) {
verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
func_id);
@@ -1674,15 +2342,18 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return -EINVAL;
}
+ /* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
+ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+ verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- /* We only support one arg being in raw mode at the moment, which
- * is sufficient for the helper functions we have right now.
- */
- err = check_raw_mode(fn);
+ err = check_func_proto(fn);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
@@ -1696,6 +2367,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose(env, "verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -1766,14 +2444,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
- /* clear high 32 bits */
- reg->var_off = tnum_cast(reg->var_off, 4);
- /* Update bounds */
- __update_reg_bounds(reg);
-}
-
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -1794,6 +2464,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
return res > a;
}
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "math between %s pointer and %lld is not allowed\n",
+ reg_type_str[type], val);
+ return false;
+ }
+
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %d is not allowed\n",
+ reg_type_str[type], reg->off);
+ return false;
+ }
+
+ if (smin == S64_MIN) {
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+ reg_type_str[type]);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "value %lld makes %s pointer be out of bounds\n",
+ smin, reg_type_str[type]);
+ return false;
+ }
+
+ return true;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -1804,7 +2509,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs, *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1815,44 +2522,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
- if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad sbounds\n");
- return -EINVAL;
- }
- if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad ubounds\n");
- return -EINVAL;
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
- if (!env->allow_ptr_leaks)
- verbose(env,
- "R%d 32-bit pointer arithmetic prohibited\n",
- dst);
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
return -EACCES;
}
@@ -1862,6 +2561,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
+ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ return -EINVAL;
+
switch (opcode) {
case BPF_ADD:
/* We can take a fixed offset as long as it doesn't overflow
@@ -1915,9 +2618,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_SUB:
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d tried to subtract pointer from scalar\n",
- dst);
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+ dst);
return -EACCES;
}
/* We don't allow subtraction from FP, because (according to
@@ -1925,9 +2627,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* be able to deal with it.
*/
if (ptr_reg->type == PTR_TO_STACK) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d subtraction from stack pointer prohibited\n",
- dst);
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
+ dst);
return -EACCES;
}
if (known && (ptr_reg->off - smin_val ==
@@ -1976,28 +2677,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_AND:
case BPF_OR:
case BPF_XOR:
- /* bitwise ops on pointers are troublesome, prohibit for now.
- * (However, in principle we could allow some cases, e.g.
- * ptr &= ~3 which would reduce min_value by 3.)
- */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ /* bitwise ops on pointers are troublesome, prohibit. */
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
+ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ return -EINVAL;
+
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
@@ -2008,12 +2711,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
- /* 32-bit ALU ops are (32,32)->64 */
- coerce_reg_to_32(dst_reg);
- coerce_reg_to_32(&src_reg);
- }
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
@@ -2021,6 +2720,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
+ if (!src_known &&
+ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
switch (opcode) {
case BPF_ADD:
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2149,9 +2863,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_LSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
mark_reg_unknown(env, regs, insn->dst_reg);
break;
@@ -2177,27 +2891,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_RSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
- /* BPF_RSH is an unsigned shift, so make the appropriate casts */
- if (dst_reg->smin_value < 0) {
- if (umin_val) {
- /* Sign bit will be cleared */
- dst_reg->smin_value = 0;
- } else {
- /* Lost sign bit information */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
- } else {
- dst_reg->smin_value =
- (u64)(dst_reg->smin_value) >> umax_val;
- }
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
+ * be negative, then either:
+ * 1) src_reg might be zero, so the sign bit of the result is
+ * unknown, so we lose our signed bounds
+ * 2) it's known negative, thus the unsigned bounds capture the
+ * signed bounds
+ * 3) the signed bounds cross zero, so they tell us nothing
+ * about the result
+ * If the value in dst_reg is known nonnegative, then again the
+ * unsigned bounts capture the signed bounds.
+ * Thus, in all cases it suffices to blow away our signed bounds
+ * and rely on inferring new ones from the unsigned bounds and
+ * var_off of the result.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
if (src_known)
dst_reg->var_off = tnum_rshift(dst_reg->var_off,
umin_val);
@@ -2213,6 +2929,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
break;
}
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->32 */
+ coerce_reg_to_size(dst_reg, 4);
+ coerce_reg_to_size(&src_reg, 4);
+ }
+
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
@@ -2224,10 +2946,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
- int rc;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
@@ -2238,43 +2961,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
- * an arbitrary scalar.
+ * an arbitrary scalar. Disallow all math except
+ * pointer subtraction
*/
- if (!env->allow_ptr_leaks) {
- verbose(env, "R%d pointer %s pointer prohibited\n",
- insn->dst_reg,
- bpf_alu_string[opcode >> 4]);
- return -EACCES;
+ if (opcode == BPF_SUB){
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ return 0;
}
- mark_reg_unknown(env, regs, insn->dst_reg);
- return 0;
+ verbose(env, "R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
} else {
/* scalar += pointer
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
- rc = adjust_ptr_min_max_vals(env, insn,
- src_reg, dst_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* scalar += unknown scalar */
- __mark_reg_unknown(&off_reg);
- return adjust_scalar_min_max_vals(
- env, insn,
- dst_reg, off_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
- rc = adjust_ptr_min_max_vals(env, insn,
- dst_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += scalar */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, *src_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -2283,27 +2992,19 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
off_reg.type = SCALAR_VALUE;
__mark_reg_known(&off_reg, insn->imm);
src_reg = &off_reg;
- if (ptr_reg) { /* pointer += K */
- rc = adjust_ptr_min_max_vals(env, insn,
- ptr_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += K */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, off_reg);
- }
- return rc;
- }
+ if (ptr_reg) /* pointer += K */
+ return adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
}
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, env->cur_state);
+ print_verifier_state(env, state);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, env->cur_state);
+ print_verifier_state(env, state);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -2390,17 +3091,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
mark_reg_unknown(env, regs, insn->dst_reg);
- /* high 32 bits are known zero. */
- regs[insn->dst_reg].var_off = tnum_cast(
- regs[insn->dst_reg].var_off, 4);
- __update_reg_bounds(&regs[insn->dst_reg]);
+ coerce_reg_to_size(&regs[insn->dst_reg], 4);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
regs[insn->dst_reg].type = SCALAR_VALUE;
- __mark_reg_known(regs + insn->dst_reg, insn->imm);
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ __mark_reg_known(regs + insn->dst_reg,
+ insn->imm);
+ } else {
+ __mark_reg_known(regs + insn->dst_reg,
+ (u32)insn->imm);
+ }
}
} else if (opcode > BPF_END) {
@@ -2436,6 +3140,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -2457,14 +3166,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static void find_good_pkt_pointers(struct bpf_verifier_state *state,
+static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *reg;
u16 new_range;
- int i;
+ int i, j;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -2534,12 +3244,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
/* keep the maximum range already checked */
regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- reg = &state->stack[i].spilled_ptr;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
+ for (j = 0; j <= vstate->curframe; j++) {
+ state = vstate->frame[j];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
+ continue;
+ reg = &state->stack[i].spilled_ptr;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
+ }
}
}
@@ -2777,20 +3490,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
-static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs;
u32 id = regs[regno].id;
- int i;
+ int i, j;
for (i = 0; i < MAX_BPF_REG; i++)
mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+ for (j = 0; j <= vstate->curframe; j++) {
+ state = vstate->frame[j];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
+ continue;
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+ }
}
}
@@ -2890,8 +3607,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
- struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
+ struct bpf_verifier_state *this_branch = env->cur_state;
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
+ struct bpf_reg_state *dst_reg, *other_branch_regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2934,8 +3653,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
dst_reg->type == SCALAR_VALUE &&
- tnum_equals_const(dst_reg->var_off, insn->imm)) {
- if (opcode == BPF_JEQ) {
+ tnum_is_const(dst_reg->var_off)) {
+ if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
+ (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
*/
@@ -2953,6 +3673,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
if (!other_branch)
return -EFAULT;
+ other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
@@ -2965,22 +3686,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE &&
regs[insn->src_reg].type == SCALAR_VALUE) {
if (tnum_is_const(regs[insn->src_reg].var_off))
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ reg_set_min_max(&other_branch_regs[insn->dst_reg],
dst_reg, regs[insn->src_reg].var_off.value,
opcode);
else if (tnum_is_const(dst_reg->var_off))
- reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
&regs[insn->src_reg],
dst_reg->var_off.value, opcode);
else if (opcode == BPF_JEQ || opcode == BPF_JNE)
/* Comparing for equality, we can combine knowledge */
- reg_combine_min_max(&other_branch->regs[insn->src_reg],
- &other_branch->regs[insn->dst_reg],
+ reg_combine_min_max(&other_branch_regs[insn->src_reg],
+ &other_branch_regs[insn->dst_reg],
&regs[insn->src_reg],
&regs[insn->dst_reg], opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
- reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ reg_set_min_max(&other_branch_regs[insn->dst_reg],
dst_reg, insn->imm, opcode);
}
@@ -3001,7 +3722,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level)
- print_verifier_state(env, this_branch);
+ print_verifier_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -3086,6 +3807,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (env->subprog_cnt) {
+ /* when program has LD_ABS insn JITs and interpreter assume
+ * that r1 == ctx == skb which is not the case for callees
+ * that can have arbitrary arguments. It's problematic
+ * for main prog as well since JITs would need to analyze
+ * all functions in order to make proper register save/restore
+ * decisions in the main prog. Hence disallow LD_ABS with calls
+ */
+ verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
+
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
@@ -3262,6 +3995,10 @@ static int check_cfg(struct bpf_verifier_env *env)
int ret = 0;
int i, t;
+ ret = check_subprogs(env);
+ if (ret < 0)
+ return ret;
+
insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
@@ -3294,6 +4031,14 @@ peek_stack:
goto err_free;
if (t + 1 < insn_cnt)
env->explored_states[t + 1] = STATE_LIST_MARK;
+ if (insns[t].src_reg == BPF_PSEUDO_CALL) {
+ env->explored_states[t] = STATE_LIST_MARK;
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
@@ -3412,11 +4157,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
struct idpair *idmap)
{
+ bool equal;
+
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
+ equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+
+ if (rold->type == PTR_TO_STACK)
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
+ */
+ return equal && rold->frameno == rcur->frameno;
+
+ if (equal)
return true;
if (rold->type == NOT_INIT)
@@ -3431,15 +4186,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
} else {
- /* if we knew anything about the old value, we're not
- * equal, because we can't know anything about the
- * scalar value of the pointer in the new value.
+ /* We're trying to use a pointer in place of a scalar.
+ * Even if the scalar was unbounded, this could lead to
+ * pointer leaks because scalars are allowed to leak
+ * while pointers are not. We could make this safe in
+ * special cases if root is calling us, but it's
+ * probably not worth the hassle.
*/
- return rold->umin_value == 0 &&
- rold->umax_value == U64_MAX &&
- rold->smin_value == S64_MIN &&
- rold->smax_value == S64_MAX &&
- tnum_is_unknown(rold->var_off);
+ return false;
}
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
@@ -3489,7 +4243,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_CTX:
case CONST_PTR_TO_MAP:
- case PTR_TO_STACK:
case PTR_TO_PACKET_END:
/* Only valid matches are exact, which memcmp() above
* would have accepted
@@ -3504,8 +4257,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
-static bool stacksafe(struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur,
+static bool stacksafe(struct bpf_func_state *old,
+ struct bpf_func_state *cur,
struct idpair *idmap)
{
int i, spi;
@@ -3523,8 +4276,19 @@ static bool stacksafe(struct bpf_verifier_state *old,
for (i = 0; i < old->allocated_stack; i++) {
spi = i / BPF_REG_SIZE;
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+ /* explored state didn't use this */
+ continue;
+
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ /* if old state was safe with misc data in the stack
+ * it will be safe with zero-initialized stack.
+ * The opposite is not true
+ */
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
+ continue;
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
/* Ex: old explored (safe) state has STACK_SPILL in
@@ -3581,9 +4345,8 @@ static bool stacksafe(struct bpf_verifier_state *old,
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool states_equal(struct bpf_verifier_env *env,
- struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
+static bool func_states_equal(struct bpf_func_state *old,
+ struct bpf_func_state *cur)
{
struct idpair *idmap;
bool ret = false;
@@ -3607,71 +4370,72 @@ out_free:
return ret;
}
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ int i;
+
+ if (old->curframe != cur->curframe)
+ return false;
+
+ /* for states to be equal callsites have to be the same
+ * and all frame states need to be equivalent
+ */
+ for (i = 0; i <= old->curframe; i++) {
+ if (old->frame[i]->callsite != cur->frame[i]->callsite)
+ return false;
+ if (!func_states_equal(old->frame[i], cur->frame[i]))
+ return false;
+ }
+ return true;
+}
+
/* A write screens off any subsequent reads; but write marks come from the
- * straight-line code between a state and its parent. When we arrive at a
- * jump target (in the first iteration of the propagate_liveness() loop),
- * we didn't arrive by the straight-line code, so read marks in state must
- * propagate to parent regardless of state's write marks.
+ * straight-line code between a state and its parent. When we arrive at an
+ * equivalent state (jump target or such) we didn't arrive by the straight-line
+ * code, so read marks in the state must propagate to the parent regardless
+ * of the state's write marks. That's what 'parent == state->parent' comparison
+ * in mark_reg_read() and mark_stack_slot_read() is for.
*/
-static bool do_propagate_liveness(const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent)
+static int propagate_liveness(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *vstate,
+ struct bpf_verifier_state *vparent)
{
- bool writes = parent == state->parent; /* Observe write marks */
- bool touched = false; /* any changes made? */
- int i;
+ int i, frame, err = 0;
+ struct bpf_func_state *state, *parent;
- if (!parent)
- return touched;
+ if (vparent->curframe != vstate->curframe) {
+ WARN(1, "propagate_live: parent frame %d current frame %d\n",
+ vparent->curframe, vstate->curframe);
+ return -EFAULT;
+ }
/* Propagate read liveness of registers... */
BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
/* We don't need to worry about FP liveness because it's read-only */
for (i = 0; i < BPF_REG_FP; i++) {
- if (parent->regs[i].live & REG_LIVE_READ)
- continue;
- if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
+ if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
continue;
- if (state->regs[i].live & REG_LIVE_READ) {
- parent->regs[i].live |= REG_LIVE_READ;
- touched = true;
+ if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
+ err = mark_reg_read(env, vstate, vparent, i);
+ if (err)
+ return err;
}
}
+
/* ... and stack slots */
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- if (parent->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- if (state->stack[i].slot_type[0] != STACK_SPILL)
- continue;
- if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
- continue;
- if (writes &&
- (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
- continue;
- if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
- parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
- touched = true;
+ for (frame = 0; frame <= vstate->curframe; frame++) {
+ state = vstate->frame[frame];
+ parent = vparent->frame[frame];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
+ continue;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
+ mark_stack_slot_read(env, vstate, vparent, i, frame);
}
}
- return touched;
-}
-
-/* "parent" is "a state from which we reach the current state", but initially
- * it is not the state->parent (i.e. "the state whose straight-line code leads
- * to the current state"), instead it is the state that happened to arrive at
- * a (prunable) equivalent of the current state. See comment above
- * do_propagate_liveness() for consequences of this.
- * This function is just a more efficient way of calling mark_reg_read() or
- * mark_stack_slot_read() on each reg in "parent" that is read in "state",
- * though it requires that parent != state->parent in the call arguments.
- */
-static void propagate_liveness(const struct bpf_verifier_state *state,
- struct bpf_verifier_state *parent)
-{
- while (do_propagate_liveness(state, parent)) {
- /* Something changed, so we need to feed those changes onward */
- state = parent;
- parent = state->parent;
- }
+ return err;
}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
@@ -3679,7 +4443,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state;
- int i, err;
+ int i, j, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3700,7 +4464,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- propagate_liveness(&sl->state, cur);
+ err = propagate_liveness(env, &sl->state, cur);
+ if (err)
+ return err;
return 1;
}
sl = sl->next;
@@ -3708,9 +4474,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
/* there were no equivalent states, remember current one.
* technically the current state is not proven to be safe yet,
- * but it will either reach bpf_exit (which means it's safe) or
- * it will be rejected. Since there are no loops, we won't be
- * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ * but it will either reach outer most bpf_exit (which means it's safe)
+ * or it will be rejected. Since there are no loops, we won't be
+ * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
+ * again on the way to bpf_exit
*/
new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
@@ -3734,19 +4501,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* explored_states can get read marks.)
*/
for (i = 0; i < BPF_REG_FP; i++)
- cur->regs[i].live = REG_LIVE_NONE;
- for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
- if (cur->stack[i].slot_type[0] == STACK_SPILL)
- cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
- return 0;
-}
+ cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
-static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
- int insn_idx, int prev_insn_idx)
-{
- if (env->dev_ops && env->dev_ops->insn_hook)
- return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ /* all stack frames are accessible from callee, clear them all */
+ for (j = 0; j <= cur->curframe; j++) {
+ struct bpf_func_state *frame = cur->frame[j];
+ for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+ frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+ }
return 0;
}
@@ -3755,7 +4518,7 @@ static int do_check(struct bpf_verifier_env *env)
struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
- int insn_cnt = env->prog->len;
+ int insn_cnt = env->prog->len, i;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
@@ -3763,9 +4526,18 @@ static int do_check(struct bpf_verifier_env *env)
state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
if (!state)
return -ENOMEM;
- env->cur_state = state;
- init_reg_state(env, state->regs);
+ state->curframe = 0;
state->parent = NULL;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ 0 /* subprogno, zero == main subprog */);
insn_idx = 0;
for (;;) {
struct bpf_insn *insn;
@@ -3812,19 +4584,25 @@ static int do_check(struct bpf_verifier_env *env)
else
verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(env, state);
+ print_verifier_state(env, state->frame[state->curframe]);
do_print_state = false;
}
if (env->log.level) {
+ const struct bpf_insn_cbs cbs = {
+ .cb_print = verbose,
+ };
+
verbose(env, "%d: ", insn_idx);
- print_bpf_insn(verbose, env, insn,
- env->allow_ptr_leaks);
+ print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
}
- err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
- if (err)
- return err;
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ err = bpf_prog_offload_verify_insn(env, insn_idx,
+ prev_insn_idx);
+ if (err)
+ return err;
+ }
regs = cur_regs(env);
env->insn_aux_data[insn_idx].seen = true;
@@ -3932,6 +4710,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -3945,13 +4729,17 @@ static int do_check(struct bpf_verifier_env *env)
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
- insn->src_reg != BPF_REG_0 ||
+ (insn->src_reg != BPF_REG_0 &&
+ insn->src_reg != BPF_PSEUDO_CALL) ||
insn->dst_reg != BPF_REG_0) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
- err = check_call(env, insn->imm, insn_idx);
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ err = check_func_call(env, insn, &insn_idx);
+ else
+ err = check_helper_call(env, insn->imm, insn_idx);
if (err)
return err;
@@ -3976,6 +4764,16 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
+ if (state->curframe) {
+ /* exit from nested function */
+ prev_insn_idx = insn_idx;
+ err = prepare_func_exit(env, &insn_idx);
+ if (err)
+ return err;
+ do_print_state = true;
+ continue;
+ }
+
/* eBPF calling convetion is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
@@ -4036,8 +4834,17 @@ process_bpf_exit:
insn_idx++;
}
- verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
- env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns (limit %d), stack depth ",
+ insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
+ for (i = 0; i < env->subprog_cnt + 1; i++) {
+ u32 depth = env->subprog_stack_depth[i];
+
+ verbose(env, "%d", depth);
+ if (i + 1 < env->subprog_cnt + 1)
+ verbose(env, "+");
+ }
+ verbose(env, "\n");
+ env->prog->aux->stack_depth = env->subprog_stack_depth[0];
return 0;
}
@@ -4070,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
}
+
+ if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ !bpf_offload_dev_match(prog, map)) {
+ verbose(env, "offload device mismatch between prog and map\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -4167,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
next_insn:
insn++;
i++;
+ continue;
+ }
+
+ /* Basic sanity check before we invest more work here. */
+ if (!bpf_opcode_in_insntable(insn->code)) {
+ verbose(env, "unknown opcode %02x\n", insn->code);
+ return -EINVAL;
}
}
@@ -4223,6 +5044,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
return 0;
}
+static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (env->subprog_starts[i] < off)
+ continue;
+ env->subprog_starts[i] += len - 1;
+ }
+}
+
static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
const struct bpf_insn *patch, u32 len)
{
@@ -4233,17 +5067,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return NULL;
if (adjust_insn_aux_data(env, new_prog->len, off, len))
return NULL;
+ adjust_subprog_starts(env, off, len);
return new_prog;
}
-/* The verifier does more data flow analysis than llvm and will not explore
- * branches that are dead at run time. Malicious programs can have dead code
- * too. Therefore replace all dead at-run-time code with nops.
+/* The verifier does more data flow analysis than llvm and will not
+ * explore branches that are dead at run time. Malicious programs can
+ * have dead code too. Therefore replace all dead at-run-time code
+ * with 'ja -1'.
+ *
+ * Just nops are not optimal, e.g. if they would sit at the end of the
+ * program and through another bug we would manage to jump there, then
+ * we'd execute beyond program memory otherwise. Returning exception
+ * code also wouldn't work since we can have subprogs where the dead
+ * code could be located.
*/
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
- struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
struct bpf_insn *insn = env->prog->insnsi;
const int insn_cnt = env->prog->len;
int i;
@@ -4251,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
if (aux_data[i].seen)
continue;
- memcpy(insn + i, &nop, sizeof(nop));
+ memcpy(insn + i, &trap, sizeof(trap));
}
}
@@ -4367,6 +5209,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return 0;
}
+static int jit_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog, **func, *tmp;
+ int i, j, subprog_start, subprog_end = 0, len, subprog;
+ struct bpf_insn *insn;
+ void *old_bpf_func;
+ int err = -ENOMEM;
+
+ if (env->subprog_cnt == 0)
+ return 0;
+
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ subprog = find_subprog(env, i + insn->imm + 1);
+ if (subprog < 0) {
+ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+ i + insn->imm + 1);
+ return -EFAULT;
+ }
+ /* temporarily remember subprog id inside insn instead of
+ * aux_data, since next loop will split up all insns into funcs
+ */
+ insn->off = subprog + 1;
+ /* remember original imm in case JIT fails and fallback
+ * to interpreter will be needed
+ */
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* point imm to __bpf_call_base+1 from JITs point of view */
+ insn->imm = 1;
+ }
+
+ func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ subprog_start = subprog_end;
+ if (env->subprog_cnt == i)
+ subprog_end = prog->len;
+ else
+ subprog_end = env->subprog_starts[i];
+
+ len = subprog_end - subprog_start;
+ func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
+ if (!func[i])
+ goto out_free;
+ memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
+ len * sizeof(struct bpf_insn));
+ func[i]->type = prog->type;
+ func[i]->len = len;
+ if (bpf_prog_calc_tag(func[i]))
+ goto out_free;
+ func[i]->is_func = 1;
+ /* Use bpf_prog_F_tag to indicate functions in stack traces.
+ * Long term would need debug info to populate names
+ */
+ func[i]->aux->name[0] = 'F';
+ func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+ func[i]->jit_requested = 1;
+ func[i] = bpf_int_jit_compile(func[i]);
+ if (!func[i]->jited) {
+ err = -ENOTSUPP;
+ goto out_free;
+ }
+ cond_resched();
+ }
+ /* at this point all bpf functions were successfully JITed
+ * now populate all bpf_calls with correct addresses and
+ * run last pass of JIT
+ */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ insn = func[i]->insnsi;
+ for (j = 0; j < func[i]->len; j++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ subprog = insn->off;
+ insn->off = 0;
+ insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
+ func[subprog]->bpf_func -
+ __bpf_call_base;
+ }
+ }
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ old_bpf_func = func[i]->bpf_func;
+ tmp = bpf_int_jit_compile(func[i]);
+ if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
+ verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
+ err = -EFAULT;
+ goto out_free;
+ }
+ cond_resched();
+ }
+
+ /* finally lock prog and jit images for all functions and
+ * populate kallsysm
+ */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ bpf_prog_lock_ro(func[i]);
+ bpf_prog_kallsyms_add(func[i]);
+ }
+
+ /* Last step: make now unused interpreter insns from main
+ * prog consistent for later dump requests, so they can
+ * later look the same as if they were interpreted only.
+ */
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ unsigned long addr;
+
+ if (insn->code != (BPF_JMP | BPF_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ insn->off = env->insn_aux_data[i].call_imm;
+ subprog = find_subprog(env, i + insn->off + 1);
+ addr = (unsigned long)func[subprog + 1]->bpf_func;
+ addr &= PAGE_MASK;
+ insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
+ addr - __bpf_call_base;
+ }
+
+ prog->jited = 1;
+ prog->bpf_func = func[0]->bpf_func;
+ prog->aux->func = func;
+ prog->aux->func_cnt = env->subprog_cnt + 1;
+ return 0;
+out_free:
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (func[i])
+ bpf_jit_free(func[i]);
+ kfree(func);
+ /* cleanup main prog to be interpreted */
+ prog->jit_requested = 0;
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ insn->off = 0;
+ insn->imm = env->insn_aux_data[i].call_imm;
+ }
+ return err;
+}
+
+static int fixup_call_args(struct bpf_verifier_env *env)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ int i, depth;
+#endif
+ int err;
+
+ err = 0;
+ if (env->prog->jit_requested) {
+ err = jit_subprogs(env);
+ if (err == 0)
+ return 0;
+ }
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ for (i = 0; i < prog->len; i++, insn++) {
+ if (insn->code != (BPF_JMP | BPF_CALL) ||
+ insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ depth = get_callee_stack_depth(env, insn, i);
+ if (depth < 0)
+ return depth;
+ bpf_patch_call_args(insn, depth);
+ }
+ err = 0;
+#endif
+ return err;
+}
+
/* fixup insn->imm field of bpf_call instructions
* and inline eligible helpers as explicit sequence of BPF instructions
*
@@ -4384,13 +5400,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ struct bpf_insn mask_and_div[] = {
+ BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+ /* Rx div 0 -> 0 */
+ BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+ BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ *insn,
+ };
+ struct bpf_insn mask_and_mod[] = {
+ BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+ /* Rx mod 0 -> Rx */
+ BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+ *insn,
+ };
+ struct bpf_insn *patchlet;
+
+ if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ patchlet = mask_and_div + (is64 ? 1 : 0);
+ cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
+ } else {
+ patchlet = mask_and_mod + (is64 ? 1 : 0);
+ cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ continue;
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
if (insn->imm == BPF_FUNC_get_prandom_u32)
bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_override_return)
+ prog->kprobe_override = 1;
if (insn->imm == BPF_FUNC_tail_call) {
/* If we tail call into other programs, we
* cannot make any assumptions since they can
@@ -4407,13 +5467,42 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* handlers are currently limited to 64 bit only.
*/
- if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON ||
@@ -4548,7 +5637,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
- if (env->prog->aux->offload) {
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env);
if (ret)
goto err_unlock;
@@ -4565,12 +5654,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->explored_states)
goto skip_full_check;
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
ret = do_check(env);
if (env->cur_state) {
free_verifier_state(env->cur_state, true);
@@ -4585,12 +5674,18 @@ skip_full_check:
sanitize_dead_code(env);
if (ret == 0)
+ ret = check_max_stack_depth(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = fixup_bpf_calls(env);
+ if (ret == 0)
+ ret = fixup_call_args(env);
+
if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
if (log->level && !log->ubuf) {
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..8cda3bc3ae22 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
root->flags = opts->flags;
if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
+ strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
- strcpy(root->name, opts->name);
+ strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *l = it->task_pos;
+ struct list_head *next;
lockdep_assert_held(&css_set_lock);
- WARN_ON_ONCE(!l);
-
repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
- l = l->next;
+ next = it->task_pos->next;
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (l == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
- it->task_pos = l;
+ it->task_pos = next;
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f7efa7b4d825..b42037e6e81d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1254,9 +1254,9 @@ done:
return retval;
}
-int current_cpuset_is_being_rebound(void)
+bool current_cpuset_is_being_rebound(void)
{
- int ret;
+ bool ret;
rcu_read_lock();
ret = task_cs(current) == cpuset_being_rebound;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
}
/* ->updated_children list is self terminated */
- for_each_possible_cpu(cpu)
- cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ cstat->updated_children = cgrp;
+ u64_stats_init(&cstat->sync);
+ }
prev_cputime_init(&cgrp->stat.prev_cputime);
diff --git a/kernel/compat.c b/kernel/compat.c
index d1cee656a7ed..3247fe761f60 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -355,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
ret = -EFAULT;
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 8d9643767142..108fecc20fc1 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -18,6 +18,7 @@ CONFIG_VIRTUALIZATION=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_KVM_GUEST=y
+CONFIG_S390_GUEST=y
CONFIG_VIRTIO=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_BLK=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
new file mode 100644
index 000000000000..81ff07863576
--- /dev/null
+++ b/kernel/configs/nopm.config
@@ -0,0 +1,15 @@
+CONFIG_PM=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+
+# Triggers PM on OMAP
+CONFIG_CPU_IDLE=n
+
+# Triggers enablement via hibernate callbacks
+CONFIG_XEN=n
+
+# ARM/ARM64 architectures that select PM unconditionally
+CONFIG_ARCH_OMAP2PLUS_TYPICAL=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_VEXPRESS=n
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 7fa0c4ae6394..9bfdffc100da 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -10,3 +10,7 @@ CONFIG_OPTIMIZE_INLINING=y
# CONFIG_SLAB is not set
# CONFIG_SLUB is not set
CONFIG_SLOB=y
+CONFIG_CC_STACKPROTECTOR_NONE=y
+# CONFIG_CC_STACKPROTECTOR_REGULAR is not set
+# CONFIG_CC_STACKPROTECTOR_STRONG is not set
+# CONFIG_CC_STACKPROTECTOR_AUTO is not set
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
{
lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
{
lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
#else
-static void inline cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
#endif
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
- [CPUHP_TIMERS_DEAD] = {
+ [CPUHP_TIMERS_PREPARE] = {
.name = "timers:dead",
- .startup.single = NULL,
+ .startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index c8146d53ca67..dbb0781a0533 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv)
long sig, pid;
char *endp;
struct task_struct *p;
- struct siginfo info;
if (argc != 2)
return KDB_ARGCOUNT;
@@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv)
sig = simple_strtol(argv[1], &endp, 0);
if (*endp)
return KDB_BADINT;
- if (sig >= 0) {
+ if ((sig >= 0) || !valid_signal(-sig)) {
kdb_printf("Invalid signal parameter.<-signal>\n");
return 0;
}
@@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv)
return 0;
}
p = p->group_leader;
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = pid; /* same capabilities as process being signalled */
- info.si_uid = 0; /* kdb has root authority */
- kdb_send_sig_info(p, &info);
+ kdb_send_sig(p, sig);
return 0;
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index fc224fbcf954..1e5a502ba4a7 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
-extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_send_sig(struct task_struct *p, int sig);
extern void kdb_meminfo_proc_show(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
}
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ struct task_delay_info *delays = p->delays;
+ u64 *total;
+ u32 *count;
+
+ if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+ total = &delays->swapin_delay;
+ count = &delays->swapin_count;
+ } else {
+ total = &delays->blkio_delay;
+ count = &delays->blkio_count;
+ }
+
+ delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(
+ &current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1b2be63c8528..772a43fea825 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -179,21 +179,6 @@ put_callchain_entry(int rctx)
}
struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
-{
- bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
- /* Disallow cross-task user callchains. */
- bool crosstask = event->ctx->task && event->ctx->task != current;
- const u32 max_stack = event->attr.sample_max_stack;
-
- if (!kernel && !user)
- return NULL;
-
- return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
-}
-
-struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b695bf0d..96db9ae5d5af 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ LIST_HEAD(free_list);
/*
* If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ again:
struct perf_event, child_list);
if (tmp == child) {
perf_remove_from_context(child, DETACH_GROUP);
- list_del(&child->child_list);
- free_event(child);
+ list_move(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
@@ -4284,6 +4288,11 @@ again:
}
mutex_unlock(&event->child_mutex);
+ list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ list_del(&child->child_list);
+ free_event(child);
+ }
+
no_ctx:
put_event(event); /* Must be the 'last' reference */
return 0;
@@ -4511,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return ret;
}
-static unsigned int perf_poll(struct file *file, poll_table *wait)
+static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
- unsigned int events = POLLHUP;
+ __poll_t events = EPOLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -4723,6 +4732,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
rcu_read_unlock();
return 0;
}
+
+ case PERF_EVENT_IOC_QUERY_BPF:
+ return perf_event_query_prog_array(event, (void __user *)arg);
default:
return -ENOTTY;
}
@@ -4904,6 +4916,7 @@ void perf_event_update_userpage(struct perf_event *event)
unlock:
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static int perf_mmap_fault(struct vm_fault *vmf)
{
@@ -5815,19 +5828,11 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_read(handle, event);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- if (data->callchain) {
- int size = 1;
-
- if (data->callchain)
- size += data->callchain->nr;
-
- size *= sizeof(u64);
+ int size = 1;
- __output_copy(handle, data->callchain, size);
- } else {
- u64 nr = 0;
- perf_output_put(handle, nr);
- }
+ size += data->callchain->nr;
+ size *= sizeof(u64);
+ __output_copy(handle, data->callchain, size);
}
if (sample_type & PERF_SAMPLE_RAW) {
@@ -5980,6 +5985,26 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
+static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+
+static struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ bool kernel = !event->attr.exclude_callchain_kernel;
+ bool user = !event->attr.exclude_callchain_user;
+ /* Disallow cross-task user callchains. */
+ bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
+ struct perf_callchain_entry *callchain;
+
+ if (!kernel && !user)
+ return &__empty_callchain;
+
+ callchain = get_perf_callchain(regs, 0, kernel, user,
+ max_stack, crosstask, true);
+ return callchain ?: &__empty_callchain;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6002,9 +6027,7 @@ void perf_prepare_sample(struct perf_event_header *header,
int size = 1;
data->callchain = perf_callchain(event, regs);
-
- if (data->callchain)
- size += data->callchain->nr;
+ size += data->callchain->nr;
header->size += size * sizeof(u64);
}
@@ -8080,6 +8103,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EINVAL;
}
+ /* Kprobe override only works for kprobes, not uprobes. */
+ if (prog->kprobe_override &&
+ !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
@@ -8516,6 +8546,29 @@ fail_clear_files:
return ret;
}
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+ * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+ * perf_event_ctx_lock() we already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx, but that
+ * does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
@@ -8532,8 +8585,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = ftrace_profile_set_filter(event, event->attr.config,
- filter_str);
+ ret = perf_tracepoint_set_filter(event, filter_str);
else if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
@@ -9168,7 +9220,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;
- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
@@ -10703,6 +10761,19 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+
+ if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
+ !child_ctx->task_ctx_data) {
+ struct pmu *pmu = child_event->pmu;
+
+ child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
+ GFP_KERNEL);
+ if (!child_ctx->task_ctx_data) {
+ free_event(child_event);
+ return NULL;
+ }
+ }
+
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
* must be under the same lock in order to serialize against
@@ -10713,6 +10784,7 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
+ /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09b1537ae06c..6dc725a7e7bc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
-/* Callchain handling */
-extern struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs);
-
static inline int get_recursion_context(int *recursion)
{
int rctx;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 141aa2ca8728..6c6b3c48db71 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,7 +19,7 @@
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, POLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 267f6ef91d97..ce6848e46e94 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
ret = 0;
- smp_wmb(); /* pairs with get_xol_area() */
- mm->uprobes_state.xol_area = area;
+ /* pairs with get_xol_area() */
+ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
up_write(&mm->mmap_sem);
@@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void)
if (!mm->uprobes_state.xol_area)
__create_xol_area(0);
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
return area;
}
@@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void)
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
- area = current->mm->uprobes_state.xol_area;
- smp_read_barrier_depends();
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
if (area)
trampoline_vaddr = area->vaddr;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,12 @@ Efault:
return -EFAULT;
}
#endif
+
+__weak void abort(void)
+{
+ BUG();
+
+ /* if that doesn't kill us, halt */
+ panic("Oops failed to kill thread");
+}
+EXPORT_SYMBOL(abort);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fail_function.c: Function-based error injection
+ */
+#include <linux/error-injection.h>
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+
+struct fei_attr {
+ struct list_head list;
+ struct kprobe kp;
+ unsigned long retval;
+};
+static DEFINE_MUTEX(fei_lock);
+static LIST_HEAD(fei_attr_list);
+static DECLARE_FAULT_ATTR(fei_fault_attr);
+static struct dentry *fei_debugfs_dir;
+
+static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
+{
+ switch (get_injectable_error_type(addr)) {
+ case EI_ETYPE_NULL:
+ if (retv != 0)
+ return 0;
+ break;
+ case EI_ETYPE_ERRNO:
+ if (retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ case EI_ETYPE_ERRNO_NULL:
+ if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ }
+
+ return retv;
+}
+
+static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
+{
+ struct fei_attr *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (attr) {
+ attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
+ if (!attr->kp.symbol_name) {
+ kfree(attr);
+ return NULL;
+ }
+ attr->kp.pre_handler = fei_kprobe_handler;
+ attr->retval = adjust_error_retval(addr, 0);
+ INIT_LIST_HEAD(&attr->list);
+ }
+ return attr;
+}
+
+static void fei_attr_free(struct fei_attr *attr)
+{
+ if (attr) {
+ kfree(attr->kp.symbol_name);
+ kfree(attr);
+ }
+}
+
+static struct fei_attr *fei_attr_lookup(const char *sym)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (!strcmp(attr->kp.symbol_name, sym))
+ return attr;
+ }
+
+ return NULL;
+}
+
+static bool fei_attr_is_valid(struct fei_attr *_attr)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (attr == _attr)
+ return true;
+ }
+
+ return false;
+}
+
+static int fei_retval_set(void *data, u64 val)
+{
+ struct fei_attr *attr = data;
+ unsigned long retv = (unsigned long)val;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /*
+ * Since this operation can be done after retval file is removed,
+ * It is safer to check the attr is still valid before accessing
+ * its member.
+ */
+ if (!fei_attr_is_valid(attr)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (attr->kp.addr) {
+ if (adjust_error_retval((unsigned long)attr->kp.addr,
+ val) != retv)
+ err = -EINVAL;
+ }
+ if (!err)
+ attr->retval = val;
+out:
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+
+static int fei_retval_get(void *data, u64 *val)
+{
+ struct fei_attr *attr = data;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /* Here we also validate @attr to ensure it still exists. */
+ if (!fei_attr_is_valid(attr))
+ err = -ENOENT;
+ else
+ *val = attr->retval;
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
+ "%llx\n");
+
+static int fei_debugfs_add_attr(struct fei_attr *attr)
+{
+ struct dentry *dir;
+
+ dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
+ if (!dir)
+ return -ENOMEM;
+
+ if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void fei_debugfs_remove_attr(struct fei_attr *attr)
+{
+ struct dentry *dir;
+
+ dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
+ if (dir)
+ debugfs_remove_recursive(dir);
+}
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
+
+ if (should_fail(&fei_fault_attr, 1)) {
+ regs_set_return_value(regs, attr->retval);
+ override_function_with_return(regs);
+ /* Kprobe specific fixup */
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(fei_kprobe_handler)
+
+static void *fei_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&fei_lock);
+ return seq_list_start(&fei_attr_list, *pos);
+}
+
+static void fei_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&fei_lock);
+}
+
+static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &fei_attr_list, pos);
+}
+
+static int fei_seq_show(struct seq_file *m, void *v)
+{
+ struct fei_attr *attr = list_entry(v, struct fei_attr, list);
+
+ seq_printf(m, "%pf\n", attr->kp.addr);
+ return 0;
+}
+
+static const struct seq_operations fei_seq_ops = {
+ .start = fei_seq_start,
+ .next = fei_seq_next,
+ .stop = fei_seq_stop,
+ .show = fei_seq_show,
+};
+
+static int fei_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fei_seq_ops);
+}
+
+static void fei_attr_remove(struct fei_attr *attr)
+{
+ fei_debugfs_remove_attr(attr);
+ unregister_kprobe(&attr->kp);
+ list_del(&attr->list);
+ fei_attr_free(attr);
+}
+
+static void fei_attr_remove_all(void)
+{
+ struct fei_attr *attr, *n;
+
+ list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
+ fei_attr_remove(attr);
+ }
+}
+
+static ssize_t fei_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct fei_attr *attr;
+ unsigned long addr;
+ char *buf, *sym;
+ int ret;
+
+ /* cut off if it is too long */
+ if (count > KSYM_NAME_LEN)
+ count = KSYM_NAME_LEN;
+ buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, buffer, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ buf[count] = '\0';
+ sym = strstrip(buf);
+
+ mutex_lock(&fei_lock);
+
+ /* Writing just spaces will remove all injection points */
+ if (sym[0] == '\0') {
+ fei_attr_remove_all();
+ ret = count;
+ goto out;
+ }
+ /* Writing !function will remove one injection point */
+ if (sym[0] == '!') {
+ attr = fei_attr_lookup(sym + 1);
+ if (!attr) {
+ ret = -ENOENT;
+ goto out;
+ }
+ fei_attr_remove(attr);
+ ret = count;
+ goto out;
+ }
+
+ addr = kallsyms_lookup_name(sym);
+ if (!addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!within_error_injection_list(addr)) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (fei_attr_lookup(sym)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ attr = fei_attr_new(sym, addr);
+ if (!attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = register_kprobe(&attr->kp);
+ if (!ret)
+ ret = fei_debugfs_add_attr(attr);
+ if (ret < 0)
+ fei_attr_remove(attr);
+ else {
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
+ }
+out:
+ kfree(buf);
+ mutex_unlock(&fei_lock);
+ return ret;
+}
+
+static const struct file_operations fei_ops = {
+ .open = fei_open,
+ .read = seq_read,
+ .write = fei_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init fei_debugfs_init(void)
+{
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_function", NULL,
+ &fei_fault_attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ /* injectable attribute is just a symlink of error_inject/list */
+ if (!debugfs_create_symlink("injectable", dir,
+ "../error_injection/list"))
+ goto error;
+
+ if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
+ goto error;
+
+ fei_debugfs_dir = dir;
+
+ return 0;
+error:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+}
+
+late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf6b58c..be8aa5b98666 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
+#include <linux/sched/mm.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk)
void thread_stack_cache_init(void)
{
- thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
- THREAD_SIZE, 0, NULL);
+ thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
+ THREAD_SIZE, THREAD_SIZE, 0, 0,
+ THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
# endif
@@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ struct mm_struct *oldmm)
+{
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+ struct rb_node **rb_link, *rb_parent;
+ int retval;
+ unsigned long charge;
+ LIST_HEAD(uf);
+
+ uprobe_start_dup_mmap();
+ if (down_write_killable(&oldmm->mmap_sem)) {
+ retval = -EINTR;
+ goto fail_uprobe_end;
+ }
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ rb_link = &mm->mm_rb.rb_node;
+ rb_parent = NULL;
+ pprev = &mm->mmap;
+ retval = ksm_fork(mm, oldmm);
+ if (retval)
+ goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
+
+ prev = NULL;
+ for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ struct file *file;
+
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+ tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+ INIT_LIST_HEAD(&tmp->anon_vma_chain);
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /* VM_WIPEONFORK gets a clean slate in the child. */
+ tmp->anon_vma = NULL;
+ if (anon_vma_prepare(tmp))
+ goto fail_nomem_anon_vma_fork;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ tmp->vm_next = tmp->vm_prev = NULL;
+ file = tmp->vm_file;
+ if (file) {
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ if (tmp->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ i_mmap_lock_write(mapping);
+ if (tmp->vm_flags & VM_SHARED)
+ atomic_inc(&mapping->i_mmap_writable);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ /*
+ * Clear hugetlb-related page reserves for children. This only
+ * affects MAP_PRIVATE mappings. Faults generated by the child
+ * are not guaranteed to succeed, even if read-only
+ */
+ if (is_vm_hugetlb_page(tmp))
+ reset_vma_resv_huge_pages(tmp);
+
+ /*
+ * Link in the new vma and copy the page table entries.
+ */
+ *pprev = tmp;
+ pprev = &tmp->vm_next;
+ tmp->vm_prev = prev;
+ prev = tmp;
+
+ __vma_link_rb(mm, tmp, rb_link, rb_parent);
+ rb_link = &tmp->vm_rb.rb_right;
+ rb_parent = &tmp->vm_rb;
+
+ mm->map_count++;
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(mm, oldmm, mpnt);
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ if (retval)
+ goto out;
+ }
+ /* a new mm has just been created */
+ arch_dup_mmap(oldmm, mm);
+ retval = 0;
+out:
+ up_write(&mm->mmap_sem);
+ flush_tlb_mm(oldmm);
+ up_write(&oldmm->mmap_sem);
+ dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+ uprobe_end_dup_mmap();
+ return retval;
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+ mm->pgd = pgd_alloc(mm);
+ if (unlikely(!mm->pgd))
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+ pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ down_write(&oldmm->mmap_sem);
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ up_write(&oldmm->mmap_sem);
+ return 0;
+}
+#define mm_alloc_pgd(mm) (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
+ hmm_mm_destroy(mm);
+ mmu_notifier_mm_destroy(mm);
+ check_mm(mm);
+ put_user_ns(mm->user_ns);
+ free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+ /*
+ * The implicit full barrier implied by atomic_dec_and_test() is
+ * required by the membarrier system call before returning to
+ * user-space, after storing to rq->curr.
+ */
+ if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+ __mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm;
+
+ mm = container_of(work, struct mm_struct, async_put_work);
+ __mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+ if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+ INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
@@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
+static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ /* Fetch thread_struct whitelist for the architecture. */
+ arch_thread_struct_whitelist(offset, size);
+
+ /*
+ * Handle zero-sized whitelist or empty thread_struct, otherwise
+ * adjust offset to position of thread_struct in task_struct.
+ */
+ if (unlikely(*size == 0))
+ *offset = 0;
+ else
+ *offset += offsetof(struct task_struct, thread);
+}
+
void __init fork_init(void)
{
int i;
@@ -465,11 +722,14 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN 0
#endif
int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+ unsigned long useroffset, usersize;
/* create a slab on which task_structs can be allocated */
- task_struct_cachep = kmem_cache_create("task_struct",
+ task_struct_whitelist(&useroffset, &usersize);
+ task_struct_cachep = kmem_cache_create_usercopy("task_struct",
arch_task_struct_size, align,
- SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT,
+ useroffset, usersize, NULL);
#endif
/* do the arch specific task caches init */
@@ -594,182 +854,8 @@ free_tsk:
return NULL;
}
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
- struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
- int retval;
- unsigned long charge;
- LIST_HEAD(uf);
-
- uprobe_start_dup_mmap();
- if (down_write_killable(&oldmm->mmap_sem)) {
- retval = -EINTR;
- goto fail_uprobe_end;
- }
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
- mm->total_vm = oldmm->total_vm;
- mm->data_vm = oldmm->data_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- retval = khugepaged_fork(mm, oldmm);
- if (retval)
- goto out;
-
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
- struct file *file;
-
- if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!tmp)
- goto fail_nomem;
- *tmp = *mpnt;
- INIT_LIST_HEAD(&tmp->anon_vma_chain);
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- retval = dup_userfaultfd(tmp, &uf);
- if (retval)
- goto fail_nomem_anon_vma_fork;
- if (tmp->vm_flags & VM_WIPEONFORK) {
- /* VM_WIPEONFORK gets a clean slate in the child. */
- tmp->anon_vma = NULL;
- if (anon_vma_prepare(tmp))
- goto fail_nomem_anon_vma_fork;
- } else if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
- tmp->vm_next = tmp->vm_prev = NULL;
- file = tmp->vm_file;
- if (file) {
- struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
- i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- /*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
- */
- if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
-
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
-
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
-
- mm->map_count++;
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(mm, oldmm, mpnt);
-
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
- if (retval)
- goto out;
- }
- /* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
-out:
- up_write(&mm->mmap_sem);
- flush_tlb_mm(oldmm);
- up_write(&oldmm->mmap_sem);
- dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
- uprobe_end_dup_mmap();
- return retval;
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
- mm->pgd = pgd_alloc(mm);
- if (unlikely(!mm->pgd))
- return -ENOMEM;
- return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
- pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- down_write(&oldmm->mmap_sem);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
- up_write(&oldmm->mmap_sem);
- return 0;
-}
-#define mm_alloc_pgd(mm) (0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
-#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
-
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
@@ -859,27 +945,6 @@ fail_nopgd:
return NULL;
}
-static void check_mm(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
-
- if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
- }
-
- if (mm_pgtables_bytes(mm))
- pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
- mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
/*
* Allocate and initialize an mm_struct.
*/
@@ -895,24 +960,6 @@ struct mm_struct *mm_alloc(void)
return mm_init(mm, current, current_user_ns());
}
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
- BUG_ON(mm == &init_mm);
- mm_free_pgd(mm);
- destroy_context(mm);
- hmm_mm_destroy(mm);
- mmu_notifier_mm_destroy(mm);
- check_mm(mm);
- put_user_ns(mm->user_ns);
- free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -1545,6 +1592,10 @@ static __latent_entropy struct task_struct *copy_process(
int retval;
struct task_struct *p;
+ /*
+ * Don't allow sharing the root directory with processes in a different
+ * namespace
+ */
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -2020,6 +2071,8 @@ long _do_fork(unsigned long clone_flags,
int __user *child_tidptr,
unsigned long tls)
{
+ struct completion vfork;
+ struct pid *pid;
struct task_struct *p;
int trace = 0;
long nr;
@@ -2045,43 +2098,40 @@ long _do_fork(unsigned long clone_flags,
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
add_latent_entropy();
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
- if (!IS_ERR(p)) {
- struct completion vfork;
- struct pid *pid;
+ trace_sched_process_fork(current, p);
- trace_sched_process_fork(current, p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
- pid = get_task_pid(p, PIDTYPE_PID);
- nr = pid_vnr(pid);
+ if (clone_flags & CLONE_PARENT_SETTID)
+ put_user(nr, parent_tidptr);
- if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
-
- if (clone_flags & CLONE_VFORK) {
- p->vfork_done = &vfork;
- init_completion(&vfork);
- get_task_struct(p);
- }
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ get_task_struct(p);
+ }
- wake_up_new_task(p);
+ wake_up_new_task(p);
- /* forking complete and child started to run, tell ptracer */
- if (unlikely(trace))
- ptrace_event_pid(trace, pid);
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event_pid(trace, pid);
- if (clone_flags & CLONE_VFORK) {
- if (!wait_for_vfork_done(p, &vfork))
- ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
- }
-
- put_pid(pid);
- } else {
- nr = PTR_ERR(p);
+ if (clone_flags & CLONE_VFORK) {
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
return nr;
}
@@ -2225,9 +2275,11 @@ void __init proc_caches_init(void)
* maximum number of CPU's we can ever have. The cpumask_allocation
* is at the end of the structure, exactly for that reason.
*/
- mm_cachep = kmem_cache_create("mm_struct",
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16..1f450e092c74 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -862,24 +862,6 @@ static void put_pi_state(struct futex_pi_state *pi_state)
}
}
-/*
- * Look up the task based on what TID userspace gave us.
- * We dont trust it.
- */
-static struct task_struct *futex_find_get_task(pid_t pid)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = find_task_by_vpid(pid);
- if (p)
- get_task_struct(p);
-
- rcu_read_unlock();
-
- return p;
-}
-
#ifdef CONFIG_FUTEX_PI
/*
@@ -1183,7 +1165,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
*/
if (!pid)
return -ESRCH;
- p = futex_find_get_task(pid);
+ p = find_get_task_by_vpid(pid);
if (!p)
return -ESRCH;
@@ -1878,6 +1860,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
/*
* When PI not supported: return -ENOSYS if requeue_pi is true,
* consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2279,33 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;
+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
*
- * We have to replace the newowner TID in the user space variable.
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2318,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock; there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
@@ -2434,9 +2457,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2467,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
+ /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
return gid_gt(a, b) - gid_lt(a, b);
}
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
{
sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
gid_cmp, NULL);
}
+EXPORT_SYMBOL(groups_sort);
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 89e355866450..6fc87ccda1d7 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
-config IRQ_DOMAIN_DEBUG
- bool "Expose hardware/virtual IRQ mapping via debugfs"
- depends on IRQ_DOMAIN && DEBUG_FS
- help
- This option will show the mapping relationship between hardware irq
- numbers and Linux irq numbers. The mapping is exposed via debugfs
- in the file "irq_domain_mapping".
-
- If you don't know what this means you don't need it.
-
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
}
}
-static cpumask_var_t *alloc_node_to_present_cpumask(void)
+static cpumask_var_t *alloc_node_to_possible_cpumask(void)
{
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ out_unwind:
return NULL;
}
-static void free_node_to_present_cpumask(cpumask_var_t *masks)
+static void free_node_to_possible_cpumask(cpumask_var_t *masks)
{
int node;
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
kfree(masks);
}
-static void build_node_to_present_cpumask(cpumask_var_t *masks)
+static void build_node_to_possible_cpumask(cpumask_var_t *masks)
{
int cpu;
- for_each_present_cpu(cpu)
+ for_each_possible_cpu(cpu)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
- if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
+ if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
struct cpumask *masks;
- cpumask_var_t nmsk, *node_to_present_cpumask;
+ cpumask_var_t nmsk, *node_to_possible_cpumask;
/*
* If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (!masks)
goto out;
- node_to_present_cpumask = alloc_node_to_present_cpumask();
- if (!node_to_present_cpumask)
+ node_to_possible_cpumask = alloc_node_to_possible_cpumask();
+ if (!node_to_possible_cpumask)
goto out;
/* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
- build_node_to_present_cpumask(node_to_present_cpumask);
- nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
+ build_node_to_possible_cpumask(node_to_possible_cpumask);
+ nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
&nodemsk);
/*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (affv <= nodes) {
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
- node_to_present_cpumask[n]);
+ node_to_possible_cpumask[n]);
if (++curvec == last_affv)
break;
}
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
+ cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ done:
/* Fill out vectors at the end that don't need affinity */
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
- free_node_to_present_cpumask(node_to_present_cpumask);
+ free_node_to_possible_cpumask(node_to_possible_cpumask);
out:
free_cpumask_var(nmsk);
return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
return 0;
get_online_cpus();
- ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
+ ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
put_online_cpus();
return ret;
}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 4e8089b319ae..8c82ea26e837 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -71,7 +71,7 @@ unsigned long probe_irq_on(void)
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
- if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE))
+ if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 043bfc35b353..c69357a43849 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc)
return 0;
}
-void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+int irq_activate_and_startup(struct irq_desc *desc, bool resend)
{
if (WARN_ON(irq_activate(desc)))
- return;
- irq_startup(desc, resend, IRQ_START_FORCE);
+ return 0;
+ return irq_startup(desc, resend, IRQ_START_FORCE);
}
static void __irq_disable(struct irq_desc *desc, bool mask);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..8ccb326d2977 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -3,8 +3,6 @@
* Debugging printout:
*/
-#include <linux/kallsyms.h>
-
#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
/* FIXME */
@@ -12,16 +10,21 @@
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
- printk("->handle_irq(): %p, ", desc->handle_irq);
- print_symbol("%s\n", (unsigned long)desc->handle_irq);
- printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
- print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+ printk("->handle_irq(): %p, %pS\n",
+ desc->handle_irq, desc->handle_irq);
+ printk("->irq_data.chip(): %p, %pS\n",
+ desc->irq_data.chip, desc->irq_data.chip);
printk("->action(): %p\n", desc->action);
if (desc->action) {
- printk("->action->handler(): %p, ", desc->action->handler);
- print_symbol("%s\n", (unsigned long)desc->action->handler);
+ printk("->action->handler(): %p, %pS\n",
+ desc->action->handler, desc->action->handler);
}
___P(IRQ_LEVEL);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+ BIT_MASK_DESCR(IRQD_CAN_RESERVE),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
/*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
*/
static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
set_bit(idx, &gc->installed);
if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(virq, &irq_nested_lock_class);
+ irq_set_lockdep_class(virq, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (chip->irq_calc_mask)
chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
continue;
if (flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(i, &irq_nested_lock_class);
+ irq_set_lockdep_class(i, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (!(flags & IRQ_GC_NO_MASK)) {
struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ca6afa267070 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc);
#define IRQ_START_COND false
extern int irq_activate(struct irq_desc *desc);
-extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
+extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
{
irqd_set_activated(data);
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..82b8b18ee1eb 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_find_mapping);
-#ifdef CONFIG_IRQ_DOMAIN_DEBUG
-static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
-{
- struct irq_domain *domain;
- struct irq_data *data;
-
- domain = desc->irq_data.domain;
- data = &desc->irq_data;
-
- while (domain) {
- unsigned int irq = data->irq;
- unsigned long hwirq = data->hwirq;
- struct irq_chip *chip;
- bool direct;
-
- if (data == &desc->irq_data)
- seq_printf(m, "%5d ", irq);
- else
- seq_printf(m, "%5d+ ", irq);
- seq_printf(m, "0x%05lx ", hwirq);
-
- chip = irq_data_get_irq_chip(data);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", domain->name);
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- domain = domain->parent;
- data = data->parent_data;
-#else
- domain = NULL;
-#endif
- }
-}
-
-static int virq_debug_show(struct seq_file *m, void *private)
-{
- unsigned long flags;
- struct irq_desc *desc;
- struct irq_domain *domain;
- struct radix_tree_iter iter;
- void __rcu **slot;
- int i;
-
- seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
- "name", "mapped", "linear-max", "direct-max", "devtree-node");
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(domain, &irq_domain_list, link) {
- struct device_node *of_node;
- const char *name;
-
- int count = 0;
-
- of_node = irq_domain_get_of_node(domain);
- if (of_node)
- name = of_node_full_name(of_node);
- else if (is_fwnode_irqchip(domain->fwnode))
- name = container_of(domain->fwnode, struct irqchip_fwid,
- fwnode)->name;
- else
- name = "";
-
- radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
- count++;
- seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
- domain == irq_default_domain ? '*' : ' ', domain->name,
- domain->revmap_size + count, domain->revmap_size,
- domain->revmap_direct_max_irq,
- name);
- }
- mutex_unlock(&irq_domain_mutex);
-
- seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
- "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
- "active", "type", "domain");
-
- for (i = 1; i < nr_irqs; i++) {
- desc = irq_to_desc(i);
- if (!desc)
- continue;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- virq_debug_show_one(m, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
-
- return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
- .open = virq_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
- if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
- NULL, &virq_debug_fops) == NULL)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
-
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
*
@@ -1693,7 +1575,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
}
}
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
{
int ret = 0;
@@ -1702,9 +1584,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
if (irqd->parent_data)
ret = __irq_domain_activate_irq(irqd->parent_data,
- early);
+ reserve);
if (!ret && domain->ops->activate) {
- ret = domain->ops->activate(domain, irqd, early);
+ ret = domain->ops->activate(domain, irqd, reserve);
/* Rollback in case of error */
if (ret && irqd->parent_data)
__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1598,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
- * @irq_data: outermost irq_data associated with interrupt
+ * @irq_data: Outermost irq_data associated with interrupt
+ * @reserve: If set only reserve an interrupt vector instead of assigning one
*
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
{
int ret = 0;
if (!irqd_is_activated(irq_data))
- ret = __irq_domain_activate_irq(irq_data, early);
+ ret = __irq_domain_activate_irq(irq_data, reserve);
if (!ret)
irqd_set_activated(irq_data);
return ret;
@@ -1843,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p)
irq_domain_debug_show_one(m, d, 0);
return 0;
}
-
-static int irq_domain_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_domain_debug_show, inode->i_private);
-}
-
-static const struct file_operations dfs_domain_ops = {
- .open = irq_domain_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
if (!d->name || !domain_dir || d->debugfs_file)
return;
d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &dfs_domain_ops);
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
@@ -1877,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root)
if (!domain_dir)
return;
- debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops);
+ debugfs_create_file("default", 0444, domain_dir, NULL,
+ &irq_domain_debug_fops);
mutex_lock(&irq_domain_mutex);
list_for_each_entry(d, &irq_domain_list, link)
debugfs_add_domain_dir(d);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 0ba0dd8863a7..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
bool reserved, unsigned int *mapped_cpu)
{
- unsigned int cpu;
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+ unsigned int bit;
+ best_cpu = UINT_MAX;
for_each_cpu(cpu, msk) {
- struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- unsigned int bit;
+ cm = per_cpu_ptr(m->maps, cpu);
- if (!cm->online)
+ if (!cm->online || cm->available <= maxavl)
continue;
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+
+ if (maxavl) {
+ cm = per_cpu_ptr(m->maps, best_cpu);
bit = matrix_alloc_area(m, cm, 1, false);
if (bit < m->alloc_end) {
cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
m->global_available--;
if (reserved)
m->global_reserved--;
- *mapped_cpu = cpu;
- trace_irq_matrix_alloc(bit, cpu, m, cm);
+ *mapped_cpu = best_cpu;
+ trace_irq_matrix_alloc(bit, best_cpu, m, cm);
return bit;
}
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ struct msi_desc *desc;
+
+ if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ return false;
+
+ if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ return false;
+
+ /*
+ * Checking the first MSI descriptor is sufficient. MSIX supports
+ * masking and MSI does so when the maskbit is set.
+ */
+ desc = first_msi_entry(dev);
+ return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+}
+
/**
* msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- msi_alloc_info_t arg;
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ msi_alloc_info_t arg;
int i, ret, virq;
+ bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
if (ops->msi_finish)
ops->msi_finish(&arg, 0);
+ can_reserve = msi_check_reservation_mode(domain, info, dev);
+
for_each_msi_entry(desc, dev) {
virq = desc->irq;
if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* the MSI entries before the PCI layer enables MSI in the
* card. Otherwise the card latches a random msi message.
*/
- if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
- struct irq_data *irq_data;
+ if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
+ continue;
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ if (!can_reserve)
+ irqd_clr_can_reserve(irq_data);
+ ret = irq_domain_activate_irq(irq_data, can_reserve);
+ if (ret)
+ goto cleanup;
+ }
+
+ /*
+ * If these interrupts use reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (can_reserve) {
+ for_each_msi_entry(desc, dev) {
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- ret = irq_domain_activate_irq(irq_data, true);
- if (ret)
- goto cleanup;
- if (info->flags & MSI_FLAG_MUST_REACTIVATE)
- irqd_clr_activated(irq_data);
+ irqd_clr_activated(irq_data);
}
}
return 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index ef2a47e0eab6..6cdecc6f4c53 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,7 +10,6 @@
#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/timer.h>
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 40e9d739c169..6b7cdf17ccf8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work)
*/
flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- nflags = flags | IRQ_WORK_FLAGS;
+ nflags = flags | IRQ_WORK_CLAIMED;
oflags = cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-static void static_key_slow_inc_cpuslocked(struct static_key *key)
+void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void static_key_slow_dec_cpuslocked(struct static_key *key,
+static void __static_key_slow_dec_cpuslocked(struct static_key *key,
unsigned long rate_limit,
struct delayed_work *work)
{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
struct delayed_work *work)
{
cpus_read_lock();
- static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec_cpuslocked(key, 0, NULL);
+}
+
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE(key);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index d5fa4116688a..a23e21ada81b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -12,7 +12,6 @@
* compression (see scripts/kallsyms.c for a more complete description)
*/
#include <linux/kallsyms.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
@@ -20,15 +19,12 @@
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
-#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/compiler.h>
-#include <asm/sections.h>
-
/*
* These will be re-linked against their real values
* during the second link stage.
@@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak;
extern const unsigned long kallsyms_markers[] __weak;
-static inline int is_kernel_inittext(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext
- && addr <= (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
-static inline int is_kernel_text(unsigned long addr)
-{
- if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
- arch_is_kernel_text(addr))
- return 1;
- return in_gate_area_no_mm(addr);
-}
-
-static inline int is_kernel(unsigned long addr)
-{
- if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
- return 1;
- return in_gate_area_no_mm(addr);
-}
-
-static int is_ksym_addr(unsigned long addr)
-{
- if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
- return is_kernel(addr);
-
- return is_kernel_text(addr) || is_kernel_inittext(addr);
-}
-
/*
* Expand a compressed symbol data into the resulting uncompressed string,
* if uncompressed string is too long (>= maxlen), it will be truncated,
@@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address)
return __sprint_symbol(buffer, address, -1, 1);
}
-/* Look up a kernel symbol and print it to the kernel messages. */
-void __print_symbol(const char *fmt, unsigned long address)
-{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
-
- printk(fmt, buffer);
-}
-EXPORT_SYMBOL(__print_symbol);
-
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 15f33faf4013..2c16f1ab5e10 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
-void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
{
write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
}
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
-void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
{
write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
_RET_IP_);
@@ -358,7 +358,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
*/
if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
- if (kcov->t != NULL)
+ t = current;
+ if (kcov->t != NULL || t->kcov != NULL)
return -EBUSY;
if (arg == KCOV_TRACE_PC)
kcov->mode = KCOV_MODE_TRACE_PC;
@@ -370,7 +371,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
#endif
else
return -EINVAL;
- t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da2ccf142358..102160ff5c66 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -978,67 +978,90 @@ static int prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static void arm_kprobe_ftrace(struct kprobe *p)
+static int arm_kprobe_ftrace(struct kprobe *p)
{
- int ret;
+ int ret = 0;
ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
(unsigned long)p->addr, 0, 0);
- WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
- kprobe_ftrace_enabled++;
- if (kprobe_ftrace_enabled == 1) {
+ if (ret) {
+ pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+ return ret;
+ }
+
+ if (kprobe_ftrace_enabled == 0) {
ret = register_ftrace_function(&kprobe_ftrace_ops);
- WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ if (ret) {
+ pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
+ goto err_ftrace;
+ }
}
+
+ kprobe_ftrace_enabled++;
+ return ret;
+
+err_ftrace:
+ /*
+ * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
+ * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
+ * empty filter_hash which would undesirably trace all functions.
+ */
+ ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
+ return ret;
}
/* Caller must lock kprobe_mutex */
-static void disarm_kprobe_ftrace(struct kprobe *p)
+static int disarm_kprobe_ftrace(struct kprobe *p)
{
- int ret;
+ int ret = 0;
- kprobe_ftrace_enabled--;
- if (kprobe_ftrace_enabled == 0) {
+ if (kprobe_ftrace_enabled == 1) {
ret = unregister_ftrace_function(&kprobe_ftrace_ops);
- WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
+ return ret;
}
+
+ kprobe_ftrace_enabled--;
+
ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
(unsigned long)p->addr, 1, 0);
WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+ return ret;
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) do {} while (0)
-#define disarm_kprobe_ftrace(p) do {} while (0)
+#define arm_kprobe_ftrace(p) (-ENODEV)
+#define disarm_kprobe_ftrace(p) (-ENODEV)
#endif
/* Arm a kprobe with text_mutex */
-static void arm_kprobe(struct kprobe *kp)
+static int arm_kprobe(struct kprobe *kp)
{
- if (unlikely(kprobe_ftrace(kp))) {
- arm_kprobe_ftrace(kp);
- return;
- }
+ if (unlikely(kprobe_ftrace(kp)))
+ return arm_kprobe_ftrace(kp);
+
cpus_read_lock();
mutex_lock(&text_mutex);
__arm_kprobe(kp);
mutex_unlock(&text_mutex);
cpus_read_unlock();
+
+ return 0;
}
/* Disarm a kprobe with text_mutex */
-static void disarm_kprobe(struct kprobe *kp, bool reopt)
+static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
- if (unlikely(kprobe_ftrace(kp))) {
- disarm_kprobe_ftrace(kp);
- return;
- }
+ if (unlikely(kprobe_ftrace(kp)))
+ return disarm_kprobe_ftrace(kp);
cpus_read_lock();
mutex_lock(&text_mutex);
__disarm_kprobe(kp, reopt);
mutex_unlock(&text_mutex);
cpus_read_unlock();
+
+ return 0;
}
/*
@@ -1362,9 +1385,15 @@ out:
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
- if (!kprobes_all_disarmed)
+ if (!kprobes_all_disarmed) {
/* Arm the breakpoint again. */
- arm_kprobe(ap);
+ ret = arm_kprobe(ap);
+ if (ret) {
+ ap->flags |= KPROBE_FLAG_DISABLED;
+ list_del_rcu(&p->list);
+ synchronize_sched();
+ }
+ }
}
return ret;
}
@@ -1573,8 +1602,14 @@ int register_kprobe(struct kprobe *p)
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (!kprobes_all_disarmed && !kprobe_disabled(p))
- arm_kprobe(p);
+ if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
+ ret = arm_kprobe(p);
+ if (ret) {
+ hlist_del_rcu(&p->hlist);
+ synchronize_sched();
+ goto out;
+ }
+ }
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
@@ -1608,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
+ int ret;
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
- return NULL;
+ return ERR_PTR(-EINVAL);
if (!kprobe_disabled(p)) {
/* Disable probe if it is a child probe */
@@ -1626,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
* should have already been disarmed, so
* skip unneed disarming process.
*/
- if (!kprobes_all_disarmed)
- disarm_kprobe(orig_p, true);
+ if (!kprobes_all_disarmed) {
+ ret = disarm_kprobe(orig_p, true);
+ if (ret) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ return ERR_PTR(ret);
+ }
+ }
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
@@ -1644,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p)
/* Disable kprobe. This will disarm it if needed. */
ap = __disable_kprobe(p);
- if (ap == NULL)
- return -EINVAL;
+ if (IS_ERR(ap))
+ return PTR_ERR(ap);
if (ap == p)
/*
@@ -2078,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p)
int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
+ struct kprobe *p;
mutex_lock(&kprobe_mutex);
/* Disable this kprobe */
- if (__disable_kprobe(kp) == NULL)
- ret = -EINVAL;
+ p = __disable_kprobe(kp);
+ if (IS_ERR(p))
+ ret = PTR_ERR(p);
mutex_unlock(&kprobe_mutex);
return ret;
@@ -2116,7 +2159,9 @@ int enable_kprobe(struct kprobe *kp)
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
- arm_kprobe(p);
+ ret = arm_kprobe(p);
+ if (ret)
+ p->flags |= KPROBE_FLAG_DISABLED;
}
out:
mutex_unlock(&kprobe_mutex);
@@ -2407,11 +2452,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
.release = seq_release,
};
-static void arm_all_kprobes(void)
+static int arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
- unsigned int i;
+ unsigned int i, total = 0, errors = 0;
+ int err, ret = 0;
mutex_lock(&kprobe_mutex);
@@ -2428,46 +2474,74 @@ static void arm_all_kprobes(void)
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
- if (!kprobe_disabled(p))
- arm_kprobe(p);
+ /* Arm all kprobes on a best-effort basis */
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ if (!kprobe_disabled(p)) {
+ err = arm_kprobe(p);
+ if (err) {
+ errors++;
+ ret = err;
+ }
+ total++;
+ }
+ }
}
- printk(KERN_INFO "Kprobes globally enabled\n");
+ if (errors)
+ pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
+ errors, total);
+ else
+ pr_info("Kprobes globally enabled\n");
already_enabled:
mutex_unlock(&kprobe_mutex);
- return;
+ return ret;
}
-static void disarm_all_kprobes(void)
+static int disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
- unsigned int i;
+ unsigned int i, total = 0, errors = 0;
+ int err, ret = 0;
mutex_lock(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
if (kprobes_all_disarmed) {
mutex_unlock(&kprobe_mutex);
- return;
+ return 0;
}
kprobes_all_disarmed = true;
- printk(KERN_INFO "Kprobes globally disabled\n");
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
+ /* Disarm all kprobes on a best-effort basis */
hlist_for_each_entry_rcu(p, head, hlist) {
- if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- disarm_kprobe(p, false);
+ if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
+ err = disarm_kprobe(p, false);
+ if (err) {
+ errors++;
+ ret = err;
+ }
+ total++;
+ }
}
}
+
+ if (errors)
+ pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
+ errors, total);
+ else
+ pr_info("Kprobes globally disabled\n");
+
mutex_unlock(&kprobe_mutex);
/* Wait for disarming all kprobes by optimizer */
wait_for_kprobe_optimizer();
+
+ return ret;
}
/*
@@ -2494,6 +2568,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
{
char buf[32];
size_t buf_size;
+ int ret = 0;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -2504,17 +2579,20 @@ static ssize_t write_enabled_file_bool(struct file *file,
case 'y':
case 'Y':
case '1':
- arm_all_kprobes();
+ ret = arm_all_kprobes();
break;
case 'n':
case 'N':
case '0':
- disarm_all_kprobes();
+ ret = disarm_all_kprobes();
break;
default:
return -EINVAL;
}
+ if (ret)
+ return ret;
+
return count;
}
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index de9e45dca70f..3a4656fb7047 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
/*
* A reference is taken on the patch module to prevent it from being
* unloaded.
- *
- * Note: For immediate (no consistency model) patches we don't allow
- * patch modules to unload since there is no safe/sane method to
- * determine if a thread is still running in the patched code contained
- * in the patch module once the ftrace registration is successful.
*/
if (!try_module_get(patch->mod))
return -ENODEV;
@@ -454,6 +449,8 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
+ * /sys/kernel/livepatch/<patch>/signal
+ * /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
@@ -528,11 +525,73 @@ static ssize_t transition_show(struct kobject *kobj,
patch == klp_transition_patch);
}
+static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ bool val;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return count;
+
+ mutex_lock(&klp_mutex);
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ if (patch != klp_transition_patch) {
+ mutex_unlock(&klp_mutex);
+ return -EINVAL;
+ }
+
+ klp_send_signals();
+
+ mutex_unlock(&klp_mutex);
+
+ return count;
+}
+
+static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ bool val;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return count;
+
+ mutex_lock(&klp_mutex);
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ if (patch != klp_transition_patch) {
+ mutex_unlock(&klp_mutex);
+ return -EINVAL;
+ }
+
+ klp_force_transition();
+
+ mutex_unlock(&klp_mutex);
+
+ return count;
+}
+
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
+static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
+static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
+ &signal_kobj_attr.attr,
+ &force_kobj_attr.attr,
NULL
};
@@ -830,12 +889,7 @@ int klp_register_patch(struct klp_patch *patch)
if (!klp_initialized())
return -ENODEV;
- /*
- * Architectures without reliable stack traces have to set
- * patch->immediate because there's currently no way to patch kthreads
- * with the consistency model.
- */
- if (!klp_have_reliable_stack() && !patch->immediate) {
+ if (!klp_have_reliable_stack()) {
pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
return -ENOSYS;
}
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 56add6327736..7c6631e693bc 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch;
static int klp_target_state = KLP_UNDEFINED;
+static bool klp_forced = false;
+
/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
@@ -80,7 +82,6 @@ static void klp_complete_transition(void)
struct klp_func *func;
struct task_struct *g, *task;
unsigned int cpu;
- bool immediate_func = false;
pr_debug("'%s': completing %s transition\n",
klp_transition_patch->mod->name,
@@ -102,16 +103,9 @@ static void klp_complete_transition(void)
klp_synchronize_transition();
}
- if (klp_transition_patch->immediate)
- goto done;
-
- klp_for_each_object(klp_transition_patch, obj) {
- klp_for_each_func(obj, func) {
+ klp_for_each_object(klp_transition_patch, obj)
+ klp_for_each_func(obj, func)
func->transition = false;
- if (func->immediate)
- immediate_func = true;
- }
- }
/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
if (klp_target_state == KLP_PATCHED)
@@ -130,7 +124,6 @@ static void klp_complete_transition(void)
task->patch_state = KLP_UNDEFINED;
}
-done:
klp_for_each_object(klp_transition_patch, obj) {
if (!klp_is_object_loaded(obj))
continue;
@@ -144,13 +137,11 @@ done:
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
- * See complementary comment in __klp_enable_patch() for why we
- * keep the module reference for immediate patches.
+ * klp_forced set implies unbounded increase of module's ref count if
+ * the module is disabled/enabled in a loop.
*/
- if (!klp_transition_patch->immediate && !immediate_func &&
- klp_target_state == KLP_UNPATCHED) {
+ if (!klp_forced && klp_target_state == KLP_UNPATCHED)
module_put(klp_transition_patch->mod);
- }
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
@@ -218,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func,
struct klp_ops *ops;
int i;
- if (func->immediate)
- return 0;
-
for (i = 0; i < trace->nr_entries; i++) {
address = trace->entries[i];
@@ -383,13 +371,6 @@ void klp_try_complete_transition(void)
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
/*
- * If the patch can be applied or reverted immediately, skip the
- * per-task transitions.
- */
- if (klp_transition_patch->immediate)
- goto success;
-
- /*
* Try to switch the tasks to the target patch state by walking their
* stacks and looking for any to-be-patched or to-be-unpatched
* functions. If such functions are found on a stack, or if the stack
@@ -432,7 +413,6 @@ void klp_try_complete_transition(void)
return;
}
-success:
/* we're done, now cleanup the data structures */
klp_complete_transition();
}
@@ -453,13 +433,6 @@ void klp_start_transition(void)
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
- * If the patch can be applied or reverted immediately, skip the
- * per-task transitions.
- */
- if (klp_transition_patch->immediate)
- return;
-
- /*
* Mark all normal tasks as needing a patch state update. They'll
* switch either in klp_try_complete_transition() or as they exit the
* kernel.
@@ -509,13 +482,6 @@ void klp_init_transition(struct klp_patch *patch, int state)
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
/*
- * If the patch can be applied or reverted immediately, skip the
- * per-task transitions.
- */
- if (patch->immediate)
- return;
-
- /*
* Initialize all tasks to the initial patch state to prepare them for
* switching to the target state.
*/
@@ -608,3 +574,71 @@ void klp_copy_process(struct task_struct *child)
/* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
}
+
+/*
+ * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
+ * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
+ * action currently.
+ */
+void klp_send_signals(void)
+{
+ struct task_struct *g, *task;
+
+ pr_notice("signaling remaining tasks\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ if (!klp_patch_pending(task))
+ continue;
+
+ /*
+ * There is a small race here. We could see TIF_PATCH_PENDING
+ * set and decide to wake up a kthread or send a fake signal.
+ * Meanwhile the task could migrate itself and the action
+ * would be meaningless. It is not serious though.
+ */
+ if (task->flags & PF_KTHREAD) {
+ /*
+ * Wake up a kthread which sleeps interruptedly and
+ * still has not been migrated.
+ */
+ wake_up_state(task, TASK_INTERRUPTIBLE);
+ } else {
+ /*
+ * Send fake signal to all non-kthread tasks which are
+ * still not migrated.
+ */
+ spin_lock_irq(&task->sighand->siglock);
+ signal_wake_up(task, 0);
+ spin_unlock_irq(&task->sighand->siglock);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
+ * existing transition to finish.
+ *
+ * NOTE: klp_update_patch_state(task) requires the task to be inactive or
+ * 'current'. This is not the case here and the consistency model could be
+ * broken. Administrator, who is the only one to execute the
+ * klp_force_transitions(), has to be aware of this.
+ */
+void klp_force_transition(void)
+{
+ struct task_struct *g, *task;
+ unsigned int cpu;
+
+ pr_warn("forcing remaining tasks to the patched state\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ klp_update_patch_state(task);
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu)
+ klp_update_patch_state(idle_task(cpu));
+
+ klp_forced = true;
+}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index 0f6e27c481f9..f9d0bc016067 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,5 +11,7 @@ void klp_cancel_transition(void);
void klp_start_transition(void);
void klp_try_complete_transition(void);
void klp_reverse_transition(void);
+void klp_send_signals(void);
+void klp_force_transition(void);
#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 670d8d7d8087..89b5f83f1969 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -49,6 +49,7 @@
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
@@ -57,10 +58,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#include <linux/slab.h>
-#endif
-
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
-#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-static int crossrelease_fullstack = 1;
-#else
-static int crossrelease_fullstack;
-#endif
-static int __init allow_crossrelease_fullstack(char *str)
-{
- crossrelease_fullstack = 1;
- return 0;
-}
-
-early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
-
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -664,18 +648,12 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
-/*
- * Register a lock's class in the hash-table, if the class is not present
- * yet. Otherwise we look it up. We cache the result in the lock object
- * itself, so actual lookup of the hash should be once per lock object.
- */
static inline struct lock_class *
-look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
- bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
@@ -688,24 +666,11 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
/*
- * Static locks do not have their class-keys yet - for them the key
- * is the lock object itself. If the lock is in the per cpu area,
- * the canonical address of the lock (per cpu offset removed) is
- * used.
+ * If it is not initialised then it has never been locked,
+ * so it won't be present in the hash table.
*/
- if (unlikely(!lock->key)) {
- unsigned long can_addr, addr = (unsigned long)lock;
-
- if (__is_kernel_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (__is_module_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (static_obj(lock))
- lock->key = (void *)lock;
- else
- return ERR_PTR(-EINVAL);
- is_static = true;
- }
+ if (unlikely(!lock->key))
+ return NULL;
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -737,20 +702,36 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
}
- return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
+ return NULL;
}
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-static void cross_init(struct lockdep_map *lock, int cross);
-static int cross_lock(struct lockdep_map *lock);
-static int lock_acquire_crosslock(struct held_lock *hlock);
-static int lock_release_crosslock(struct lockdep_map *lock);
-#else
-static inline void cross_init(struct lockdep_map *lock, int cross) {}
-static inline int cross_lock(struct lockdep_map *lock) { return 0; }
-static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
-static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
-#endif
+/*
+ * Static locks do not have their class-keys yet - for them the key is
+ * the lock object itself. If the lock is in the per cpu area, the
+ * canonical address of the lock (per cpu offset removed) is used.
+ */
+static bool assign_lock_key(struct lockdep_map *lock)
+{
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else {
+ /* Debug-check: all keys must be persistent! */
+ debug_locks_off();
+ pr_err("INFO: trying to register non-static key.\n");
+ pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("turning off the locking correctness validator.\n");
+ dump_stack();
+ return false;
+ }
+
+ return true;
+}
/*
* Register a lock's class in the hash-table, if the class is not present
@@ -767,18 +748,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(!IS_ERR_OR_NULL(class)))
+ if (likely(class))
goto out_set_class_cache;
- /*
- * Debug-check: all keys must be persistent!
- */
- if (IS_ERR(class)) {
- debug_locks_off();
- printk("INFO: trying to register non-static key.\n");
- printk("the code is fine but needs lockdep annotation.\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
+ if (!lock->key) {
+ if (!assign_lock_key(lock))
+ return NULL;
+ } else if (!static_obj(lock->key)) {
return NULL;
}
@@ -1151,41 +1127,22 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- if (cross_lock(tgt->instance)) {
- printk(" Possible unsafe locking scenario by crosslock:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk(" unlock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- } else {
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- }
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
}
/*
@@ -1211,10 +1168,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- if (cross_lock(check_tgt->instance))
- pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
- else
- pr_warn("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1198,7 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (cross_lock(check_tgt->instance))
- this->trace = *trace;
- else if (!save_trace(&this->trace))
+ if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1850,9 +1802,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- if (cross_lock(prev->instance))
- continue;
-
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -2018,31 +1967,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
+
/*
- * Only non-crosslock entries get new dependencies added.
- * Crosslock entries will be added by commit later:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!cross_lock(hlock->instance)) {
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ if (!ret)
+ return 0;
+
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
*/
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next,
- distance, &trace, save_trace);
- if (!ret)
- return 0;
-
- /*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
- */
- if (!hlock->trylock)
- break;
- }
+ if (!hlock->trylock)
+ break;
}
+
depth--;
/*
* End of lock-stack?
@@ -3292,21 +3236,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- cross_init(lock, 0);
__lockdep_init_map(lock, name, key, subclass);
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
-{
- cross_init(lock, 1);
- __lockdep_init_map(lock, name, key, subclass);
-}
-EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
-#endif
-
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3344,7 +3277,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
}
-static int __lock_is_held(struct lockdep_map *lock, int read);
+static int __lock_is_held(const struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3362,7 +3295,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
- int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3411,8 +3343,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- /* TODO: nest_lock is not implemented for crosslock yet. */
- if (depth && !cross_lock(lock)) {
+ if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3500,14 +3431,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
- ret = lock_acquire_crosslock(hlock);
- /*
- * 2 means normal acquire operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3563,13 +3486,14 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
}
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
if (hlock->references) {
- struct lock_class *class = lock->class_cache[0];
+ const struct lock_class *class = lock->class_cache[0];
if (!class)
class = look_up_lock_class(lock, 0);
@@ -3580,7 +3504,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (IS_ERR_OR_NULL(class))
+ if (!class)
return 0;
/*
@@ -3745,19 +3669,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int ret, i;
+ int i;
if (unlikely(!debug_locks))
return 0;
- ret = lock_release_crosslock(lock);
- /*
- * 2 means normal release operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -3813,7 +3729,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
return 1;
}
-static int __lock_is_held(struct lockdep_map *lock, int read)
+static int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -4027,7 +3943,7 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held_type(struct lockdep_map *lock, int read)
+int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
@@ -4384,7 +4300,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (!IS_ERR_OR_NULL(class))
+ if (class)
zap_class(class);
}
/*
@@ -4580,6 +4496,7 @@ retry:
if (!unlock)
if (read_trylock(&tasklist_lock))
unlock = 1;
+ touch_nmi_watchdog();
} while_each_thread(g, p);
pr_warn("\n");
@@ -4675,495 +4592,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-
-/*
- * Crossrelease works by recording a lock history for each thread and
- * connecting those historic locks that were taken after the
- * wait_for_completion() in the complete() context.
- *
- * Task-A Task-B
- *
- * mutex_lock(&A);
- * mutex_unlock(&A);
- *
- * wait_for_completion(&C);
- * lock_acquire_crosslock();
- * atomic_inc_return(&cross_gen_id);
- * |
- * | mutex_lock(&B);
- * | mutex_unlock(&B);
- * |
- * | complete(&C);
- * `-- lock_commit_crosslock();
- *
- * Which will then add a dependency between B and C.
- */
-
-#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
-
-/*
- * Whenever a crosslock is held, cross_gen_id will be increased.
- */
-static atomic_t cross_gen_id; /* Can be wrapped */
-
-/*
- * Make an entry of the ring buffer invalid.
- */
-static inline void invalidate_xhlock(struct hist_lock *xhlock)
-{
- /*
- * Normally, xhlock->hlock.instance must be !NULL.
- */
- xhlock->hlock.instance = NULL;
-}
-
-/*
- * Lock history stacks; we have 2 nested lock history stacks:
- *
- * HARD(IRQ)
- * SOFT(IRQ)
- *
- * The thing is that once we complete a HARD/SOFT IRQ the future task locks
- * should not depend on any of the locks observed while running the IRQ. So
- * what we do is rewind the history buffer and erase all our knowledge of that
- * temporal event.
- */
-
-void crossrelease_hist_start(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (!cur->xhlocks)
- return;
-
- cur->xhlock_idx_hist[c] = cur->xhlock_idx;
- cur->hist_id_save[c] = cur->hist_id;
-}
-
-void crossrelease_hist_end(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (cur->xhlocks) {
- unsigned int idx = cur->xhlock_idx_hist[c];
- struct hist_lock *h = &xhlock(idx);
-
- cur->xhlock_idx = idx;
-
- /* Check if the ring was overwritten. */
- if (h->hist_id != cur->hist_id_save[c])
- invalidate_xhlock(h);
- }
-}
-
-/*
- * lockdep_invariant_state() is used to annotate independence inside a task, to
- * make one task look like multiple independent 'tasks'.
- *
- * Take for instance workqueues; each work is independent of the last. The
- * completion of a future work does not depend on the completion of a past work
- * (in general). Therefore we must not carry that (lock) dependency across
- * works.
- *
- * This is true for many things; pretty much all kthreads fall into this
- * pattern, where they have an invariant state and future completions do not
- * depend on past completions. Its just that since they all have the 'same'
- * form -- the kthread does the same over and over -- it doesn't typically
- * matter.
- *
- * The same is true for system-calls, once a system call is completed (we've
- * returned to userspace) the next system call does not depend on the lock
- * history of the previous system call.
- *
- * They key property for independence, this invariant state, is that it must be
- * a point where we hold no locks and have no history. Because if we were to
- * hold locks, the restore at _end() would not necessarily recover it's history
- * entry. Similarly, independence per-definition means it does not depend on
- * prior state.
- */
-void lockdep_invariant_state(bool force)
-{
- /*
- * We call this at an invariant point, no current state, no history.
- * Verify the former, enforce the latter.
- */
- WARN_ON_ONCE(!force && current->lockdep_depth);
- if (current->xhlocks)
- invalidate_xhlock(&xhlock(current->xhlock_idx));
-}
-
-static int cross_lock(struct lockdep_map *lock)
-{
- return lock ? lock->cross : 0;
-}
-
-/*
- * This is needed to decide the relationship between wrapable variables.
- */
-static inline int before(unsigned int a, unsigned int b)
-{
- return (int)(a - b) < 0;
-}
-
-static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
-{
- return hlock_class(&xhlock->hlock);
-}
-
-static inline struct lock_class *xlock_class(struct cross_lock *xlock)
-{
- return hlock_class(&xlock->hlock);
-}
-
-/*
- * Should we check a dependency with previous one?
- */
-static inline int depend_before(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check && !hlock->trylock;
-}
-
-/*
- * Should we check a dependency with next one?
- */
-static inline int depend_after(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check;
-}
-
-/*
- * Check if the xhlock is valid, which would be false if,
- *
- * 1. Has not used after initializaion yet.
- * 2. Got invalidated.
- *
- * Remind hist_lock is implemented as a ring buffer.
- */
-static inline int xhlock_valid(struct hist_lock *xhlock)
-{
- /*
- * xhlock->hlock.instance must be !NULL.
- */
- return !!xhlock->hlock.instance;
-}
-
-/*
- * Record a hist_lock entry.
- *
- * Irq disable is only required.
- */
-static void add_xhlock(struct held_lock *hlock)
-{
- unsigned int idx = ++current->xhlock_idx;
- struct hist_lock *xhlock = &xhlock(idx);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * This can be done locklessly because they are all task-local
- * state, we must however ensure IRQs are disabled.
- */
- WARN_ON_ONCE(!irqs_disabled());
-#endif
-
- /* Initialize hist_lock's members */
- xhlock->hlock = *hlock;
- xhlock->hist_id = ++current->hist_id;
-
- xhlock->trace.nr_entries = 0;
- xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
- xhlock->trace.entries = xhlock->trace_entries;
-
- if (crossrelease_fullstack) {
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
- } else {
- xhlock->trace.nr_entries = 1;
- xhlock->trace.entries[0] = hlock->acquire_ip;
- }
-}
-
-static inline int same_context_xhlock(struct hist_lock *xhlock)
-{
- return xhlock->hlock.irq_context == task_irq_context(current);
-}
-
-/*
- * This should be lockless as far as possible because this would be
- * called very frequently.
- */
-static void check_add_xhlock(struct held_lock *hlock)
-{
- /*
- * Record a hist_lock, only in case that acquisitions ahead
- * could depend on the held_lock. For example, if the held_lock
- * is trylock then acquisitions ahead never depends on that.
- * In that case, we don't need to record it. Just return.
- */
- if (!current->xhlocks || !depend_before(hlock))
- return;
-
- add_xhlock(hlock);
-}
-
-/*
- * For crosslock.
- */
-static int add_xlock(struct held_lock *hlock)
-{
- struct cross_lock *xlock;
- unsigned int gen_id;
-
- if (!graph_lock())
- return 0;
-
- xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
-
- /*
- * When acquisitions for a crosslock are overlapped, we use
- * nr_acquire to perform commit for them, based on cross_gen_id
- * of the first acquisition, which allows to add additional
- * dependencies.
- *
- * Moreover, when no acquisition of a crosslock is in progress,
- * we should not perform commit because the lock might not exist
- * any more, which might cause incorrect memory access. So we
- * have to track the number of acquisitions of a crosslock.
- *
- * depend_after() is necessary to initialize only the first
- * valid xlock so that the xlock can be used on its commit.
- */
- if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
- goto unlock;
-
- gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
- xlock->hlock = *hlock;
- xlock->hlock.gen_id = gen_id;
-unlock:
- graph_unlock();
- return 1;
-}
-
-/*
- * Called for both normal and crosslock acquires. Normal locks will be
- * pushed on the hist_lock queue. Cross locks will record state and
- * stop regular lock_acquire() to avoid being placed on the held_lock
- * stack.
- *
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_acquire_crosslock(struct held_lock *hlock)
-{
- /*
- * CONTEXT 1 CONTEXT 2
- * --------- ---------
- * lock A (cross)
- * X = atomic_inc_return(&cross_gen_id)
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- * Y = atomic_read_acquire(&cross_gen_id)
- * lock B
- *
- * atomic_read_acquire() is for ordering between A and B,
- * IOW, A happens before B, when CONTEXT 2 see Y >= X.
- *
- * Pairs with atomic_inc_return() in add_xlock().
- */
- hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
-
- if (cross_lock(hlock->instance))
- return add_xlock(hlock);
-
- check_add_xhlock(hlock);
- return 2;
-}
-
-static int copy_trace(struct stack_trace *trace)
-{
- unsigned long *buf = stack_trace + nr_stack_trace_entries;
- unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- unsigned int nr = min(max_nr, trace->nr_entries);
-
- trace->nr_entries = nr;
- memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
- trace->entries = buf;
- nr_stack_trace_entries += nr;
-
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
- if (!debug_locks_off_graph_unlock())
- return 0;
-
- print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
- dump_stack();
-
- return 0;
- }
-
- return 1;
-}
-
-static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
-{
- unsigned int xid, pid;
- u64 chain_key;
-
- xid = xlock_class(xlock) - lock_classes;
- chain_key = iterate_chain_key((u64)0, xid);
- pid = xhlock_class(xhlock) - lock_classes;
- chain_key = iterate_chain_key(chain_key, pid);
-
- if (lookup_chain_cache(chain_key))
- return 1;
-
- if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
- chain_key))
- return 0;
-
- if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
- &xhlock->trace, copy_trace))
- return 0;
-
- return 1;
-}
-
-static void commit_xhlocks(struct cross_lock *xlock)
-{
- unsigned int cur = current->xhlock_idx;
- unsigned int prev_hist_id = xhlock(cur).hist_id;
- unsigned int i;
-
- if (!graph_lock())
- return;
-
- if (xlock->nr_acquire) {
- for (i = 0; i < MAX_XHLOCKS_NR; i++) {
- struct hist_lock *xhlock = &xhlock(cur - i);
-
- if (!xhlock_valid(xhlock))
- break;
-
- if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
- break;
-
- if (!same_context_xhlock(xhlock))
- break;
-
- /*
- * Filter out the cases where the ring buffer was
- * overwritten and the current entry has a bigger
- * hist_id than the previous one, which is impossible
- * otherwise:
- */
- if (unlikely(before(prev_hist_id, xhlock->hist_id)))
- break;
-
- prev_hist_id = xhlock->hist_id;
-
- /*
- * commit_xhlock() returns 0 with graph_lock already
- * released if fail.
- */
- if (!commit_xhlock(xlock, xhlock))
- return;
- }
- }
-
- graph_unlock();
-}
-
-void lock_commit_crosslock(struct lockdep_map *lock)
-{
- struct cross_lock *xlock;
- unsigned long flags;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
- return;
-
- if (!current->xhlocks)
- return;
-
- /*
- * Do commit hist_locks with the cross_lock, only in case that
- * the cross_lock could depend on acquisitions after that.
- *
- * For example, if the cross_lock does not have the 'check' flag
- * then we don't need to check dependencies and commit for that.
- * Just skip it. In that case, of course, the cross_lock does
- * not depend on acquisitions ahead, either.
- *
- * WARNING: Don't do that in add_xlock() in advance. When an
- * acquisition context is different from the commit context,
- * invalid(skipped) cross_lock might be accessed.
- */
- if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- xlock = &((struct lockdep_map_cross *)lock)->xlock;
- commit_xhlocks(xlock);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(lock_commit_crosslock);
-
-/*
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_release_crosslock(struct lockdep_map *lock)
-{
- if (cross_lock(lock)) {
- if (!graph_lock())
- return 0;
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
- graph_unlock();
- return 1;
- }
- return 2;
-}
-
-static void cross_init(struct lockdep_map *lock, int cross)
-{
- if (cross)
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
-
- lock->cross = cross;
-
- /*
- * Crossrelease assumes that the ring buffer size of xhlocks
- * is aligned with power of 2. So force it on build.
- */
- BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
-}
-
-void lockdep_init_task(struct task_struct *task)
-{
- int i;
-
- task->xhlock_idx = UINT_MAX;
- task->hist_id = 0;
-
- for (i = 0; i < XHLOCK_CTX_NR; i++) {
- task->xhlock_idx_hist[i] = UINT_MAX;
- task->hist_id_save[i] = 0;
- }
-
- task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
- GFP_KERNEL);
-}
-
-void lockdep_free_task(struct task_struct *task)
-{
- if (task->xhlocks) {
- void *tmp = task->xhlocks;
- /* Diable crossrelease for current */
- task->xhlocks = NULL;
- kfree(tmp);
- }
-}
-#endif
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f24582d4dad3..6850ffd69125 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -77,10 +77,6 @@ struct lock_stress_stats {
long n_lock_acquired;
};
-int torture_runnable = IS_ENABLED(MODULE);
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
-
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -130,10 +126,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_lock_busted_write_unlock(void)
@@ -179,10 +173,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
@@ -352,10 +344,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp)
mdelay(longdelay_ms * 5);
else
mdelay(longdelay_ms / 5);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_mutex_unlock(void) __releases(torture_mutex)
@@ -507,10 +497,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
@@ -547,10 +535,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp)
mdelay(longdelay_ms * 10);
else
mdelay(longdelay_ms / 10);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rwsem_up_write(void) __releases(torture_rwsem)
@@ -570,14 +556,12 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp)
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 2);
else
mdelay(longdelay_ms / 2);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rwsem_up_read(void) __releases(torture_rwsem)
@@ -715,8 +699,7 @@ static void __torture_print_stats(char *page,
{
bool fail = 0;
int i, n_stress;
- long max = 0;
- long min = statp[0].n_lock_acquired;
+ long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
@@ -823,7 +806,7 @@ static void lock_torture_cleanup(void)
* such, only perform the underlying torture-specific cleanups,
* and avoid anything related to locktorture.
*/
- if (!cxt.lwsa)
+ if (!cxt.lwsa && !cxt.lrsa)
goto end;
if (writer_tasks) {
@@ -879,7 +862,7 @@ static int __init lock_torture_init(void)
&percpu_rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -898,6 +881,13 @@ static int __init lock_torture_init(void)
firsterr = -EINVAL;
goto unwind;
}
+
+ if (nwriters_stress == 0 && nreaders_stress == 0) {
+ pr_alert("lock-torture: must run at least one locking thread\n");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+
if (cxt.cur_ops->init)
cxt.cur_ops->init();
@@ -921,17 +911,19 @@ static int __init lock_torture_init(void)
#endif
/* Initialize the statistics so that each run gets its own numbers. */
+ if (nwriters_stress) {
+ lock_is_write_held = 0;
+ cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
- lock_is_write_held = 0;
- cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
- if (cxt.lwsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < cxt.nrealwriters_stress; i++) {
- cxt.lwsa[i].n_lock_fail = 0;
- cxt.lwsa[i].n_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
}
if (cxt.cur_ops->readlock) {
@@ -948,19 +940,21 @@ static int __init lock_torture_init(void)
cxt.nrealreaders_stress = cxt.nrealwriters_stress;
}
- lock_is_read_held = 0;
- cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
- if (cxt.lrsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
- firsterr = -ENOMEM;
- kfree(cxt.lwsa);
- cxt.lwsa = NULL;
- goto unwind;
- }
-
- for (i = 0; i < cxt.nrealreaders_stress; i++) {
- cxt.lrsa[i].n_lock_fail = 0;
- cxt.lrsa[i].n_lock_acquired = 0;
+ if (nreaders_stress) {
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
}
}
@@ -990,12 +984,14 @@ static int __init lock_torture_init(void)
goto unwind;
}
- writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
- GFP_KERNEL);
- if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nwriters_stress) {
+ writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
if (cxt.cur_ops->readlock) {
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 294294c71ba4..d880296245c5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
* @tail : The new queue tail code word
* Return: The previous queue tail code word
*
- * xchg(lock, tail)
+ * xchg(lock, tail), which heads an address dependency
*
* p,*,* -> n,*,* ; prev = xchg(lock, node)
*/
@@ -379,6 +379,14 @@ queue:
tail = encode_tail(smp_processor_id(), idx);
node += idx;
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
node->locked = 0;
node->next = NULL;
pv_init_node(node);
@@ -408,16 +416,15 @@ queue:
*/
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
+
/*
- * The above xchg_tail() is also a load of @lock which generates,
- * through decode_tail(), a pointer.
- *
- * The address dependency matches the RELEASE of xchg_tail()
- * such that the access to @prev must happen after.
+ * We must ensure that the stores to @node are observed before
+ * the write to prev->next. The address dependency from
+ * xchg_tail is not sufficient to ensure this because the read
+ * component of xchg_tail is unordered with respect to the
+ * initialisation of @node.
*/
- smp_read_barrier_depends();
-
- WRITE_ONCE(prev->next, node);
+ smp_store_release(&prev->next, node);
pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
break; \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
} \
\
unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
local_irq_restore(flags); \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
+ \
return flags; \
} \
\
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 403ab9cdb949..4849be5f9b3c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-struct page_map {
- struct resource res;
- struct percpu_ref *ref;
- struct dev_pagemap pgmap;
- struct vmem_altmap altmap;
-};
-
static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
@@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma,
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
-static void pgmap_radix_release(struct resource *res)
+static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
{
unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- foreach_order_pgoff(res, order, pgoff)
+ foreach_order_pgoff(res, order, pgoff) {
+ if (pgoff >= end_pgoff)
+ break;
radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
+ }
mutex_unlock(&pgmap_lock);
synchronize_rcu();
}
-static unsigned long pfn_first(struct page_map *page_map)
+static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
- struct dev_pagemap *pgmap = &page_map->pgmap;
- const struct resource *res = &page_map->res;
- struct vmem_altmap *altmap = pgmap->altmap;
+ const struct resource *res = &pgmap->res;
+ struct vmem_altmap *altmap = &pgmap->altmap;
unsigned long pfn;
pfn = res->start >> PAGE_SHIFT;
- if (altmap)
+ if (pgmap->altmap_valid)
pfn += vmem_altmap_offset(altmap);
return pfn;
}
-static unsigned long pfn_end(struct page_map *page_map)
+static unsigned long pfn_end(struct dev_pagemap *pgmap)
{
- const struct resource *res = &page_map->res;
+ const struct resource *res = &pgmap->res;
return (res->start + resource_size(res)) >> PAGE_SHIFT;
}
@@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map)
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
-static void devm_memremap_pages_release(struct device *dev, void *data)
+static void devm_memremap_pages_release(void *data)
{
- struct page_map *page_map = data;
- struct resource *res = &page_map->res;
+ struct dev_pagemap *pgmap = data;
+ struct device *dev = pgmap->dev;
+ struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
- struct dev_pagemap *pgmap = &page_map->pgmap;
unsigned long pfn;
- for_each_device_pfn(pfn, page_map)
+ for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
if (percpu_ref_tryget_live(pgmap->ref)) {
@@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+ - align_start;
mem_hotplug_begin();
- arch_remove_memory(align_start, align_size);
+ arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
+ &pgmap->altmap : NULL);
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_radix_release(res);
- dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
- "%s: failed to free all reserved pages\n", __func__);
-}
-
-/* assumes rcu_read_lock() held at entry */
-struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
-{
- struct page_map *page_map;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
- return page_map ? &page_map->pgmap : NULL;
+ pgmap_radix_release(res, -1);
+ dev_WARN_ONCE(dev, pgmap->altmap.alloc,
+ "%s: failed to free all reserved pages\n", __func__);
}
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
- * @res: "host memory" address range
- * @ref: a live per-cpu reference count
- * @altmap: optional descriptor for allocating the memmap from @res
+ * @pgmap: pointer to a struct dev_pgmap
*
* Notes:
- * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- * (or devm release event). The expected order of events is that @ref has
+ * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
+ * by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case altmap_valid
+ * must be set to true
+ *
+ * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
+ * time (or devm release event). The expected order of events is that ref has
* been through percpu_ref_kill() before devm_memremap_pages_release(). The
* wait for the completion of all references being dropped and
* percpu_ref_exit() must occur after devm_memremap_pages_release().
*
- * 2/ @res is expected to be a host memory range that could feasibly be
+ * 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
* this is not enforced.
*/
-void *devm_memremap_pages(struct device *dev, struct resource *res,
- struct percpu_ref *ref, struct vmem_altmap *altmap)
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
resource_size_t align_start, align_size, align_end;
+ struct vmem_altmap *altmap = pgmap->altmap_valid ?
+ &pgmap->altmap : NULL;
unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
- struct dev_pagemap *pgmap;
- struct page_map *page_map;
int error, nid, is_ram, i = 0;
+ struct resource *res = &pgmap->res;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (!ref)
+ if (!pgmap->ref)
return ERR_PTR(-EINVAL);
- page_map = devres_alloc_node(devm_memremap_pages_release,
- sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
- if (!page_map)
- return ERR_PTR(-ENOMEM);
- pgmap = &page_map->pgmap;
-
- memcpy(&page_map->res, res, sizeof(*res));
-
pgmap->dev = dev;
- if (altmap) {
- memcpy(&page_map->altmap, altmap, sizeof(*altmap));
- pgmap->altmap = &page_map->altmap;
- }
- pgmap->ref = ref;
- pgmap->res = &page_map->res;
- pgmap->type = MEMORY_DEVICE_HOST;
- pgmap->page_fault = NULL;
- pgmap->page_free = NULL;
- pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
foreach_order_pgoff(res, order, pgoff) {
- struct dev_pagemap *dup;
-
- rcu_read_lock();
- dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
- rcu_read_unlock();
- if (dup) {
- dev_err(dev, "%s: %pr collides with mapping for %s\n",
- __func__, res, dev_name(dup->dev));
- error = -EBUSY;
- break;
- }
error = __radix_tree_insert(&pgmap_radix,
- PHYS_PFN(res->start) + pgoff, order, page_map);
+ PHYS_PFN(res->start) + pgoff, order, pgmap);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
@@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
goto err_pfn_remap;
mem_hotplug_begin();
- error = arch_add_memory(nid, align_start, align_size, false);
+ error = arch_add_memory(nid, align_start, align_size, altmap, false);
if (!error)
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT);
+ align_size >> PAGE_SHIFT, altmap);
mem_hotplug_done();
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, page_map) {
+ for_each_device_pfn(pfn, pgmap) {
struct page *page = pfn_to_page(pfn);
/*
@@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
*/
list_del(&page->lru);
page->pgmap = pgmap;
- percpu_ref_get(ref);
+ percpu_ref_get(pgmap->ref);
if (!(++i % 1024))
cond_resched();
}
- devres_add(dev, page_map);
+
+ devm_add_action(dev, devm_memremap_pages_release, pgmap);
+
return __va(res->start);
err_add_memory:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
err_radix:
- pgmap_radix_release(res);
- devres_free(page_map);
+ pgmap_radix_release(res, pgoff);
+ devres_free(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
@@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+ struct dev_pagemap *pgmap)
{
- /*
- * 'memmap_start' is the virtual address for the first "struct
- * page" in this range of the vmemmap array. In the case of
- * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
- * pointer arithmetic, so we can perform this to_vmem_altmap()
- * conversion without concern for the initialization state of
- * the struct page fields.
- */
- struct page *page = (struct page *) memmap_start;
- struct dev_pagemap *pgmap;
+ resource_size_t phys = PFN_PHYS(pfn);
/*
- * Unconditionally retrieve a dev_pagemap associated with the
- * given physical address, this is only for use in the
- * arch_{add|remove}_memory() for setting up and tearing down
- * the memmap.
+ * In the cached case we're already holding a live reference.
*/
+ if (pgmap) {
+ if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ return pgmap;
+ put_dev_pagemap(pgmap);
+ }
+
+ /* fall back to slow path lookup */
rcu_read_lock();
- pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
+ if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ pgmap = NULL;
rcu_read_unlock();
- return pgmap ? pgmap->altmap : NULL;
+ return pgmap;
}
#endif /* CONFIG_ZONE_DEVICE */
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page)
{
diff --git a/kernel/module.c b/kernel/module.c
index dea01ac9cb74..ad2d420024f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
}
#endif /* CONFIG_LIVEPATCH */
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
+ check_modinfo_retpoline(mod, info);
+
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
pr_warn("%s: module is from the staging directory, the quality "
@@ -3118,7 +3129,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->ftrace_callsites),
&mod->num_ftrace_callsites);
#endif
-
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ sizeof(*mod->ei_funcs),
+ &mod->num_ei_funcs);
+#endif
mod->extable = section_objs(info, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
@@ -3789,6 +3804,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_disable_nx(mod);
ddebug_cleanup:
+ ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
synchronize_sched();
kfree(mod->args);
@@ -3808,12 +3824,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
- /*
- * Ftrace needs to clean up what it initialized.
- * This does nothing if ftrace_module_init() wasn't called,
- * but it must be called outside of module_mutex.
- */
- ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
@@ -3938,6 +3948,12 @@ static const char *get_ksymbol(struct module *mod,
return symname(kallsyms, best);
}
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
/* For kallsyms to ask for address resolution. NULL means not found. Careful
* not to lock to avoid deadlock on oopses, simply disable preemption. */
const char *module_address_lookup(unsigned long addr,
diff --git a/kernel/padata.c b/kernel/padata.c
index 57c0074d50cc..d568cc56405f 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* padata.c - generic interface to process data streams in parallel
*
diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..ed6c343fe50d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,19 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
-struct pid init_struct_pid = INIT_STRUCT_PID;
+struct pid init_struct_pid = {
+ .count = ATOMIC_INIT(1),
+ .tasks = {
+ { .first = NULL },
+ { .first = NULL },
+ { .first = NULL },
+ },
+ .level = 0,
+ .numbers = { {
+ .nr = 0,
+ .ns = &init_pid_ns,
+ }, }
+};
int pid_max = PID_MAX_DEFAULT;
@@ -193,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns)) {
- disable_pid_allocation(ns);
+ if (pid_ns_prepare_proc(ns))
goto out_free;
- }
}
get_pid_ns(ns);
@@ -226,6 +236,10 @@ out_free:
while (++i <= ns->level)
idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ /* On failure to allocate the first pid, reset the state */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
@@ -329,6 +343,19 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
+struct task_struct *find_get_task_by_vpid(pid_t nr)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = find_task_by_vpid(nr);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
+
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3a2ca9066583..705c2366dafe 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex);
#ifdef CONFIG_PM_SLEEP
+void lock_system_sleep(void)
+{
+ current->flags |= PF_FREEZER_SKIP;
+ mutex_lock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_system_sleep);
+
+void unlock_system_sleep(void)
+{
+ /*
+ * Don't use freezer_count() because we don't want the call to
+ * try_to_freeze() here.
+ *
+ * Reason:
+ * Fundamentally, we just don't need it, because freezing condition
+ * doesn't come into effect until we release the pm_mutex lock,
+ * since the freezer always works with pm_mutex held.
+ *
+ * More importantly, in the case of hibernation,
+ * unlock_system_sleep() gets called in snapshot_read() and
+ * snapshot_write() when the freezing condition is still in effect.
+ * Which means, if we use try_to_freeze() here, it would make them
+ * enter the refrigerator, thus causing hibernation to lockup.
+ */
+ current->flags &= ~PF_FREEZER_SKIP;
+ mutex_unlock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_system_sleep);
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f29cd178df90..9e58bdc8a562 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -104,9 +104,6 @@ extern int in_suspend;
extern dev_t swsusp_resume_device;
extern sector_t swsusp_resume_block;
-extern asmlinkage int swsusp_arch_suspend(void);
-extern asmlinkage int swsusp_arch_resume(void);
-
extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bce0464524d8..3d37c279c090 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
- * minus mapped file pages.
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
{
@@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
- + global_node_page_state(NR_INACTIVE_FILE)
- - global_node_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_INACTIVE_FILE);
return saveable <= size ? 0 : saveable - size;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 293ead59eccc..11b4282c2d20 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb)
static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- struct page *page = bio->bi_io_vec[0].bv_page;
+ struct page *page = bio_first_page_all(bio);
if (bio->bi_status) {
pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
@@ -879,7 +879,7 @@ out_clean:
* space avaiable from the resume partition.
*/
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
@@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags)
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
- if (!enough_swap(pages, flags)) {
+ if (!enough_swap(pages)) {
pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..fc1123583fa6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str)
/*
* Set sysctl string accordingly:
*/
- if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
- memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
- strncpy(devkmsg_log_str, "on", 2);
- } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
- memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
- strncpy(devkmsg_log_str, "off", 3);
- }
+ if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
+ strcpy(devkmsg_log_str, "on");
+ else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
+ strcpy(devkmsg_log_str, "off");
/* else "ratelimit" which is set by default. */
/*
@@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+enum con_msg_format_flags {
+ MSG_FORMAT_DEFAULT = 0,
+ MSG_FORMAT_SYSLOG = (1 << 0),
+};
+
+static int console_msg_format = MSG_FORMAT_DEFAULT;
+
/*
* The printk log buffer consists of a chain of concatenated variable
* length records. Every record starts with a record header, containing
@@ -920,13 +924,13 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
-static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
- int ret = 0;
+ __poll_t ret = 0;
if (!user)
- return POLLERR|POLLNVAL;
+ return EPOLLERR|EPOLLNVAL;
poll_wait(file, &log_wait, wait);
@@ -934,9 +938,9 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
if (user->seq < log_next_seq) {
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
- ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
- ret = POLLIN|POLLRDNORM;
+ ret = EPOLLIN|EPOLLRDNORM;
}
logbuf_unlock_irq();
@@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
}
/*
+ * Special console_lock variants that help to reduce the risk of soft-lockups.
+ * They allow to pass console_lock to another printk() call using a busy wait.
+ */
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_owner_dep_map = {
+ .name = "console_owner"
+};
+#endif
+
+static DEFINE_RAW_SPINLOCK(console_owner_lock);
+static struct task_struct *console_owner;
+static bool console_waiter;
+
+/**
+ * console_lock_spinning_enable - mark beginning of code where another
+ * thread might safely busy wait
+ *
+ * This basically converts console_lock into a spinlock. This marks
+ * the section where the console_lock owner can not sleep, because
+ * there may be a waiter spinning (like a spinlock). Also it must be
+ * ready to hand over the lock at the end of the section.
+ */
+static void console_lock_spinning_enable(void)
+{
+ raw_spin_lock(&console_owner_lock);
+ console_owner = current;
+ raw_spin_unlock(&console_owner_lock);
+
+ /* The waiter may spin on us after setting console_owner */
+ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
+}
+
+/**
+ * console_lock_spinning_disable_and_check - mark end of code where another
+ * thread was able to busy wait and check if there is a waiter
+ *
+ * This is called at the end of the section where spinning is allowed.
+ * It has two functions. First, it is a signal that it is no longer
+ * safe to start busy waiting for the lock. Second, it checks if
+ * there is a busy waiter and passes the lock rights to her.
+ *
+ * Important: Callers lose the lock if there was a busy waiter.
+ * They must not touch items synchronized by console_lock
+ * in this case.
+ *
+ * Return: 1 if the lock rights were passed, 0 otherwise.
+ */
+static int console_lock_spinning_disable_and_check(void)
+{
+ int waiter;
+
+ raw_spin_lock(&console_owner_lock);
+ waiter = READ_ONCE(console_waiter);
+ console_owner = NULL;
+ raw_spin_unlock(&console_owner_lock);
+
+ if (!waiter) {
+ spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ return 0;
+ }
+
+ /* The waiter is now free to continue */
+ WRITE_ONCE(console_waiter, false);
+
+ spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+
+ /*
+ * Hand off console_lock to waiter. The waiter will perform
+ * the up(). After this, the waiter is the console_lock owner.
+ */
+ mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
+ return 1;
+}
+
+/**
+ * console_trylock_spinning - try to get console_lock by busy waiting
+ *
+ * This allows to busy wait for the console_lock when the current
+ * owner is running in specially marked sections. It means that
+ * the current owner is running and cannot reschedule until it
+ * is ready to lose the lock.
+ *
+ * Return: 1 if we got the lock, 0 othrewise
+ */
+static int console_trylock_spinning(void)
+{
+ struct task_struct *owner = NULL;
+ bool waiter;
+ bool spin = false;
+ unsigned long flags;
+
+ if (console_trylock())
+ return 1;
+
+ printk_safe_enter_irqsave(flags);
+
+ raw_spin_lock(&console_owner_lock);
+ owner = READ_ONCE(console_owner);
+ waiter = READ_ONCE(console_waiter);
+ if (!waiter && owner && owner != current) {
+ WRITE_ONCE(console_waiter, true);
+ spin = true;
+ }
+ raw_spin_unlock(&console_owner_lock);
+
+ /*
+ * If there is an active printk() writing to the
+ * consoles, instead of having it write our data too,
+ * see if we can offload that load from the active
+ * printer, and do some printing ourselves.
+ * Go into a spin only if there isn't already a waiter
+ * spinning, and there is an active printer, and
+ * that active printer isn't us (recursive printk?).
+ */
+ if (!spin) {
+ printk_safe_exit_irqrestore(flags);
+ return 0;
+ }
+
+ /* We spin waiting for the owner to release us */
+ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
+ /* Owner will clear console_waiter on hand off */
+ while (READ_ONCE(console_waiter))
+ cpu_relax();
+ spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+
+ printk_safe_exit_irqrestore(flags);
+ /*
+ * The owner passed the console lock to us.
+ * Since we did not spin on console lock, annotate
+ * this as a trylock. Otherwise lockdep will
+ * complain.
+ */
+ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+
+ return 1;
+}
+
+/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
@@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level,
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
/*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+ /*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock())
+ if (console_trylock_spinning())
console_unlock();
+ preempt_enable();
}
return printed_len;
@@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
+static void console_lock_spinning_enable(void) { }
+static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
@@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options,
c->index = idx;
return 0;
}
+
+static int __init console_msg_format_setup(char *str)
+{
+ if (!strcmp(str, "syslog"))
+ console_msg_format = MSG_FORMAT_SYSLOG;
+ if (!strcmp(str, "default"))
+ console_msg_format = MSG_FORMAT_DEFAULT;
+ return 1;
+}
+__setup("console_msg_format=", console_msg_format_setup);
+
/*
* Set up a console. Called via do_early_param() in init/main.c
* for each "console=" parameter in the boot command line.
@@ -2069,20 +2233,7 @@ int console_trylock(void)
return 0;
}
console_locked = 1;
- /*
- * When PREEMPT_COUNT disabled we can't reliably detect if it's
- * safe to schedule (e.g. calling printk while holding a spin_lock),
- * because preempt_disable()/preempt_enable() are just barriers there
- * and preempt_count() is always 0.
- *
- * RCU read sections have a separate preemption counter when
- * PREEMPT_RCU enabled thus we must take extra care and check
- * rcu_preempt_depth(), otherwise RCU read sections modify
- * preempt_count().
- */
- console_may_schedule = !oops_in_progress &&
- preemptible() &&
- !rcu_preempt_depth();
+ console_may_schedule = 0;
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -2215,7 +2366,10 @@ skip:
goto skip;
}
- len += msg_print_text(msg, false, text + len, sizeof(text) - len);
+ len += msg_print_text(msg,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ text + len,
+ sizeof(text) - len);
if (nr_ext_console_drivers) {
ext_len = msg_print_ext_header(ext_text,
sizeof(ext_text),
@@ -2229,14 +2383,29 @@ skip:
console_seq++;
raw_spin_unlock(&logbuf_lock);
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ */
+ console_lock_spinning_enable();
+
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
+
+ if (console_lock_spinning_disable_and_check()) {
+ printk_safe_exit_irqrestore(flags);
+ return;
+ }
+
printk_safe_exit_irqrestore(flags);
if (do_cond_resched)
cond_resched();
}
+
console_locked = 0;
/* Release the exclusive_console once it is used */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..21fec73d45d4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *info = *child->last_siginfo;
+ copy_siginfo(info, child->last_siginfo);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *child->last_siginfo = *info;
+ copy_siginfo(child->last_siginfo, info);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = seccomp_get_filter(child, addr, datavp);
break;
+ case PTRACE_SECCOMP_GET_METADATA:
+ ret = seccomp_get_metadata(child, addr, datavp);
+ break;
+
default:
break;
}
@@ -1099,21 +1103,6 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-static struct task_struct *ptrace_get_task_struct(pid_t pid)
-{
- struct task_struct *child;
-
- rcu_read_lock();
- child = find_task_by_vpid(pid);
- if (child)
- get_task_struct(child);
- rcu_read_unlock();
-
- if (!child)
- return ERR_PTR(-ESRCH);
- return child;
-}
-
#ifndef arch_ptrace_attach
#define arch_ptrace_attach(child) do { } while (0)
#endif
@@ -1131,9 +1120,9 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
@@ -1226,7 +1215,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
break;
case PTRACE_SETSIGINFO:
- memset(&siginfo, 0, sizeof siginfo);
if (copy_siginfo_from_user32(
&siginfo, (struct compat_siginfo __user *) datap))
ret = -EFAULT;
@@ -1278,9 +1266,9 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59c471de342a..6334f2c1abd0 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -30,31 +30,8 @@
#define RCU_TRACE(stmt)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-/*
- * Process-level increment to ->dynticks_nesting field. This allows for
- * architectures that use half-interrupts and half-exceptions from
- * process context.
- *
- * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
- * that counts the number of process-based reasons why RCU cannot
- * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
- * is the value used to increment or decrement this field.
- *
- * The rest of the bits could in principle be used to count interrupts,
- * but this would mean that a negative-one value in the interrupt
- * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
- * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
- * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
- * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
- * initial exit from idle.
- */
-#define DYNTICK_TASK_NEST_WIDTH 7
-#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
-#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
-#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
-#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
-#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
- DYNTICK_TASK_FLAG)
+/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */
+#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
/*
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 1f87a02c3399..d1ebdf9868bb 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -106,10 +106,6 @@ static int rcu_perf_writer_state;
#define MAX_MEAS 10000
#define MIN_MEAS 100
-static int perf_runnable = IS_ENABLED(MODULE);
-module_param(perf_runnable, int, 0444);
-MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
-
/*
* Operations vector for selecting different types of tests.
*/
@@ -646,7 +642,7 @@ rcu_perf_init(void)
&tasks_ops,
};
- if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+ if (!torture_init_begin(perf_type, verbose))
return -EBUSY;
/* Process args and tell the world that the perf'er is on the job. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 74f6b0146b98..308e6fdbced8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-static int torture_runnable = IS_ENABLED(MODULE);
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
-
#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
@@ -315,11 +311,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp)
}
if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!preempt_count() &&
- !(torture_random(rrsp) % (nrealreaders * 20000)))
- preempt_schedule(); /* No QS if preempt_disable() in effect */
-#endif
+ !(torture_random(rrsp) % (nrealreaders * 500)))
+ torture_preempt_schedule(); /* QS only if preemptible. */
}
static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -1731,7 +1725,7 @@ rcu_torture_init(void)
&sched_ops, &tasks_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d5880089ff6..d5cea81378cc 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_irq_rcu_node(p) \
+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+
/*
* Initialize SRCU combining tree. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
@@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
/* Each pass through this loop initializes one srcu_node structure. */
rcu_for_each_node_breadth_first(sp, snp) {
- raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp_first = sp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *sp)
{
- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
return;
}
init_srcu_struct_fields(sp, true);
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
mutex_lock(&sp->srcu_cb_mutex);
/* End the current grace period. */
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
idx = rcu_seq_state(sp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(sp);
@@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
sp->srcu_gp_seq_needed_exp = gpseq;
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
- raw_spin_lock_irq_rcu_node(snp);
+ spin_lock_irq_rcu_node(snp);
cbs = false;
if (snp >= sp->level[rcu_num_lvls - 1])
cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
snp->srcu_gp_seq_needed_exp = gpseq;
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- raw_spin_unlock_irq_rcu_node(snp);
+ spin_unlock_irq_rcu_node(snp);
if (cbs)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
@@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
if (!(gpseq & counter_wrap_check))
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irqsave_rcu_node(sdp, flags);
if (ULONG_CMP_GE(gpseq,
sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
}
}
@@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
mutex_unlock(&sp->srcu_cb_mutex);
/* Start a new grace period if needed. */
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
/* Throttle expedited grace periods: Should be rare! */
srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
? 0 : SRCU_INTERVAL);
} else {
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
}
}
@@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
return;
- raw_spin_lock_irqsave_rcu_node(snp, flags);
+ spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
for (; snp != NULL; snp = snp->srcu_parent) {
if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
return; /* GP already done and CBs recorded. */
- raw_spin_lock_irqsave_rcu_node(snp, flags);
+ spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
snp_seq = snp->srcu_have_cbs[idx];
if (snp == sdp->mynode && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == sdp->mynode && snp_seq != s) {
srcu_schedule_cbs_sdp(sdp, do_norm
? SRCU_INTERVAL
@@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
snp->srcu_gp_seq_needed_exp = s;
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
queue_delayed_work(system_power_efficient_wq, &sp->work,
srcu_get_delay(sp));
}
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rhp->func = func;
local_irq_save(flags);
sdp = this_cpu_ptr(sp->sda);
- raw_spin_lock_rcu_node(sdp);
+ spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
if (needgp)
srcu_funnel_gp_start(sp, sdp, s, do_norm);
else if (needexp)
@@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
/*
* Make sure that later code is ordered after the SRCU grace
- * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * period. This pairs with the spin_lock_irq_rcu_node()
* in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
* because the current CPU might have been totally uninvolved with
* (and thus unordered against) that grace period.
@@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
atomic_inc(&sp->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
@@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp)
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&sp->srcu_barrier_cpu_cnt);
}
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Remove the initial count, at which point reaching zero can happen. */
@@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
*/
idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(sp);
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&sp->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp = container_of(work, struct srcu_data, work.work);
sp = sdp->sp;
rcu_cblist_init(&ready_cbs);
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
@@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&sp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
{
bool pushgp = true;
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(sp);
}
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
if (pushgp)
queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2ccf0c..491bdf39f276 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -265,25 +265,12 @@ void rcu_bh_qs(void)
#endif
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
/*
- * There's a few places, currently just in the tracing infrastructure,
- * that uses rcu_irq_enter() to make sure RCU is watching. But there's
- * a small location where that will not even work. In those cases
- * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
- * can be called.
- */
-static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
-
-bool rcu_irq_enter_disabled(void)
-{
- return this_cpu_read(disable_rcu_irq_enter);
-}
-
-/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state.
*/
@@ -762,68 +749,39 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
*
- * Enter idle, doing appropriate accounting. The caller must have
- * disabled interrupts.
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
*/
-static void rcu_eqs_enter_common(bool user)
+static void rcu_eqs_enter(bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_dynticks *rdtp;
- lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
- if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
- rcu_ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
+ if (rdtp->dynticks_nesting != 1) {
+ rdtp->dynticks_nesting--;
+ return;
}
+
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- __this_cpu_inc(disable_rcu_irq_enter);
- rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
- rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
- __this_cpu_dec(disable_rcu_irq_enter);
+ WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ rcu_dynticks_eqs_enter();
rcu_dynticks_task_enter();
-
- /*
- * It is illegal to enter an extended quiescent state while
- * in an RCU read-side critical section.
- */
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
-}
-
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- */
-static void rcu_eqs_enter(bool user)
-{
- struct rcu_dynticks *rdtp;
-
- rdtp = this_cpu_ptr(&rcu_dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
- if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
- rcu_eqs_enter_common(user);
- else
- rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
/**
@@ -834,10 +792,6 @@ static void rcu_eqs_enter(bool user)
* critical sections can occur in irq handlers in idle, a possibility
* handled by irq_enter() and irq_exit().)
*
- * We crowbar the ->dynticks_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- *
* If you add or remove a call to rcu_idle_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
@@ -867,6 +821,46 @@ void rcu_user_enter(void)
#endif /* CONFIG_NO_HZ_FULL */
/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void rcu_nmi_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */
+ rdtp->dynticks_nmi_nesting - 2);
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+ rcu_dynticks_eqs_enter();
+}
+
+/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
*
* Exit from an interrupt handler, which might possibly result in entering
@@ -875,8 +869,8 @@ void rcu_user_enter(void)
*
* This code assumes that the idle loop never does anything that might
* result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture violates this assumption, RCU will give you what you
- * deserve, good and hard. But very infrequently and irreproducibly.
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
*
* Use things like work queues to work around this limitation.
*
@@ -887,23 +881,14 @@ void rcu_user_enter(void)
*/
void rcu_irq_exit(void)
{
- struct rcu_dynticks *rdtp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
lockdep_assert_irqs_disabled();
- rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* Page faults can happen in NMI handlers, so check... */
- if (rdtp->dynticks_nmi_nesting)
- return;
-
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdtp->dynticks_nesting < 1);
- if (rdtp->dynticks_nesting <= 1) {
- rcu_eqs_enter_common(true);
- } else {
- trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
- rdtp->dynticks_nesting--;
- }
+ if (rdtp->dynticks_nmi_nesting == 1)
+ rcu_prepare_for_idle();
+ rcu_nmi_exit();
+ if (rdtp->dynticks_nmi_nesting == 0)
+ rcu_dynticks_task_enter();
}
/*
@@ -922,55 +907,33 @@ void rcu_irq_exit_irqson(void)
}
/*
- * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
- *
- * If the new value of the ->dynticks_nesting counter was previously zero,
- * we really have exited idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
- */
-static void rcu_eqs_exit_common(long long oldval, int user)
-{
- RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
-
- rcu_dynticks_task_exit();
- rcu_dynticks_eqs_exit();
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on exit: not idle task"),
- oldval, rdtp->dynticks_nesting);
- rcu_ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
-/*
* Exit an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
*/
static void rcu_eqs_exit(bool user)
{
struct rcu_dynticks *rdtp;
- long long oldval;
+ long oldval;
lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- } else {
- __this_cpu_inc(disable_rcu_irq_enter);
- rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(oldval, user);
- __this_cpu_dec(disable_rcu_irq_enter);
+ if (oldval) {
+ rdtp->dynticks_nesting++;
+ return;
}
+ rcu_dynticks_task_exit();
+ rcu_dynticks_eqs_exit();
+ rcu_cleanup_after_idle();
+ trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(rdtp->dynticks_nesting, 1);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
}
/**
@@ -979,11 +942,6 @@ static void rcu_eqs_exit(bool user)
* Exit idle mode, in other words, -enter- the mode in which RCU
* read-side critical sections can occur.
*
- * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
- * allow for the possibility of usermode upcalls messing up our count
- * of interrupt nesting level during the busy period that is just
- * now starting.
- *
* If you add or remove a call to rcu_idle_exit(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
@@ -1013,65 +971,6 @@ void rcu_user_exit(void)
#endif /* CONFIG_NO_HZ_FULL */
/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to
- * user mode! This code assumes that the idle loop never does upcalls to
- * user mode. If your architecture does do upcalls from the idle loop (or
- * does anything else that results in unbalanced calls to the irq_enter()
- * and irq_exit() functions), RCU will give you what you deserve, good
- * and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter(void)
-{
- struct rcu_dynticks *rdtp;
- long long oldval;
-
- lockdep_assert_irqs_disabled();
- rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* Page faults can happen in NMI handlers, so check... */
- if (rdtp->dynticks_nmi_nesting)
- return;
-
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdtp->dynticks_nesting == 0);
- if (oldval)
- trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
- else
- rcu_eqs_exit_common(oldval, true);
-}
-
-/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
-}
-
-/**
* rcu_nmi_enter - inform RCU of entry to NMI context
*
* If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
@@ -1086,7 +985,7 @@ void rcu_irq_enter_irqson(void)
void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int incby = 2;
+ long incby = 2;
/* Complain about underflow. */
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
@@ -1103,45 +1002,61 @@ void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
incby = 1;
}
- rdtp->dynticks_nmi_nesting += incby;
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ rdtp->dynticks_nmi_nesting,
+ rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */
+ rdtp->dynticks_nmi_nesting + incby);
barrier();
}
/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
*
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
*
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_nmi_exit(void)
+void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+ lockdep_assert_irqs_disabled();
+ if (rdtp->dynticks_nmi_nesting == 0)
+ rcu_dynticks_task_exit();
+ rcu_nmi_enter();
+ if (rdtp->dynticks_nmi_nesting == 1)
+ rcu_cleanup_after_idle();
+}
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdtp->dynticks_nmi_nesting != 1) {
- rdtp->dynticks_nmi_nesting -= 2;
- return;
- }
+/*
+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void rcu_irq_enter_irqson(void)
+{
+ unsigned long flags;
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- rdtp->dynticks_nmi_nesting = 0;
- rcu_dynticks_eqs_enter();
+ local_irq_save(flags);
+ rcu_irq_enter();
+ local_irq_restore(flags);
}
/**
@@ -1233,7 +1148,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+ return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 &&
+ __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1;
}
/*
@@ -2789,6 +2705,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->n_force_qs_snap = rsp->n_force_qs;
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
+
+ /*
+ * The following usually indicates a double call_rcu(). To track
+ * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
+ */
WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
local_irq_restore(flags);
@@ -3723,7 +3644,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
- WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
+ WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
rdp->cpu = cpu;
rdp->rsp = rsp;
@@ -3752,7 +3673,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
!init_nocb_callback_list(rdp))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
- rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 46a5d1991450..6488a3b0e729 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -38,9 +38,8 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- long long dynticks_nesting; /* Track irq/process nesting level. */
- /* Process level is worth LLONG_MAX/2. */
- int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ long dynticks_nesting; /* Track process nesting level. */
+ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index db85ca3975f1..fb88a028deec 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
@@ -1687,7 +1686,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void)
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- have_rcu_nocb_mask = true;
cpulist_parse(str, rcu_nocb_mask);
return 1;
}
@@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
- if (have_rcu_nocb_mask)
+ if (cpumask_available(rcu_nocb_mask))
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
@@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void)
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
- if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+ if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
return;
}
- have_rcu_nocb_mask = true;
}
- if (!have_rcu_nocb_mask)
+ if (!cpumask_available(rcu_nocb_mask))
return;
#if defined(CONFIG_NO_HZ_FULL)
@@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
- if (!have_rcu_nocb_mask)
+ if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
ls = int_sqrt(nr_cpu_ids);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fbd56d6e575b..68fa19a5e7bd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(init_rcu_head);
void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_rcu_head);
static bool rcuhead_is_static_object(void *addr)
{
diff --git a/kernel/relay.c b/kernel/relay.c
index 39a9dfc69486..c3029402f15c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -611,7 +611,6 @@ free_bufs:
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
- kfree(chan);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
@@ -919,18 +918,18 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
*
* Poll implemention.
*/
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
{
- unsigned int mask = 0;
+ __poll_t mask = 0;
struct rchan_buf *buf = filp->private_data;
if (buf->finalized)
- return POLLERR;
+ return EPOLLERR;
if (filp->f_mode & FMODE_READ) {
poll_wait(filp, &buf->read_wait, wait);
if (!relay_buf_empty(buf))
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
}
return mask;
diff --git a/kernel/resource.c b/kernel/resource.c
index 54ba6de3757c..e270b5048988 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root,
struct resource *conflict;
struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
+ int type = resource_type(root);
if (!res)
return;
@@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root,
res->name = name;
res->start = start;
res->end = end;
- res->flags = IORESOURCE_BUSY;
+ res->flags = type | IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
while (1) {
@@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root,
next_res->name = name;
next_res->start = conflict->end + 1;
next_res->end = end;
- next_res->flags = IORESOURCE_BUSY;
+ next_res->flags = type | IORESOURCE_BUSY;
next_res->desc = IORES_DESC_NONE;
}
} else {
@@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent,
EXPORT_SYMBOL(__devm_release_region);
/*
- * Called from init/main.c to reserve IO ports.
+ * Reserve I/O ports or memory based on "reserve=" kernel parameter.
*/
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
@@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str)
for (;;) {
unsigned int io_start, io_num;
int x = reserved;
+ struct resource *parent;
- if (get_option (&str, &io_start) != 2)
+ if (get_option(&str, &io_start) != 2)
break;
- if (get_option (&str, &io_num) == 0)
+ if (get_option(&str, &io_num) == 0)
break;
if (x < MAXRESERVE) {
struct resource *res = reserve + x;
+
+ /*
+ * If the region starts below 0x10000, we assume it's
+ * I/O port space; otherwise assume it's memory.
+ */
+ if (io_start < 0x10000) {
+ res->flags = IORESOURCE_IO;
+ parent = &ioport_resource;
+ } else {
+ res->flags = IORESOURCE_MEM;
+ parent = &iomem_resource;
+ }
res->name = "reserved";
res->start = io_start;
res->end = io_start + io_num - 1;
- res->flags = IORESOURCE_BUSY;
+ res->flags |= IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
res->child = NULL;
- if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+ if (request_resource(parent, res) == 0)
reserved = x+1;
}
}
return 1;
}
-
__setup("reserve=", reserve_setup);
/*
@@ -1563,17 +1576,17 @@ static int strict_iomem_checks;
/*
* check if an address is reserved in the iomem resource tree
- * returns 1 if reserved, 0 if not reserved.
+ * returns true if reserved, false if not reserved.
*/
-int iomem_is_exclusive(u64 addr)
+bool iomem_is_exclusive(u64 addr)
{
struct resource *p = &iomem_resource;
- int err = 0;
+ bool err = false;
loff_t l;
int size = PAGE_SIZE;
if (!strict_iomem_checks)
- return 0;
+ return false;
addr = addr & PAGE_MASK;
@@ -1596,7 +1609,7 @@ int iomem_is_exclusive(u64 addr)
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
- err = 1;
+ err = true;
break;
}
}
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index a43df5193538..bb4b9fe026a1 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,13 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/security.h>
#include <linux/export.h>
+#include "sched.h"
+
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
- /*
- * Perform commit of crossrelease here.
- */
- complete_release_commit(x);
-
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..e7c535eee0a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -508,7 +508,8 @@ void resched_cpu(int cpu)
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- resched_curr(rq);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
- schedstat_inc(rq->ttwu_local);
- schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd->ttwu_wake_remote);
+ __schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq->ttwu_count);
- schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
- * Pairs with the smp_store_release() in finish_lock_switch().
+ * Pairs with the smp_store_release() in finish_task().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
@@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
+ p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -2571,6 +2573,62 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
#endif /* CONFIG_PREEMPT_NOTIFIERS */
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
+{
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ rq_unpin_lock(rq, rf);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = next;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -2591,7 +2649,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
- prepare_lock_switch(rq, next);
+ prepare_task(next);
prepare_arch_switch(next);
}
@@ -2646,29 +2704,34 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* the scheduled task must drop that reference.
*
* We must observe prev->state before clearing prev->on_cpu (in
- * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
- /*
- * The membarrier system call requires a full memory barrier
- * after storing to rq->curr, before going back to user-space.
- *
- * TODO: This smp_mb__after_unlock_lock can go away if PPC end
- * up adding a full barrier to switch_mm(), or we should figure
- * out if a smp_mb__after_unlock_lock is really the proper API
- * to use.
- */
- smp_mb__after_unlock_lock();
- finish_lock_switch(rq, prev);
+ finish_task(prev);
+ finish_lock_switch(rq);
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
- if (mm)
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing though
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2772,6 +2835,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
@@ -2786,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- /*
- * Since the runqueue lock will be released by the next
- * task (which is an invalid locking op but in the case
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
- */
- rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -3308,6 +3371,9 @@ static void __sched notrace __schedule(bool preempt)
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -3355,17 +3421,16 @@ static void __sched notrace __schedule(bool preempt)
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
- * rq->curr, before returning to user-space. For TSO
- * (e.g. x86), the architecture must provide its own
- * barrier in switch_mm(). For weakly ordered machines
- * for which spin_unlock() acts as a full memory
- * barrier, finish_lock_switch() in common code takes
- * care of this barrier. For weakly ordered machines for
- * which spin_unlock() acts as a RELEASE barrier (only
- * arm64 and PowerPC), arm64 has a full barrier in
- * switch_to(), and PowerPC has
- * smp_mb__after_unlock_lock() before
- * finish_lock_switch().
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
*/
++*switch_count;
@@ -4040,8 +4105,7 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags &
- ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
/*
@@ -4108,6 +4172,9 @@ recheck:
}
if (user) {
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
retval = security_task_setscheduler(p);
if (retval)
return retval;
@@ -4163,7 +4230,8 @@ change:
}
#endif
#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
/*
@@ -4293,6 +4361,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
}
EXPORT_SYMBOL_GPL(sched_setattr);
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -4799,7 +4872,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
@@ -5097,17 +5170,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
return ret;
}
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
@@ -5144,6 +5206,17 @@ out_unlock:
return retval;
}
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..7936f548e071 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -19,8 +19,6 @@
#include "sched.h"
-#define SUGOV_KTHREAD_PRIORITY 50
-
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
@@ -60,7 +58,8 @@ struct sugov_cpu {
u64 last_update;
/* The fields below are only needed when sharing a policy. */
- unsigned long util;
+ unsigned long util_cfs;
+ unsigned long util_dl;
unsigned long max;
unsigned int flags;
@@ -176,21 +175,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long cfs_max;
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
- cfs_max = arch_scale_cpu_capacity(NULL, cpu);
+ sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ sg_cpu->util_cfs = cpu_util_cfs(rq);
+ sg_cpu->util_dl = cpu_util_dl(rq);
+}
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
+{
+ /*
+ * Ideally we would like to set util_dl as min/guaranteed freq and
+ * util_cfs + util_dl as requested freq. However, cpufreq is not yet
+ * ready for such an interface. So, we only do the latter for now.
+ */
+ return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- unsigned int flags)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
{
- if (flags & SCHED_CPUFREQ_IOWAIT) {
+ if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
@@ -244,7 +250,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls();
+ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
bool ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
@@ -264,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (!sugov_should_update_freq(sg_policy, time))
@@ -272,10 +278,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max, sg_cpu->cpu);
+ sugov_get_util(sg_cpu);
+ max = sg_cpu->max;
+ util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -305,23 +313,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
s64 delta_ns;
/*
- * If the CPU utilization was last updated before the previous
- * frequency update and the time elapsed between the last update
- * of the CPU utilization and the last frequency update is long
- * enough, don't take the CPU into account as it probably is
- * idle now (and clear iowait_boost for it).
+ * If the CFS CPU utilization was last updated before the
+ * previous frequency update and the time elapsed between the
+ * last update of the CPU utilization and the last frequency
+ * update is long enough, reset iowait_boost and util_cfs, as
+ * they are now probably stale. However, still consider the
+ * CPU contribution if it has some DEADLINE utilization
+ * (util_dl).
*/
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false;
- continue;
+ j_sg_cpu->util_cfs = 0;
+ if (j_sg_cpu->util_dl == 0)
+ continue;
}
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
- j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
+ j_util = sugov_aggregate_util(j_sg_cpu);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
@@ -338,22 +350,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max, sg_cpu->cpu);
-
raw_spin_lock(&sg_policy->update_lock);
- sg_cpu->util = util;
- sg_cpu->max = max;
+ sugov_get_util(sg_cpu);
sg_cpu->flags = flags;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- if (flags & SCHED_CPUFREQ_RT_DL)
+ if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time);
@@ -383,9 +391,9 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
/*
- * For RT and deadline tasks, the schedutil governor shoots the
- * frequency to maximum. Special care must be taken to ensure that this
- * kthread doesn't result in the same behavior.
+ * For RT tasks, the schedutil governor shoots the frequency to maximum.
+ * Special care must be taken to ensure that this kthread doesn't result
+ * in the same behavior.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() function and before that
@@ -470,7 +478,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
+ /*
+ * Fake (unused) bandwidth; workaround to "fix"
+ * priority inheritance.
+ */
+ .sched_runtime = 1000000,
+ .sched_deadline = 10000000,
+ .sched_period = 10000000,
+ };
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
@@ -488,10 +509,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
return PTR_ERR(thread);
}
- ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ ret = sched_setattr_nocheck(thread, &attr);
if (ret) {
kthread_stop(thread);
- pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
return ret;
}
@@ -655,7 +676,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2473736c7616..9df09782025c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)
#endif
static inline
-void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
}
static inline
-void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
}
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
+ BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+
if (task_on_rq_queued(p))
return;
rq = task_rq(p);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
- add_rq_bw(new_bw, &rq->dl);
+ __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __add_rq_bw(new_bw, &rq->dl);
}
/*
@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)
if (dl_se->dl_runtime == 0)
return;
+ if (dl_entity_is_special(dl_se))
+ return;
+
WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)
*/
if (zerolag_time < 0) {
if (dl_task(p))
- sub_running_bw(dl_se->dl_bw, dl_rq);
+ sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD)
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
return;
if (flags & ENQUEUE_MIGRATED)
- add_rq_bw(dl_se->dl_bw, dl_rq);
+ add_rq_bw(dl_se, dl_rq);
if (dl_se->dl_non_contending) {
dl_se->dl_non_contending = 0;
@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* when the "inactive timer" fired).
* So, add it back.
*/
- add_running_bw(dl_se->dl_bw, dl_rq);
+ add_running_bw(dl_se, dl_rq);
}
}
@@ -1114,7 +1151,9 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec;
+ u64 delta_exec, scaled_delta_exec;
+ int cpu = cpu_of(rq);
+ u64 now;
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1127,34 +1166,58 @@ static void update_curr_dl(struct rq *rq)
* natural solution, but the full ramifications of this
* approach need further study.
*/
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- /* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
-
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq_clock_task(rq);
+ curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
- if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
- delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
- dl_se->runtime -= delta_exec;
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ /*
+ * For tasks that participate in GRUB, we implement GRUB-PA: the
+ * spare reclaimed bandwidth is used to clock down frequency.
+ *
+ * For the others, we still need to scale reservation parameters
+ * according to current frequency and CPU maximum capacity.
+ */
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+ scaled_delta_exec = grub_reclaim(delta_exec,
+ rq,
+ &curr->dl);
+ } else {
+ unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+ scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+ }
+
+ dl_se->runtime -= scaled_delta_exec;
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
+
+ /* If requested, inform the user about runtime overruns. */
+ if (dl_runtime_exceeded(dl_se) &&
+ (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+ dl_se->dl_overrun = 1;
+
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
@@ -1204,8 +1267,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
- sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+ sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
}
@@ -1222,7 +1285,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
- sub_running_bw(dl_se->dl_bw, &rq->dl);
+ sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1416,8 +1479,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
dl_check_constrained_dl(&p->dl);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
- add_running_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
+ add_running_bw(&p->dl, &rq->dl);
}
/*
@@ -1457,8 +1520,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
__dequeue_task_dl(rq, p, flags);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
}
/*
@@ -1564,7 +1627,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
*/
raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -1576,7 +1639,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_unlock(&rq->lock);
}
@@ -2019,11 +2082,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(next_task->dl.dl_bw, &rq->dl);
- sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_running_bw(&next_task->dl, &rq->dl);
+ sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
- add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_rq_bw(&next_task->dl, &later_rq->dl);
+ add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -2111,11 +2174,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(p->dl.dl_bw, &src_rq->dl);
- sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_running_bw(&p->dl, &src_rq->dl);
+ sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(p->dl.dl_bw, &this_rq->dl);
- add_running_bw(p->dl.dl_bw, &this_rq->dl);
+ add_rq_bw(&p->dl, &this_rq->dl);
+ add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2224,7 +2287,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
task_non_contending(p);
if (!task_on_rq_queued(p))
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
/*
* We cannot use inactive_task_timer() to invoke sub_running_bw()
@@ -2256,7 +2319,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
return;
}
@@ -2435,6 +2498,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return 0;
+
/* !deadline task may carry old deadline bandwidth */
if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
@@ -2521,6 +2587,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ /* special dl tasks don't actually use any parameter */
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return true;
+
/* deadline != 0 */
if (attr->sched_deadline == 0)
return false;
@@ -2566,6 +2636,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..5eb3ffc9be84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
- schedstat_set(se->statistics.wait_start, wait_start);
+ __schedstat_set(se->statistics.wait_start, wait_start);
}
static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- schedstat_set(se->statistics.wait_start, delta);
+ __schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- schedstat_set(se->statistics.wait_max,
+ __schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
- schedstat_inc(se->statistics.wait_count);
- schedstat_add(se->statistics.wait_sum, delta);
- schedstat_set(se->statistics.wait_start, 0);
+ __schedstat_inc(se->statistics.wait_count);
+ __schedstat_add(se->statistics.wait_sum, delta);
+ __schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- schedstat_set(se->statistics.sleep_max, delta);
+ __schedstat_set(se->statistics.sleep_max, delta);
- schedstat_set(se->statistics.sleep_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.sleep_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- schedstat_set(se->statistics.block_max, delta);
+ __schedstat_set(se->statistics.block_max, delta);
- schedstat_set(se->statistics.block_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.block_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
- schedstat_add(se->statistics.iowait_sum, delta);
- schedstat_inc(se->statistics.iowait_count);
+ __schedstat_add(se->statistics.iowait_sum, delta);
+ __schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- schedstat_set(se->statistics.sleep_start,
+ __schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- schedstat_set(se->statistics.block_start,
+ __schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
+ * a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib;
@@ -4365,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -5689,28 +5685,38 @@ static int wake_wide(struct task_struct *p)
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- * will be) idle.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ * cache-affine and is (or will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
-
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
- if (idle_cpu(this_cpu))
- return true;
+ /*
+ * If this_cpu is idle, it implies the wakeup is from interrupt
+ * context. Only allow the move if cache is shared. Otherwise an
+ * interrupt intensive workload could force all tasks onto one
+ * node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
+ */
+ if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return true;
+ return this_cpu;
- return false;
+ return nr_cpumask_bits;
}
-static bool
+static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5724,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long current_load = task_h_load(current);
if (current_load > this_eff_load)
- return true;
+ return this_cpu;
this_eff_load -= current_load;
}
@@ -5741,36 +5747,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
- return this_eff_load <= prev_eff_load;
+ return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ int target = nr_cpumask_bits;
- if (sched_feat(WA_IDLE) && !affine)
- affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE))
+ target = wake_affine_idle(this_cpu, prev_cpu, sync);
- if (sched_feat(WA_WEIGHT) && !affine)
- affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+ target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (affine) {
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
- }
+ if (target == nr_cpumask_bits)
+ return prev_cpu;
- return affine;
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ return target;
}
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+static inline unsigned long task_util(struct task_struct *p);
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
- return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
}
/*
@@ -5950,7 +5956,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(cpu_rq(i));
- if (load < min_load || (load == min_load && i == this_cpu)) {
+ if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
@@ -6191,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ int i, recent_used_cpu;
if (idle_cpu(target))
return target;
@@ -6202,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return prev;
+ /* Check a recently used CPU as a potential idle candidate */
+ recent_used_cpu = p->recent_used_cpu;
+ if (recent_used_cpu != prev &&
+ recent_used_cpu != target &&
+ cpus_share_cache(recent_used_cpu, target) &&
+ idle_cpu(recent_used_cpu) &&
+ cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ /*
+ * Replace recent_used_cpu with prev as it is a potential
+ * candidate for the next wake.
+ */
+ p->recent_used_cpu = prev;
+ return recent_used_cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -6247,7 +6268,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*/
-static int cpu_util(int cpu)
+static unsigned long cpu_util(int cpu)
{
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
@@ -6255,7 +6276,7 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util;
}
-static inline int task_util(struct task_struct *p)
+static inline unsigned long task_util(struct task_struct *p)
{
return p->se.avg.util_avg;
}
@@ -6264,7 +6285,7 @@ static inline int task_util(struct task_struct *p)
* cpu_util_wake: Compute cpu utilization with any contributions from
* the waking task p removed.
*/
-static int cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
unsigned long util, capacity;
@@ -6355,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (cpu == prev_cpu)
goto pick_cpu;
- if (wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
+ new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6370,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (!sd) {
pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
+ }
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
@@ -6449,8 +6472,7 @@ static void task_dead_fair(struct task_struct *p)
}
#endif /* CONFIG_SMP */
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -6492,7 +6514,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
- gran = wakeup_gran(curr, se);
+ gran = wakeup_gran(se);
if (vdiff > gran)
return 1;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..5d0762633639 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,24 +26,110 @@
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
- (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
-static int membarrier_private_expedited(void)
+static int membarrier_global_expedited(void)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
- if (!(atomic_read(&current->mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
- return -EPERM;
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ } else {
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
if (num_online_cpus() == 1)
return 0;
@@ -89,7 +175,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
@@ -103,21 +191,69 @@ static int membarrier_private_expedited(void)
return 0;
}
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ if (atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+ /*
+ * For single mm user, single threaded process, we can
+ * simply issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+ * no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+ } else {
+ /*
+ * For multi-mm user threads, we need to ensure all
+ * future scheduler executions will observe the new
+ * thread flag state for this mm.
+ */
+ synchronize_sched();
+ }
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ &mm->membarrier_state);
+ return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ }
+
/*
* We need to consider threads belonging to different thread
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
- return;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
- &mm->membarrier_state);
+ if (atomic_read(&mm->membarrier_state) & state)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+ &mm->membarrier_state);
+ if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+ /*
+ * Ensure all future scheduler executions will observe the
+ * new thread flag state for this process.
+ */
+ synchronize_sched();
+ }
+ atomic_or(state, &mm->membarrier_state);
+ return 0;
}
/**
@@ -157,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
int cmd_mask = MEMBARRIER_CMD_BITMASK;
if (tick_nohz_full_enabled())
- cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
return cmd_mask;
}
- case MEMBARRIER_CMD_SHARED:
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ case MEMBARRIER_CMD_GLOBAL:
+ /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
if (tick_nohz_full_enabled())
return -EINVAL;
if (num_online_cpus() > 1)
synchronize_sched();
return 0;
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ return membarrier_global_expedited();
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited();
+ return membarrier_private_expedited(0);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
- membarrier_register_private_expedited();
- return 0;
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
default:
return -EINVAL;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..aad49451584e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -951,11 +951,13 @@ static void update_curr_rt(struct rq *rq)
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
u64 delta_exec;
+ u64 now;
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -968,7 +970,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq_clock_task(rq);
+ curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -1907,9 +1909,8 @@ static void push_rt_tasks(struct rq *rq)
* the rt_loop_next will cause the iterator to perform another scan.
*
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- struct root_domain *rd = rq->rd;
int next;
int cpu;
@@ -1985,19 +1986,24 @@ static void tell_cpu_to_push(struct rq *rq)
* Otherwise it is finishing up and an ipi needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rq->rd);
raw_spin_unlock(&rq->rd->rto_lock);
rto_start_unlock(&rq->rd->rto_loop_start);
- if (cpu >= 0)
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
struct rq *rq;
int cpu;
@@ -2013,18 +2019,20 @@ void rto_push_irq_work_func(struct irq_work *work)
raw_spin_unlock(&rq->lock);
}
- raw_spin_lock(&rq->rd->rto_lock);
+ raw_spin_lock(&rd->rto_lock);
/* Pass the IPI to the next rt overloaded queue */
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rd);
- raw_spin_unlock(&rq->rd->rto_lock);
+ raw_spin_unlock(&rd->rto_lock);
- if (cpu < 0)
+ if (cpu < 0) {
+ sched_put_rd(rd);
return;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -2034,8 +2042,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2044,6 +2053,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
@@ -2206,7 +2220,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a212de..fb5fc458547f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV 0x10000000
+
+static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+ return false;
+#endif
+}
+
/*
* Tells if entity @a should preempt entity @b.
*/
static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{
- return dl_time_before(a->deadline, b->deadline);
+ return dl_entity_is_special(a) ||
+ dl_time_before(a->deadline, b->deadline);
}
/*
@@ -665,6 +691,8 @@ extern struct mutex sched_domains_mutex;
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
@@ -1328,47 +1356,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- *
- * In particular, the load of prev->state in finish_task_switch() must
- * happen before this.
- *
- * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
- */
- smp_store_release(&prev->on_cpu, 0);
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
- raw_spin_unlock_irq(&rq->lock);
-}
-
/*
* wake flags
*/
@@ -1687,17 +1674,17 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
#ifndef arch_scale_freq_capacity
static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1711,10 +1698,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq);
}
#else
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
@@ -2096,14 +2090,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long.
- * That is not guaranteed to happen if the updates are only triggered from CFS,
- * though, because they may not be coming in if RT or deadline tasks are active
- * all the time (or there are RT and DL tasks only).
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
*
- * As a workaround for that issue, this function is called by the RT and DL
- * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with
- * solutions targeted more specifically at RT and DL tasks.
+ * solutions targeted more specifically at RT tasks.
*/
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
@@ -2125,3 +2119,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+ return rq->cfs.avg.util_avg;
+}
+
+#endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baf500d12b7c..8e7b58de61e7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var) do { var++; } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt) do { var += (amt); } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
@@ -48,8 +51,11 @@ static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
#define schedstat_enabled() 0
+#define __schedstat_inc(var) do { } while (0)
#define schedstat_inc(var) do { } while (0)
+#define __schedstat_add(var, amt) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
+#define __schedstat_set(var, val) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 034cbed7f88b..519b024f4e94 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
static int init_rootdomain(struct root_domain *rd)
{
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f0dfb2abb8d..940fa408a288 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk)
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
- memset(info, 0, sizeof(*info));
+ clear_siginfo(info);
info->si_signo = SIGSYS;
info->si_code = SYS_SECCOMP;
info->si_call_addr = (void __user *)KSTK_EIP(current);
@@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
}
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
-long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
- void __user *data)
+static struct seccomp_filter *get_nth_filter(struct task_struct *task,
+ unsigned long filter_off)
{
- struct seccomp_filter *filter;
- struct sock_fprog_kern *fprog;
- long ret;
- unsigned long count = 0;
-
- if (!capable(CAP_SYS_ADMIN) ||
- current->seccomp.mode != SECCOMP_MODE_DISABLED) {
- return -EACCES;
- }
+ struct seccomp_filter *orig, *filter;
+ unsigned long count;
+ /*
+ * Note: this is only correct because the caller should be the (ptrace)
+ * tracer of the task, otherwise lock_task_sighand is needed.
+ */
spin_lock_irq(&task->sighand->siglock);
+
if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
- ret = -EINVAL;
- goto out;
+ spin_unlock_irq(&task->sighand->siglock);
+ return ERR_PTR(-EINVAL);
}
- filter = task->seccomp.filter;
- while (filter) {
- filter = filter->prev;
+ orig = task->seccomp.filter;
+ __get_seccomp_filter(orig);
+ spin_unlock_irq(&task->sighand->siglock);
+
+ count = 0;
+ for (filter = orig; filter; filter = filter->prev)
count++;
- }
if (filter_off >= count) {
- ret = -ENOENT;
+ filter = ERR_PTR(-ENOENT);
goto out;
}
- count -= filter_off;
- filter = task->seccomp.filter;
- while (filter && count > 1) {
- filter = filter->prev;
+ count -= filter_off;
+ for (filter = orig; filter && count > 1; filter = filter->prev)
count--;
- }
if (WARN_ON(count != 1 || !filter)) {
- /* The filter tree shouldn't shrink while we're using it. */
- ret = -ENOENT;
+ filter = ERR_PTR(-ENOENT);
goto out;
}
+ __get_seccomp_filter(filter);
+
+out:
+ __put_seccomp_filter(orig);
+ return filter;
+}
+
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+ void __user *data)
+{
+ struct seccomp_filter *filter;
+ struct sock_fprog_kern *fprog;
+ long ret;
+
+ if (!capable(CAP_SYS_ADMIN) ||
+ current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+ return -EACCES;
+ }
+
+ filter = get_nth_filter(task, filter_off);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+
fprog = filter->prog->orig_prog;
if (!fprog) {
/* This must be a new non-cBPF filter, since we save
@@ -1035,17 +1054,44 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- __get_seccomp_filter(filter);
- spin_unlock_irq(&task->sighand->siglock);
-
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
+out:
__put_seccomp_filter(filter);
return ret;
+}
-out:
- spin_unlock_irq(&task->sighand->siglock);
+long seccomp_get_metadata(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ long ret;
+ struct seccomp_filter *filter;
+ struct seccomp_metadata kmd = {};
+
+ if (!capable(CAP_SYS_ADMIN) ||
+ current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+ return -EACCES;
+ }
+
+ size = min_t(unsigned long, size, sizeof(kmd));
+
+ if (copy_from_user(&kmd, data, size))
+ return -EFAULT;
+
+ filter = get_nth_filter(task, kmd.filter_off);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+
+ memset(&kmd, 0, sizeof(kmd));
+ if (filter->log)
+ kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
+
+ ret = size;
+ if (copy_to_user(data, &kmd, size))
+ ret = -EFAULT;
+
+ __put_seccomp_filter(filter);
return ret;
}
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 9558664bd9ec..c6e4c83dc090 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -40,6 +40,7 @@
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
+#include <linux/livepatch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -165,7 +166,8 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (!recalc_sigpending_tsk(current) && !freezing(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current) &&
+ !klp_patch_pending(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -549,6 +551,7 @@ still_pending:
* a fast-pathed signal or we must have been
* out of queue space. So zero out the info.
*/
+ clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
@@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
spin_unlock(&tsk->sighand->siglock);
posixtimer_rearm(info);
spin_lock(&tsk->sighand->siglock);
+
+ /* Don't expose the si_sys_private value to userspace */
+ info->si_sys_private = 0;
}
#endif
return signr;
@@ -1043,6 +1049,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
case (unsigned long) SEND_SIG_NOINFO:
+ clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
@@ -1051,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
break;
case (unsigned long) SEND_SIG_PRIV:
+ clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_KERNEL;
@@ -1485,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p)
return 0;
}
+int force_sig_fault(int sig, int code, void __user *addr
+ ___ARCH_SI_TRAPNO(int trapno)
+ ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+ , struct task_struct *t)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+#ifdef __ARCH_SI_TRAPNO
+ info.si_trapno = trapno;
+#endif
+#ifdef __ia64__
+ info.si_imm = imm;
+ info.si_flags = flags;
+ info.si_isr = isr;
+#endif
+ return force_sig_info(info.si_signo, &info, t);
+}
+
+int send_sig_fault(int sig, int code, void __user *addr
+ ___ARCH_SI_TRAPNO(int trapno)
+ ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+ , struct task_struct *t)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+#ifdef __ARCH_SI_TRAPNO
+ info.si_trapno = trapno;
+#endif
+#ifdef __ia64__
+ info.si_imm = imm;
+ info.si_flags = flags;
+ info.si_isr = isr;
+#endif
+ return send_sig_info(info.si_signo, &info, t);
+}
+
+#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
+int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+ struct siginfo info;
+
+ WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_addr_lsb = lsb;
+ return force_sig_info(info.si_signo, &info, t);
+}
+
+int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+ struct siginfo info;
+
+ WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_addr_lsb = lsb;
+ return send_sig_info(info.si_signo, &info, t);
+}
+EXPORT_SYMBOL(send_sig_mceerr);
+#endif
+
+#ifdef SEGV_BNDERR
+int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_BNDERR;
+ info.si_addr = addr;
+ info.si_lower = lower;
+ info.si_upper = upper;
+ return force_sig_info(info.si_signo, &info, current);
+}
+#endif
+
+#ifdef SEGV_PKUERR
+int force_sig_pkuerr(void __user *addr, u32 pkey)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_PKUERR;
+ info.si_addr = addr;
+ info.si_pkey = pkey;
+ return force_sig_info(info.si_signo, &info, current);
+}
+#endif
+
+/* For the crazy architectures that include trap information in
+ * the errno field, instead of an actual errno value.
+ */
+int force_sig_ptrace_errno_trap(int errno, void __user *addr)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGTRAP;
+ info.si_errno = errno;
+ info.si_code = TRAP_HWBKPT;
+ info.si_addr = addr;
+ return force_sig_info(info.si_signo, &info, current);
+}
+
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
@@ -1623,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
sig = SIGCHLD;
}
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
/*
@@ -1717,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
parent = tsk->real_parent;
}
+ clear_siginfo(&info);
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
@@ -1929,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
{
siginfo_t info;
- memset(&info, 0, sizeof info);
+ clear_siginfo(&info);
info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
@@ -2136,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
* have updated *info via PTRACE_SETSIGINFO.
*/
if (signr != info->si_signo) {
+ clear_siginfo(info);
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
@@ -2688,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
#endif
[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
[SIGPOLL] = { NSIGPOLL, SIL_POLL },
-#ifdef __ARCH_SIGSYS
[SIGSYS] = { NSIGSYS, SIL_SYS },
-#endif
};
if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
layout = filter[sig].layout;
@@ -2712,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
if ((sig == SIGFPE) && (si_code == FPE_FIXME))
layout = SIL_FAULT;
#endif
+#ifdef BUS_FIXME
+ if ((sig == SIGBUS) && (si_code == BUS_FIXME))
+ layout = SIL_FAULT;
+#endif
}
return layout;
}
-#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
-
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
int err;
@@ -2756,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
-#ifdef BUS_MCEERR_AO
+#ifdef __ia64__
+ err |= __put_user(from->si_imm, &to->si_imm);
+ err |= __put_user(from->si_flags, &to->si_flags);
+ err |= __put_user(from->si_isr, &to->si_isr);
+#endif
/*
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_signo == SIGBUS &&
- (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
+#ifdef BUS_MCEERR_AR
+ if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
+ err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
+#endif
+#ifdef BUS_MCEERR_AO
+ if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
#ifdef SEGV_BNDERR
@@ -2788,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
-#ifdef __ARCH_SIGSYS
case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
-#endif
}
return err;
}
+#ifdef CONFIG_COMPAT
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct siginfo *from)
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+{
+ return __copy_siginfo_to_user32(to, from, in_x32_syscall());
+}
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct siginfo *from, bool x32_ABI)
+#endif
+{
+ struct compat_siginfo new;
+ memset(&new, 0, sizeof(new));
+
+ new.si_signo = from->si_signo;
+ new.si_errno = from->si_errno;
+ new.si_code = from->si_code;
+ switch(siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ break;
+ case SIL_TIMER:
+ new.si_tid = from->si_tid;
+ new.si_overrun = from->si_overrun;
+ new.si_int = from->si_int;
+ break;
+ case SIL_POLL:
+ new.si_band = from->si_band;
+ new.si_fd = from->si_fd;
+ break;
+ case SIL_FAULT:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
+#endif
+#ifdef BUS_MCEERR_AR
+ if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
+ new.si_addr_lsb = from->si_addr_lsb;
+#endif
+#ifdef BUS_MCEERR_AO
+ if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
+ new.si_addr_lsb = from->si_addr_lsb;
+#endif
+#ifdef SEGV_BNDERR
+ if ((from->si_signo == SIGSEGV) &&
+ (from->si_code == SEGV_BNDERR)) {
+ new.si_lower = ptr_to_compat(from->si_lower);
+ new.si_upper = ptr_to_compat(from->si_upper);
+ }
+#endif
+#ifdef SEGV_PKUERR
+ if ((from->si_signo == SIGSEGV) &&
+ (from->si_code == SEGV_PKUERR))
+ new.si_pkey = from->si_pkey;
+#endif
+
+ break;
+ case SIL_CHLD:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ new.si_status = from->si_status;
+#ifdef CONFIG_X86_X32_ABI
+ if (x32_ABI) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ } else
+#endif
+ {
+ new.si_utime = from->si_utime;
+ new.si_stime = from->si_stime;
+ }
+ break;
+ case SIL_RT:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ new.si_int = from->si_int;
+ break;
+ case SIL_SYS:
+ new.si_call_addr = ptr_to_compat(from->si_call_addr);
+ new.si_syscall = from->si_syscall;
+ new.si_arch = from->si_arch;
+ break;
+ }
+
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int copy_siginfo_from_user32(struct siginfo *to,
+ const struct compat_siginfo __user *ufrom)
+{
+ struct compat_siginfo from;
+
+ if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+
+ clear_siginfo(to);
+ to->si_signo = from.si_signo;
+ to->si_errno = from.si_errno;
+ to->si_code = from.si_code;
+ switch(siginfo_layout(from.si_signo, from.si_code)) {
+ case SIL_KILL:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ break;
+ case SIL_TIMER:
+ to->si_tid = from.si_tid;
+ to->si_overrun = from.si_overrun;
+ to->si_int = from.si_int;
+ break;
+ case SIL_POLL:
+ to->si_band = from.si_band;
+ to->si_fd = from.si_fd;
+ break;
+ case SIL_FAULT:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
+#endif
+#ifdef BUS_MCEERR_AR
+ if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
+ to->si_addr_lsb = from.si_addr_lsb;
+#endif
+#ifdef BUS_MCEER_AO
+ if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
+ to->si_addr_lsb = from.si_addr_lsb;
+#endif
+#ifdef SEGV_BNDERR
+ if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
+ to->si_lower = compat_ptr(from.si_lower);
+ to->si_upper = compat_ptr(from.si_upper);
+ }
+#endif
+#ifdef SEGV_PKUERR
+ if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
+ to->si_pkey = from.si_pkey;
+#endif
+ break;
+ case SIL_CHLD:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ to->si_status = from.si_status;
+#ifdef CONFIG_X86_X32_ABI
+ if (in_x32_syscall()) {
+ to->si_utime = from._sifields._sigchld_x32._utime;
+ to->si_stime = from._sifields._sigchld_x32._stime;
+ } else
#endif
+ {
+ to->si_utime = from.si_utime;
+ to->si_stime = from.si_stime;
+ }
+ break;
+ case SIL_RT:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ to->si_int = from.si_int;
+ break;
+ case SIL_SYS:
+ to->si_call_addr = compat_ptr(from.si_call_addr);
+ to->si_syscall = from.si_syscall;
+ to->si_arch = from.si_arch;
+ break;
+ }
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
/**
* do_sigtimedwait - wait for queued signals specified in @which
@@ -2937,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
@@ -2978,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info = {};
+ struct siginfo info;
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
@@ -3060,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info = {};
+ siginfo_t info;
int ret = copy_siginfo_from_user32(&info, uinfo);
if (unlikely(ret))
return ret;
@@ -3104,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info = {};
+ siginfo_t info;
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
@@ -3677,6 +3988,7 @@ void __init signals_init(void)
/* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
!= offsetof(struct siginfo, _sifields._pad));
+ BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
@@ -3684,26 +3996,25 @@ void __init signals_init(void)
#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
- * kdb_send_sig_info - Allows kdb to send signals without exposing
+ * kdb_send_sig - Allows kdb to send signals without exposing
* signal internals. This function checks if the required locks are
* available before calling the main signal code, to avoid kdb
* deadlocks.
*/
-void
-kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
+void kdb_send_sig(struct task_struct *t, int sig)
{
static struct task_struct *kdb_prev_t;
- int sig, new_t;
+ int new_t, ret;
if (!spin_trylock(&t->sighand->siglock)) {
kdb_printf("Can't do kill command now.\n"
"The sigmask lock is held somewhere else in "
"kernel, try again later\n");
return;
}
- spin_unlock(&t->sighand->siglock);
new_t = kdb_prev_t != t;
kdb_prev_t = t;
if (t->state != TASK_RUNNING && new_t) {
+ spin_unlock(&t->sighand->siglock);
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
"on the run queue locks. "
@@ -3712,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
"the deadlock.\n");
return;
}
- sig = info->si_signo;
- if (send_sig_info(sig, info, t))
+ ret = send_signal(sig, SEND_SIG_PRIV, t, false);
+ spin_unlock(&t->sighand->siglock);
+ if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
sig, t->pid);
else
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2f5e87f1bae2..24d243ef8e71 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -665,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu)
*/
__do_softirq();
local_irq_enable();
- cond_resched_rcu_qs();
+ cond_resched();
return;
}
local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index 83ffd7dccf23..f2289de20e19 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid);
*/
int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
-int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d46728577..f98f28c12020 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
@@ -1374,13 +1376,6 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "hugepages_treat_as_movable",
- .data = &hugepages_treat_as_movable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{
.procname = "nr_overcommit_hugepages",
.data = NULL,
@@ -1819,8 +1814,7 @@ static struct ctl_table fs_table[] = {
.data = &pipe_max_size,
.maxlen = sizeof(pipe_max_size),
.mode = 0644,
- .proc_handler = &pipe_proc_fn,
- .extra1 = &pipe_min_size,
+ .proc_handler = proc_dopipe_max_size,
},
{
.procname = "pipe-user-pages-hard",
@@ -2622,29 +2616,17 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
-struct do_proc_dopipe_max_size_conv_param {
- unsigned int *min;
-};
-
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
- struct do_proc_dopipe_max_size_conv_param *param = data;
-
if (write) {
unsigned int val;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
-
val = round_pipe_size(*lvalp);
if (val == 0)
return -EINVAL;
- if (param->min && *param->min > val)
- return -ERANGE;
-
*valp = val;
} else {
unsigned int val = *valp;
@@ -2654,14 +2636,11 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
return 0;
}
-int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- struct do_proc_dopipe_max_size_conv_param param = {
- .min = (unsigned int *) table->extra1,
- };
return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, &param);
+ do_proc_dopipe_max_size_conv, NULL);
}
static void validate_coredump_safety(void)
@@ -3167,12 +3146,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
-int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3216,7 +3189,6 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4559e914452b..4e62a4a8fa91 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -194,11 +194,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
{
struct task_struct *tsk;
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
+ tsk = find_get_task_by_vpid(pid);
if (!tsk)
return -ESRCH;
fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
+ select CPU_ISOLATION
help
Adaptively try to shutdown the tick whenever possible, even when
the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..23788100e214 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -37,7 +37,6 @@
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
-#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/seq_file.h>
@@ -60,6 +59,15 @@
#include "tick-internal.h"
/*
+ * Masks for selecting the soft and hard context timers from
+ * cpu_base->active
+ */
+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+
+/*
* The timer bases:
*
* There are more clockids than hrtimer bases. Thus, we index
@@ -70,7 +78,6 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -93,6 +100,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
+ {
+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME_SOFT,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ },
+ {
+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ },
+ {
+ .index = HRTIMER_BASE_TAI_SOFT,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ },
}
};
@@ -118,7 +145,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .seq = SEQCNT_ZERO(migration_cpu_base),
.clock_base = { { .cpu_base = &migration_cpu_base, }, },
};
@@ -156,45 +182,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * With HIGHRES=y we do not migrate the timer when it is expiring
- * before the next event on the target cpu because we cannot reprogram
- * the target cpu hardware and we would cause it to fire late.
+ * We do not migrate the timer when it is expiring before the next
+ * event on the target cpu. When high resolution is enabled, we cannot
+ * reprogram the target cpu hardware and we would cause it to fire
+ * late. To keep it simple, we handle the high resolution enabled and
+ * disabled case similar.
*
* Called with cpu_base->lock of target cpu held.
*/
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires;
- if (!new_base->cpu_base->hres_active)
- return 0;
-
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires <= new_base->cpu_base->expires_next;
-#else
- return 0;
-#endif
+ return expires < new_base->cpu_base->expires_next;
}
-#ifdef CONFIG_NO_HZ_COMMON
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return base;
- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
-}
-#else
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) && !pinned)
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#endif
return base;
}
-#endif
/*
* We switch the timer base to a power-optimized selected CPU target,
@@ -396,7 +410,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
debug_object_init(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer)
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -429,8 +444,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
+
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
@@ -442,10 +459,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
trace_hrtimer_init(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer)
+static inline void debug_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
- debug_hrtimer_activate(timer);
- trace_hrtimer_start(timer);
+ debug_hrtimer_activate(timer, mode);
+ trace_hrtimer_start(timer, mode);
}
static inline void debug_deactivate(struct hrtimer *timer)
@@ -454,35 +472,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *timer)
+static struct hrtimer_clock_base *
+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
- cpu_base->next_timer = timer;
-#endif
+ unsigned int idx;
+
+ if (!*active)
+ return NULL;
+
+ idx = __ffs(*active);
+ *active &= ~(1U << idx);
+
+ return &cpu_base->clock_base[idx];
}
-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+#define for_each_active_base(base, cpu_base, active) \
+ while ((base = __next_base((cpu_base), &(active))))
+
+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ unsigned int active,
+ ktime_t expires_next)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
- ktime_t expires, expires_next = KTIME_MAX;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- hrtimer_update_next_timer(cpu_base, NULL);
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
- if (!(active & 0x01))
- continue;
-
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
- hrtimer_update_next_timer(cpu_base, timer);
+ if (timer->is_soft)
+ cpu_base->softirq_next_timer = timer;
+ else
+ cpu_base->next_timer = timer;
}
}
/*
@@ -494,7 +520,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
expires_next = 0;
return expires_next;
}
-#endif
+
+/*
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ *
+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
+ * those timers will get run whenever the softirq gets handled, at the end of
+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
+ *
+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
+ *
+ * @active_mask must be one of:
+ * - HRTIMER_ACTIVE_ALL,
+ * - HRTIMER_ACTIVE_SOFT, or
+ * - HRTIMER_ACTIVE_HARD.
+ */
+static ktime_t
+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+{
+ unsigned int active;
+ struct hrtimer *next_timer = NULL;
+ ktime_t expires_next = KTIME_MAX;
+
+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ cpu_base->softirq_next_timer = NULL;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+
+ next_timer = cpu_base->softirq_next_timer;
+ }
+
+ if (active_mask & HRTIMER_ACTIVE_HARD) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ cpu_base->next_timer = next_timer;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ }
+
+ return expires_next;
+}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
@@ -502,36 +568,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
offs_real, offs_boot, offs_tai);
-}
-/* High resolution timer related functions */
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer enabled ?
- */
-static bool hrtimer_hres_enabled __read_mostly = true;
-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
-EXPORT_SYMBOL_GPL(hrtimer_resolution);
-
-/*
- * Enable / Disable high resolution mode
- */
-static int __init setup_hrtimer_hres(char *str)
-{
- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
-}
-
-__setup("highres=", setup_hrtimer_hres);
+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
-{
- return hrtimer_hres_enabled;
+ return now;
}
/*
@@ -539,7 +583,8 @@ static inline int hrtimer_is_hres_enabled(void)
*/
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
- return cpu_base->hres_active;
+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+ cpu_base->hres_active : 0;
}
static inline int hrtimer_hres_active(void)
@@ -557,10 +602,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- if (!cpu_base->hres_active)
- return;
+ /*
+ * Find the current next expiration time.
+ */
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- expires_next = __hrtimer_get_next_event(cpu_base);
+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
+ /*
+ * When the softirq is activated, hrtimer has to be
+ * programmed with the first hard hrtimer because soft
+ * timer interrupt could occur too late.
+ */
+ if (cpu_base->softirq_activated)
+ expires_next = __hrtimer_get_next_event(cpu_base,
+ HRTIMER_ACTIVE_HARD);
+ else
+ cpu_base->softirq_expires_next = expires_next;
+ }
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -568,6 +626,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next = expires_next;
/*
+ * If hres is not active, hardware does not have to be
+ * reprogrammed yet.
+ *
* If a hang was detected in the last timer interrupt then we
* leave the hang delay active in the hardware. We want the
* system to make progress. That also prevents the following
@@ -581,81 +642,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* set. So we'd effectivly block all timers until the T2 event
* fires.
*/
- if (cpu_base->hang_detected)
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(cpu_base->expires_next, 1);
}
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
/*
- * When a timer is enqueued and expires earlier than the already enqueued
- * timers, we have to check, whether it expires earlier than the timer for
- * which the clock event device was armed.
- *
- * Called with interrupts disabled and base->cpu_base.lock held
+ * High resolution timer enabled ?
*/
-static void hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
-
- /*
- * If the timer is not on the current cpu, we cannot reprogram
- * the other cpus clock event device.
- */
- if (base->cpu_base != cpu_base)
- return;
-
- /*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
- */
- if (cpu_base->in_hrtirq)
- return;
-
- /*
- * CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Set it to 0.
- */
- if (expires < 0)
- expires = 0;
-
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
- cpu_base->next_timer = timer;
-
- /*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
- */
- if (cpu_base->hang_detected)
- return;
+static bool hrtimer_hres_enabled __read_mostly = true;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
- /*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
- */
- cpu_base->expires_next = expires;
- tick_program_event(expires, 1);
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
+__setup("highres=", setup_hrtimer_hres);
+
/*
- * Initialize the high resolution related parts of cpu_base
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
*/
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+static inline int hrtimer_is_hres_enabled(void)
{
- base->expires_next = KTIME_MAX;
- base->hres_active = 0;
+ return hrtimer_hres_enabled;
}
/*
@@ -667,7 +685,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!base->hres_active)
+ if (!__hrtimer_hres_active(base))
return;
raw_spin_lock(&base->lock);
@@ -714,23 +732,102 @@ void clock_was_set_delayed(void)
#else
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
-static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
-static inline void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return 0;
-}
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ struct hrtimer_clock_base *base = timer->base;
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Set it to 0.
+ */
+ if (expires < 0)
+ expires = 0;
+
+ if (timer->is_soft) {
+ /*
+ * soft hrtimer could be started on a remote CPU. In this
+ * case softirq_expires_next needs to be updated on the
+ * remote CPU. The soft hrtimer will not expire before the
+ * first hard hrtimer on the remote CPU -
+ * hrtimer_check_target() prevents this case.
+ */
+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
+
+ if (timer_cpu_base->softirq_activated)
+ return;
+
+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
+ return;
+
+ timer_cpu_base->softirq_next_timer = timer;
+ timer_cpu_base->softirq_expires_next = expires;
+
+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
+ !reprogram)
+ return;
+ }
+
+ /*
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
+ */
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
+
+ if (expires >= cpu_base->expires_next)
+ return;
+
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
+ cpu_base->expires_next = expires;
+
+ /*
+ * If hres is not active, hardware does not have to be
+ * programmed yet.
+ *
+ * If a hang was detected in the last timer interrupt then we
+ * do not schedule a timer which is earlier than the expiry
+ * which we enforced in the hang detection. We want the system
+ * to make progress.
+ */
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ return;
+
+ /*
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
+ */
+ tick_program_event(expires, 1);
+}
+
+/*
* Clock realtime was set
*
* Change the offset of the realtime clock vs. the monotonic
@@ -835,9 +932,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* Returns 1 when the new timer is the leftmost timer in the tree.
*/
static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
- debug_activate(timer);
+ debug_activate(timer, mode);
base->cpu_base->active_bases |= 1 << base->index;
@@ -870,7 +968,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
-#ifdef CONFIG_HIGH_RES_TIMERS
/*
* Note: If reprogram is false we do not update
* cpu_base->next_timer. This happens when we remove the first
@@ -881,7 +978,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
*/
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
-#endif
}
/*
@@ -930,22 +1026,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- */
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+static void
+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- struct hrtimer_clock_base *base, *new_base;
- unsigned long flags;
- int leftmost;
+ ktime_t expires;
- base = lock_hrtimer_base(timer, &flags);
+ /*
+ * Find the next SOFT expiration.
+ */
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+
+ /*
+ * reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time than the next hard
+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
+ */
+ if (expires == KTIME_MAX)
+ return;
+
+ /*
+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ */
+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
+}
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode,
+ struct hrtimer_clock_base *base)
+{
+ struct hrtimer_clock_base *new_base;
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
@@ -960,21 +1070,35 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- leftmost = enqueue_hrtimer(timer, new_base);
- if (!leftmost)
- goto unlock;
+ return enqueue_hrtimer(timer, new_base, mode);
+}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ * softirq based mode is considered for debug purpose only!
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+
+ /*
+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+ * match.
+ */
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
+ hrtimer_reprogram(timer, true);
- if (!hrtimer_is_hres_active(timer)) {
- /*
- * Kick to reschedule the next tick to handle the new timer
- * on dynticks target.
- */
- if (new_base->cpu_base->nohz_active)
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else {
- hrtimer_reprogram(timer, new_base);
- }
-unlock:
unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1072,7 +1196,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base);
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1095,17 +1219,24 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
+ bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
+ int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
struct hrtimer_cpu_base *cpu_base;
- int base;
memset(timer, 0, sizeof(struct hrtimer));
cpu_base = raw_cpu_ptr(&hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ /*
+ * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
+ * clock modifications, so they needs to become CLOCK_MONOTONIC to
+ * ensure POSIX compliance.
+ */
+ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
- base = hrtimer_clockid_to_base(clock_id);
+ base += hrtimer_clockid_to_base(clock_id);
+ timer->is_soft = softtimer;
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1114,7 +1245,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: timer mode abs/rel
+ * @mode: The modes which are relevant for intitialization:
+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
+ * HRTIMER_MODE_REL_SOFT
+ *
+ * The PINNED variants of the above can be handed in,
+ * but the PINNED bit is ignored as pinning happens
+ * when the hrtimer is started
*/
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
@@ -1133,19 +1270,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
*/
bool hrtimer_active(const struct hrtimer *timer)
{
- struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
unsigned int seq;
do {
- cpu_base = READ_ONCE(timer->base->cpu_base);
- seq = raw_read_seqcount_begin(&cpu_base->seq);
+ base = READ_ONCE(timer->base);
+ seq = raw_read_seqcount_begin(&base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ base->running == timer)
return true;
- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
- cpu_base != READ_ONCE(timer->base->cpu_base));
+ } while (read_seqcount_retry(&base->seq, seq) ||
+ base != READ_ONCE(timer->base));
return false;
}
@@ -1171,7 +1308,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now)
+ struct hrtimer *timer, ktime_t *now,
+ unsigned long flags)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1179,16 +1317,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- cpu_base->running = timer;
+ base->running = timer;
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
@@ -1202,15 +1340,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
timer->is_rel = false;
/*
- * Because we run timers from hardirq context, there is no chance
- * they get migrated to another cpu, therefore its safe to unlock
- * the timer base.
+ * The timer is marked as running in the CPU base, so it is
+ * protected against migration to a different CPU even if the lock
+ * is dropped.
*/
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
restart = fn(timer);
trace_hrtimer_expire_exit(timer);
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irq(&cpu_base->lock);
/*
* Note: We clear the running state after enqueue_hrtimer and
@@ -1223,33 +1361,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
*/
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base);
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running.timer == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
- WARN_ON_ONCE(cpu_base->running != timer);
- cpu_base->running = NULL;
+ WARN_ON_ONCE(base->running != timer);
+ base->running = NULL;
}
-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
+ unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
+ struct hrtimer_clock_base *base;
+ unsigned int active = cpu_base->active_bases & active_mask;
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(active & 0x01))
- continue;
-
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1272,11 +1408,28 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
}
}
}
+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
+ ktime_t now;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
+
+ cpu_base->softirq_activated = 0;
+ hrtimer_update_softirq_timer(cpu_base, true);
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1287,13 +1440,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
+ unsigned long flags;
int retries = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
cpu_base->in_hrtirq = 1;
@@ -1306,17 +1460,23 @@ retry:
*/
cpu_base->expires_next = KTIME_MAX;
- __hrtimer_run_queues(cpu_base, now);
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
/* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base);
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
/* Reprogramming necessary ? */
if (!tick_program_event(expires_next, 0)) {
@@ -1337,7 +1497,7 @@ retry:
* Acquire base lock for updating the offsets and retrieving
* the current time.
*/
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
@@ -1350,7 +1510,8 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
delta = ktime_sub(now, entry_time);
if ((unsigned int)delta > cpu_base->max_hang_time)
cpu_base->max_hang_time = (unsigned int) delta;
@@ -1392,6 +1553,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
ktime_t now;
if (__hrtimer_hres_active(cpu_base))
@@ -1409,10 +1571,17 @@ void hrtimer_run_queues(void)
return;
}
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- __hrtimer_run_queues(cpu_base, now);
- raw_spin_unlock(&cpu_base->lock);
+
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
/*
@@ -1590,7 +1759,13 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
- hrtimer_init_hres(cpu_base);
+ cpu_base->active_bases = 0;
+ cpu_base->hres_active = 0;
+ cpu_base->hang_detected = 0;
+ cpu_base->next_timer = NULL;
+ cpu_base->softirq_next_timer = NULL;
+ cpu_base->expires_next = KTIME_MAX;
+ cpu_base->softirq_expires_next = KTIME_MAX;
return 0;
}
@@ -1622,7 +1797,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
}
}
@@ -1634,6 +1809,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);
+ /*
+ * this BH disable ensures that raise_softirq_irqoff() does
+ * not wakeup ksoftirqd (and acquire the pi-lock) while
+ * holding the cpu_base lock
+ */
+ local_bh_disable();
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
@@ -1649,12 +1830,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
&new_base->clock_base[i]);
}
+ /*
+ * The migration might have changed the first expiring softirq
+ * timer on this CPU. Update it.
+ */
+ hrtimer_update_softirq_timer(new_base, false);
+
raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ local_bh_enable();
return 0;
}
@@ -1663,18 +1851,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
+ * @mode: timer mode
+ * @clock_id: timer clock to be used
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode, int clock)
+ const enum hrtimer_mode mode, clockid_t clock_id)
{
struct hrtimer_sleeper t;
@@ -1695,7 +1884,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock, mode);
+ hrtimer_init_on_stack(&t.timer, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_init_sleeper(&t, current);
@@ -1717,7 +1906,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
@@ -1756,7 +1945,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
/**
* schedule_hrtimeout - sleep until timeout
* @expires: timeout value (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 17cdc554c9fe..fe56c4e06c51 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -68,13 +68,13 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return err;
}
-static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
struct posix_clock *clk = get_posix_clock(fp);
- unsigned int result = 0;
+ __poll_t result = 0;
if (!clk)
- return POLLERR;
+ return EPOLLERR;
if (clk->ops.poll)
result = clk->ops.poll(clk, fp, wait);
@@ -216,7 +216,7 @@ struct posix_clock_desc {
static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
{
- struct file *fp = fget(CLOCKID_TO_FD(id));
+ struct file *fp = fget(clockid_to_fd(id));
int err = -EINVAL;
if (!fp)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1f27887aa194..2541bd89f20e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -14,6 +14,7 @@
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/compat.h>
+#include <linux/sched/deadline.h>
#include "posix-timers.h"
@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers,
return 0;
}
+static inline void check_dl_overrun(struct task_struct *tsk)
+{
+ if (tsk->dl.dl_overrun) {
+ tsk->dl.dl_overrun = 0;
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ }
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk,
u64 expires;
unsigned long soft;
+ if (dl_task(tsk))
+ check_dl_overrun(tsk);
+
/*
* If cputime_expires is zero, then there are no active
* per thread CPU timers.
@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk,
struct task_cputime cputime;
unsigned long soft;
+ if (dl_task(tsk))
+ check_dl_overrun(tsk);
+
/*
* If cputimer is not running, then there are no active
* process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
return 1;
}
+ if (dl_task(tsk) && tsk->dl.dl_overrun)
+ return 1;
+
return 0;
}
@@ -1189,9 +1207,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
u64 now;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
- cpu_timer_sample_group(clock_idx, tsk, &now);
- if (oldval) {
+ if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
/*
* We are setting itimer. The *oldval is absolute and we update
* it to be relative, *newval argument is relative and we update
@@ -1363,8 +1380,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
-#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
-#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED)
static int process_cpu_clock_getres(const clockid_t which_clock,
struct timespec64 *tp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..75043046914e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
{
struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
- !same_thread_group(rtn, current) ||
- (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ switch (event->sigev_notify) {
+ case SIGEV_SIGNAL | SIGEV_THREAD_ID:
+ rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+ if (!rtn || !same_thread_group(rtn, current))
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_SIGNAL:
+ case SIGEV_THREAD:
+ if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_NONE:
+ return task_pid(rtn);
+ default:
return NULL;
-
- if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
- return NULL;
-
- return task_pid(rtn);
+ }
}
static struct k_itimer * alloc_posix_timer(void)
@@ -457,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void)
kmem_cache_free(posix_timers_cache, tmr);
return NULL;
}
- memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
+ clear_siginfo(&tmr->sigq->info);
return tmr;
}
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
struct timespec64 ts64;
bool sig_none;
- sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sig_none = timr->it_sigev_notify == SIGEV_NONE;
iv = timr->it_interval;
/* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
- sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
timr->it_active = !sigev_none;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f8e1845aa464..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
-#else
+extern void timers_update_nohz(void);
+# ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+# endif
+#else /* CONFIG_NO_HZ_COMMON */
+static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
#endif
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(bool update_nohz);
-#else
-static inline void timers_update_migration(bool update_nohz) { }
-#endif
-
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..29a5733eff83 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
ts->next_tick = 0;
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside of that check whether the local timer softirq is
+ * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immeditate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
}
/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+ return ts->idle_calls;
+}
+
+/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
@@ -1079,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
- timers_update_migration(true);
+ timers_update_nohz();
}
/**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..48150ab42de9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -200,8 +200,6 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
- bool migration_enabled;
- bool nohz_active;
bool is_idle;
bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
@@ -210,45 +208,64 @@ struct timer_base {
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
+
+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_MUTEX(timer_keys_mutex);
+
+static void timer_update_keys(struct work_struct *work);
+static DECLARE_WORK(timer_update_work, timer_update_keys);
+
+#ifdef CONFIG_SMP
unsigned int sysctl_timer_migration = 1;
-void timers_update_migration(bool update_nohz)
+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
+
+static void timers_update_migration(void)
{
- bool on = sysctl_timer_migration && tick_nohz_active;
- unsigned int cpu;
+ if (sysctl_timer_migration && tick_nohz_active)
+ static_branch_enable(&timers_migration_enabled);
+ else
+ static_branch_disable(&timers_migration_enabled);
+}
+#else
+static inline void timers_update_migration(void) { }
+#endif /* !CONFIG_SMP */
- /* Avoid the loop, if nothing to update */
- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
- return;
+static void timer_update_keys(struct work_struct *work)
+{
+ mutex_lock(&timer_keys_mutex);
+ timers_update_migration();
+ static_branch_enable(&timers_nohz_active);
+ mutex_unlock(&timer_keys_mutex);
+}
- for_each_possible_cpu(cpu) {
- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
- if (!update_nohz)
- continue;
- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
- }
+void timers_update_nohz(void)
+{
+ schedule_work(&timer_update_work);
}
int timer_migration_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- static DEFINE_MUTEX(mutex);
int ret;
- mutex_lock(&mutex);
+ mutex_lock(&timer_keys_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
- timers_update_migration(false);
- mutex_unlock(&mutex);
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
return ret;
}
-#endif
+
+static inline bool is_timers_nohz_active(void)
+{
+ return static_branch_unlikely(&timers_nohz_active);
+}
+#else
+static inline bool is_timers_nohz_active(void) { return false; }
+#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -534,7 +551,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!is_timers_nohz_active())
return;
/*
@@ -823,11 +840,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
@@ -837,11 +853,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
@@ -851,21 +866,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
-#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
-#ifdef CONFIG_SMP
- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
- return get_timer_this_cpu_base(tflags);
- return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#else
- return get_timer_this_cpu_base(tflags);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) &&
+ !(tflags & TIMER_PINNED))
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
+ return get_timer_this_cpu_base(tflags);
}
static inline void forward_timer_base(struct timer_base *base)
{
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long jnow;
/*
@@ -889,16 +903,8 @@ static inline void forward_timer_base(struct timer_base *base)
base->clk = jnow;
else
base->clk = base->next_expiry;
-}
-#else
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base) { }
#endif
+}
/*
@@ -1009,8 +1015,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
- debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
@@ -1034,6 +1038,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
}
+ debug_activate(timer, expires);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1690,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
base->must_forward_clk = false;
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1698,7 +1704,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
@@ -1855,6 +1861,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
}
}
+int timers_prepare_cpu(unsigned int cpu)
+{
+ struct timer_base *base;
+ int b;
+
+ for (b = 0; b < NR_BASES; b++) {
+ base = per_cpu_ptr(&timer_bases[b], cpu);
+ base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->is_idle = false;
+ base->must_forward_clk = true;
+ }
+ return 0;
+}
+
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
diff --git a/kernel/torture.c b/kernel/torture.c
index 637e172835d8..37b94012a3f8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -47,6 +47,7 @@
#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
@@ -60,7 +61,6 @@ static bool verbose;
#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
static int fullstop = FULLSTOP_RMMOD;
static DEFINE_MUTEX(fullstop_mutex);
-static int *torture_runnable;
#ifdef CONFIG_HOTPLUG_CPU
@@ -500,7 +500,7 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
- ftrace_dump(DUMP_ALL);
+ rcu_ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
@@ -572,17 +572,19 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
+ int spt;
+
cond_resched_rcu_qs();
- while (READ_ONCE(stutter_pause_test) ||
- (torture_runnable && !READ_ONCE(*torture_runnable))) {
- if (stutter_pause_test)
- if (READ_ONCE(stutter_pause_test) == 1)
- schedule_timeout_interruptible(1);
- else
- while (READ_ONCE(stutter_pause_test))
- cond_resched();
- else
+ spt = READ_ONCE(stutter_pause_test);
+ for (; spt; spt = READ_ONCE(stutter_pause_test)) {
+ if (spt == 1) {
+ schedule_timeout_interruptible(1);
+ } else if (spt == 2) {
+ while (READ_ONCE(stutter_pause_test))
+ cond_resched();
+ } else {
schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ }
torture_shutdown_absorb(title);
}
}
@@ -596,17 +598,15 @@ static int torture_stutter(void *arg)
{
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
- if (!torture_must_stop()) {
- if (stutter > 1) {
- schedule_timeout_interruptible(stutter - 1);
- WRITE_ONCE(stutter_pause_test, 2);
- }
- schedule_timeout_interruptible(1);
+ if (!torture_must_stop() && stutter > 1) {
WRITE_ONCE(stutter_pause_test, 1);
+ schedule_timeout_interruptible(stutter - 1);
+ WRITE_ONCE(stutter_pause_test, 2);
+ schedule_timeout_interruptible(1);
}
+ WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
schedule_timeout_interruptible(stutter);
- WRITE_ONCE(stutter_pause_test, 0);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void)
* The runnable parameter points to a flag that controls whether or not
* the test is currently runnable. If there is no such flag, pass in NULL.
*/
-bool torture_init_begin(char *ttype, bool v, int *runnable)
+bool torture_init_begin(char *ttype, bool v)
{
mutex_lock(&fullstop_mutex);
if (torture_type != NULL) {
@@ -659,7 +659,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable)
}
torture_type = ttype;
verbose = v;
- torture_runnable = runnable;
fullstop = FULLSTOP_DONTSTOP;
return true;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..0b249e2f0c3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
bool "Enable trace events for preempt and irq disable/enable"
select TRACE_IRQFLAGS
depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ depends on TRACING
default n
help
Enable tracing of disable and enable events for preemption and irqs.
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
@@ -529,6 +530,15 @@ config FUNCTION_PROFILER
If in doubt, say N.
+config BPF_KPROBE_OVERRIDE
+ bool "Enable BPF programs to override a kprobed function"
+ depends on BPF_EVENTS
+ depends on FUNCTION_ERROR_INJECTION
+ default n
+ help
+ Allows BPF to override the execution of a probed function and
+ set a different return value. This is used for error injection.
+
config FTRACE_MCOUNT_RECORD
def_bool y
depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ce99c379c30..fc2838ac8b78 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <linux/error-injection.h>
+
+#include "trace_probe.h"
#include "trace.h"
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+ regs_set_return_value(regs, rc);
+ override_function_with_return(regs);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+ .func = bpf_override_return,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+#endif
+
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
int ret;
@@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
*/
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...) \
- __trace_printk(1 /* Fake ip will not be printed. */, \
+ __trace_printk(0 /* Fake ip */, \
fmt, ##__VA_ARGS__)
#define __BPF_ARG1_TP(...) \
@@ -343,14 +364,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_raw_record *raw)
+ u64 flags, struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
@@ -373,8 +393,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(sd, 0, 0);
- sd->raw = raw;
perf_event_output(event, sd, regs);
return 0;
}
@@ -382,6 +400,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -392,7 +411,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
+
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +429,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
struct perf_raw_frag frag = {
.copy = ctx_copy,
@@ -428,8 +452,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
perf_fetch_caller_regs(regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
BPF_CALL_0(bpf_get_current_task)
@@ -551,6 +577,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_stackid_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+ case BPF_FUNC_override_return:
+ return &bpf_override_return_proto;
+#endif
default:
return tracing_func_proto(func_id);
}
@@ -768,6 +798,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
struct bpf_prog_array *new_array;
int ret = -EEXIST;
+ /*
+ * Kprobe override only works if they are on the function entry,
+ * and only if they are on the opt-in list.
+ */
+ if (prog->kprobe_override &&
+ (!trace_kprobe_on_func_entry(event->tp_event) ||
+ !trace_kprobe_error_injectable(event->tp_event)))
+ return -EINVAL;
+
mutex_lock(&bpf_event_mutex);
if (event->prog)
@@ -820,3 +859,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
unlock:
mutex_unlock(&bpf_event_mutex);
}
+
+int perf_event_query_prog_array(struct perf_event *event, void __user *info)
+{
+ struct perf_event_query_bpf __user *uquery = info;
+ struct perf_event_query_bpf query = {};
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+ if (copy_from_user(&query, uquery, sizeof(query)))
+ return -EFAULT;
+
+ mutex_lock(&bpf_event_mutex);
+ ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
+ uquery->ids,
+ query.ids_len,
+ &uquery->prog_cnt);
+ mutex_unlock(&bpf_event_mutex);
+
+ return ret;
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..eac9ce2c57a2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
};
/*
- * This is used by __kernel_text_address() to return true if the
- * address is on a dynamically allocated trampoline that would
- * not return true for either core_kernel_text() or
- * is_module_text_address().
+ * Used by the stack undwinder to know about dynamic ftrace trampolines.
*/
-bool is_ftrace_trampoline(unsigned long addr)
+struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
- struct ftrace_ops *op;
- bool ret = false;
+ struct ftrace_ops *op = NULL;
/*
* Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
if (op->trampoline && op->trampoline_size)
if (addr >= op->trampoline &&
addr < op->trampoline + op->trampoline_size) {
- ret = true;
- goto out;
+ preempt_enable_notrace();
+ return op;
}
} while_for_each_ftrace_op(op);
-
- out:
preempt_enable_notrace();
- return ret;
+ return NULL;
+}
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ return ftrace_ops_trampoline(addr) != NULL;
}
struct ftrace_page {
@@ -4451,7 +4456,6 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
func_g.type = filter_parse_regex(glob, strlen(glob),
&func_g.search, &not);
func_g.len = strlen(func_g.search);
- func_g.search = glob;
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -5010,7 +5014,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
- parser->buffer[parser->idx] = 0;
ftrace_match_records(iter->hash, parser->buffer, parser->idx);
}
@@ -5324,7 +5327,6 @@ ftrace_graph_release(struct inode *inode, struct file *file)
parser = &fgd->parser;
if (trace_parser_loaded((parser))) {
- parser->buffer[parser->idx] = 0;
ret = ftrace_graph_set_hash(fgd->new_hash,
parser->buffer);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 91874a95060d..dcf1c4dd3efe 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
@@ -623,10 +627,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*
- * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
-int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -661,7 +665,7 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static __always_inline void *
-__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
-{
- return bpage->data + index;
-}
-
static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
{
return bpage->page->data + index;
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
*
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
*
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- if (cpu_buffer->current_context >= 4)
+ unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
+ int bit;
+
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+ if (unlikely(val & (1 << bit)))
return 1;
- cpu_buffer->current_context++;
- /* Interrupts must see this update */
- barrier();
+ val |= (1 << bit);
+ cpu_buffer->current_context = val;
return 0;
}
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- /* Don't let the dec leak out */
- barrier();
- cpu_buffer->current_context--;
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
/**
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct buffer_data_page *bpage = data;
+ struct page *page = virt_to_page(bpage);
unsigned long flags;
+ /* If the page is still in use someplace else, we can't reuse it */
+ if (page_ref_count(page) > 1)
+ goto out;
+
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ out:
free_page((unsigned long)bpage);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73e67b68c53b..20a2300ae4e8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
}
/**
- * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
* @pid_list: The list to modify
* @self: The current task for fork or NULL for exit
* @task: The task to add or remove
@@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
ubuf += ret;
cnt -= ret;
- parser.buffer[parser.idx] = 0;
-
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
@@ -925,7 +923,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
}
/**
- * trace_snapshot - take a snapshot of the current buffer.
+ * tracing_snapshot - take a snapshot of the current buffer.
*
* This causes a swap between the snapshot buffer and the current live
* tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1002,9 @@ int tracing_alloc_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
/**
- * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
*
- * This is similar to trace_snapshot(), but it will allocate the
+ * This is similar to tracing_snapshot(), but it will allocate the
* snapshot buffer if it isn't already allocated. Use this only
* where it is safe to sleep, as the allocation may sleep.
*
@@ -1236,18 +1234,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
cnt--;
}
+ parser->idx = 0;
+
/* only spaces were written */
- if (isspace(ch)) {
+ if (isspace(ch) || !ch) {
*ppos += read;
ret = read;
goto out;
}
-
- parser->idx = 0;
}
/* read the non-space input */
- while (cnt && !isspace(ch)) {
+ while (cnt && !isspace(ch) && ch) {
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
else {
@@ -1262,12 +1260,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
}
/* We either got finished input or we have to wait for another call. */
- if (isspace(ch)) {
+ if (isspace(ch) || !ch) {
parser->buffer[parser->idx] = 0;
parser->cont = false;
} else if (parser->idx < parser->size - 1) {
parser->cont = true;
parser->buffer[parser->idx++] = ch;
+ /* Make sure the parsed string always terminates with '\0'. */
+ parser->buffer[parser->idx] = 0;
} else {
ret = -EINVAL;
goto out;
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
*/
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+/*
+ * Skip 3:
+ *
+ * trace_buffer_unlock_commit_regs()
+ * trace_event_buffer_commit()
+ * trace_event_raw_event_xxx()
+*/
+# define STACK_SKIP 3
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
/*
- * If regs is not set, then skip the following callers:
- * trace_buffer_unlock_commit_regs
- * event_trigger_unlock_commit
- * trace_event_buffer_commit
- * trace_event_raw_event_sched_switch
+ * If regs is not set, then skip the necessary functions.
* Note, we can still get here via blktrace, wakeup tracer
* and mmiotrace, but that's ok if they lose a function or
- * two. They are that meaningful.
+ * two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export,
entry = ring_buffer_event_data(event);
size = ring_buffer_event_length(event);
- export->write(entry, size);
+ export->write(export, entry, size);
}
static DEFINE_MUTEX(ftrace_export_lock);
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
- * Add two, for this function and the call to save_stack_trace()
+ * Add one, for this function and the call to save_stack_trace()
* If regs is set, then these functions will not be in the way.
*/
+#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip += 2;
+ trace.skip++;
+#endif
/*
* Since events can happen in NMIs there's no safe way to
@@ -2682,17 +2689,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
if (unlikely(in_nmi()))
return;
- /*
- * It is possible that a function is being traced in a
- * location that RCU is not watching. A call to
- * rcu_irq_enter() will make sure that it is, but there's
- * a few internal rcu functions that could be traced
- * where that wont work either. In those cases, we just
- * do nothing.
- */
- if (unlikely(rcu_irq_enter_disabled()))
- return;
-
rcu_irq_enter_irqson();
__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
rcu_irq_exit_irqson();
@@ -2711,11 +2707,10 @@ void trace_dump_stack(int skip)
local_save_flags(flags);
- /*
- * Skip 3 more, seems to get us at the caller of
- * this function.
- */
- skip += 3;
+#ifndef CONFIG_UNWINDER_ORC
+ /* Skip 1 to skip this function. */
+ skip++;
+#endif
__ftrace_trace_stack(global_trace.trace_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
@@ -4178,37 +4173,30 @@ static const struct file_operations show_traces_fops = {
.llseek = seq_lseek,
};
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
+ char *mask_str;
int len;
- mutex_lock(&tracing_cpumask_update_lock);
+ len = snprintf(NULL, 0, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str)
+ return -ENOMEM;
- len = snprintf(mask_str, count, "%*pb\n",
+ len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
if (len >= count) {
count = -EINVAL;
goto out_err;
}
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
out_err:
- mutex_unlock(&tracing_cpumask_update_lock);
+ kfree(mask_str);
return count;
}
@@ -4228,8 +4216,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (err)
goto err_unlock;
- mutex_lock(&tracing_cpumask_update_lock);
-
local_irq_disable();
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
@@ -4252,8 +4238,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
- mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
return count;
@@ -5632,26 +5616,26 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
return 0;
}
-static unsigned int
+static __poll_t
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
struct trace_array *tr = iter->tr;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
if (tr->trace_flags & TRACE_ITER_BLOCK)
/*
* Always select as readable when in blocking mode
*/
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
else
return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
filp, poll_table);
}
-static unsigned int
+static __poll_t
tracing_poll_pipe(struct file *filp, poll_table *poll_table)
{
struct trace_iterator *iter = filp->private_data;
@@ -6605,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
return ret;
}
-static unsigned int
+static __poll_t
tracing_buffers_poll(struct file *filp, poll_table *poll_table)
{
struct ftrace_buffer_info *info = filp->private_data;
@@ -6780,7 +6764,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6818,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -7599,6 +7575,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -7622,7 +7599,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 79f838a75077..22fee766081b 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg)
* this thread will never voluntarily schedule which would
* block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched_rcu_qs();
+ cond_resched();
}
return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..05c7172c6667 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
if (*parser.buffer == '!')
set = 0;
- parser.buffer[parser.idx] = 0;
-
ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
if (ret)
goto out_put;
@@ -2213,6 +2211,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2220,15 +2219,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelyhood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip seaching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 61e7f0678d33..a764aec3c9a1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -400,7 +400,6 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
for (i = 0; i < len; i++) {
if (buff[i] == '*') {
if (!i) {
- *search = buff + 1;
type = MATCH_END_ONLY;
} else if (i == len - 1) {
if (type == MATCH_END_ONLY)
@@ -410,14 +409,14 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
buff[i] = 0;
break;
} else { /* pattern continues, use full glob */
- type = MATCH_GLOB;
- break;
+ return MATCH_GLOB;
}
} else if (strchr("[?\\", buff[i])) {
- type = MATCH_GLOB;
- break;
+ return MATCH_GLOB;
}
}
+ if (buff[0] == '*')
+ *search = buff + 1;
return type;
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_UNWINDER_ORC
+/* Skip 2:
+ * event_triggers_post_call()
+ * trace_event_raw_event_xxx()
+ */
+# define STACK_SKIP 2
+#else
/*
- * Skip 3:
+ * Skip 4:
* stacktrace_trigger()
* event_triggers_post_call()
+ * trace_event_buffer_commit()
* trace_event_raw_event_xxx()
*/
-#define STACK_SKIP 3
+#define STACK_SKIP 4
+#endif
static void
stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
+#ifdef CONFIG_UNWINDER_ORC
+/*
+ * Skip 2:
+ *
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 2
+#else
+/*
+ * Skip 3:
+ * __trace_stack()
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 3
+#endif
+
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
- /*
- * skip over 5 funcs:
- * __ftrace_trace_stack,
- * __trace_stack,
- * function_stack_trace_call
- * ftrace_list_func
- * ftrace_call
- */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, flags, STACK_SKIP, pc);
}
atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
tracer_tracing_off(tr);
}
+#ifdef CONFIG_UNWINDER_ORC
/*
- * Skip 4:
+ * Skip 3:
+ *
+ * function_trace_probe_call()
+ * ftrace_ops_assist_func()
+ * ftrace_call()
+ */
+#define FTRACE_STACK_SKIP 3
+#else
+/*
+ * Skip 5:
+ *
+ * __trace_stack()
* ftrace_stacktrace()
* function_trace_probe_call()
- * ftrace_ops_list_func()
+ * ftrace_ops_assist_func()
* ftrace_call()
*/
-#define STACK_SKIP 4
+#define FTRACE_STACK_SKIP 5
+#endif
static __always_inline void trace_stack(struct trace_array *tr)
{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
local_save_flags(flags);
pc = preempt_count();
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
}
static void
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/rculist.h>
+#include <linux/error-injection.h>
#include "trace_probe.h"
@@ -42,7 +43,6 @@ struct trace_kprobe {
(offsetof(struct trace_kprobe, tp.args) + \
(sizeof(struct probe_arg) * (n)))
-
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
@@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
return nhit;
}
+bool trace_kprobe_on_func_entry(struct trace_event_call *call)
+{
+ struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+
+ return kprobe_on_func_entry(tk->rp.kp.addr,
+ tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
+ tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
+}
+
+bool trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+ struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+ unsigned long addr;
+
+ if (tk->symbol) {
+ addr = (unsigned long)
+ kallsyms_lookup_name(trace_kprobe_symbol(tk));
+ addr += tk->rp.kp.offset;
+ } else {
+ addr = (unsigned long)tk->rp.kp.addr;
+ }
+ return within_error_injection_list(addr);
+}
+
static int register_kprobe_event(struct trace_kprobe *tk);
static int unregister_kprobe_event(struct trace_kprobe *tk);
@@ -1170,7 +1194,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static void
+static int
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1203,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
int size, __size, dsize;
int rctx;
- if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
- return;
+ if (bpf_prog_array_valid(call)) {
+ unsigned long orig_ip = instruction_pointer(regs);
+ int ret;
+
+ ret = trace_call_bpf(call, regs);
+
+ /*
+ * We need to check and see if we modified the pc of the
+ * pt_regs, and if so clear the kprobe and return 1 so that we
+ * don't do the single stepping.
+ * The ftrace kprobe handler leaves it up to us to re-enable
+ * preemption here before returning if we've modified the ip.
+ */
+ if (orig_ip != instruction_pointer(regs)) {
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ if (!ret)
+ return 0;
+ }
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
- return;
+ return 0;
dsize = __get_data_size(&tk->tp, regs);
__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1236,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
- return;
+ return 0;
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
+ return 0;
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1275,6 +1319,7 @@ static int kprobe_register(struct trace_event_call *event,
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+ int ret = 0;
raw_cpu_inc(*tk->nhit);
@@ -1282,9 +1327,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
kprobe_trace_func(tk, regs);
#ifdef CONFIG_PERF_EVENTS
if (tk->tp.flags & TP_FLAG_PROFILE)
- kprobe_perf_func(tk, regs);
+ ret = kprobe_perf_func(tk, regs);
#endif
- return 0; /* We don't tweek kernel, so just return 0 */
+ return ret;
}
NOKPROBE_SYMBOL(kprobe_dispatcher);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,6 +252,8 @@ struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
void free_symbol_cache(struct symbol_cache *sc);
struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+bool trace_kprobe_on_func_entry(struct trace_event_call *call);
+bool trace_kprobe_error_injectable(struct trace_event_call *call);
#else
/* uprobes do not support symbol fetch methods */
#define fetch_symbol_u8 NULL
@@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
{
return NULL;
}
+
+static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
+{
+ return false;
+}
+
+static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+ return false;
+}
#endif /* CONFIG_KPROBE_EVENTS */
struct probe_arg {
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 8cda06a10d66..c364cf777e1a 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -1,13 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/compiler.h>
#include "trace.h"
-int DYN_FTRACE_TEST_NAME(void)
+noinline __noclone int DYN_FTRACE_TEST_NAME(void)
{
/* used to call mcount */
return 0;
}
-int DYN_FTRACE_TEST_NAME2(void)
+noinline __noclone int DYN_FTRACE_TEST_NAME2(void)
{
/* used to call mcount */
return 0;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (__this_cpu_read(disable_stack_tracer) != 1)
goto out;
+ /* If rcu is not watching, then save stack trace can fail */
+ if (!rcu_is_watching())
+ goto out;
+
ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 40592e7b3568..268029ae1be6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
/* Don't print "0x (null)" when offset is 0 */
if (tu->offset) {
- seq_printf(m, "0x%p", (void *)tu->offset);
+ seq_printf(m, "0x%px", (void *)tu->offset);
} else {
switch (sizeof(void *)) {
case 4:
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 685c50ae6300..671b13457387 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp,
}
/*
- * rcu_assign_pointer has a smp_wmb() which makes sure that the new
- * probe callbacks array is consistent before setting a pointer to it.
- * This array is referenced by __DO_TRACE from
- * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
- * is used.
+ * rcu_assign_pointer has as smp_store_release() which makes sure
+ * that the new probe callbacks array is consistent before setting
+ * a pointer to it. This array is referenced by __DO_TRACE from
+ * include/linux/tracepoint.h using rcu_dereference_sched().
*/
rcu_assign_pointer(tp->funcs, tp_funcs);
if (!static_key_enabled(&tp->key))
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710bfdd7..017044c26233 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
-#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
@@ -48,6 +47,8 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/sched/isolation.h>
+#include <linux/nmi.h>
#include "workqueue_internal.h"
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because wq_unbind_fn() releases
+ * Sanity check nr_running. Because unbind_workers() releases
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
@@ -2135,7 +2136,7 @@ __acquires(&pool->lock)
* stop_machine. At the same time, report a quiescent RCU state so
* the same condition doesn't freeze RCU.
*/
- cond_resched_rcu_qs();
+ cond_resched();
spin_lock_irq(&pool->lock);
@@ -3806,6 +3807,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
return ret;
}
+EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
/**
* wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -3939,6 +3941,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
return clamp_val(max_active, 1, lim);
}
+/*
+ * Workqueues which may be used during memory reclaim should have a rescuer
+ * to guarantee forward progress.
+ */
+static int init_rescuer(struct workqueue_struct *wq)
+{
+ struct worker *rescuer;
+ int ret;
+
+ if (!(wq->flags & WQ_MEM_RECLAIM))
+ return 0;
+
+ rescuer = alloc_worker(NUMA_NO_NODE);
+ if (!rescuer)
+ return -ENOMEM;
+
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ ret = PTR_ERR_OR_ZERO(rescuer->task);
+ if (ret) {
+ kfree(rescuer);
+ return ret;
+ }
+
+ wq->rescuer = rescuer;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
+ wake_up_process(rescuer->task);
+
+ return 0;
+}
+
struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
@@ -4001,29 +4034,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (alloc_and_link_pwqs(wq) < 0)
goto err_free_wq;
- /*
- * Workqueues which may be used during memory reclaim should
- * have a rescuer to guarantee forward progress.
- */
- if (flags & WQ_MEM_RECLAIM) {
- struct worker *rescuer;
-
- rescuer = alloc_worker(NUMA_NO_NODE);
- if (!rescuer)
- goto err_destroy;
-
- rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
- wq->name);
- if (IS_ERR(rescuer->task)) {
- kfree(rescuer);
- goto err_destroy;
- }
-
- wq->rescuer = rescuer;
- kthread_bind_mask(rescuer->task, cpu_possible_mask);
- wake_up_process(rescuer->task);
- }
+ if (wq_online && init_rescuer(wq) < 0)
+ goto err_destroy;
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
@@ -4463,6 +4475,12 @@ void show_workqueue_state(void)
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
}
@@ -4490,6 +4508,12 @@ void show_workqueue_state(void)
pr_cont("\n");
next_pool:
spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
rcu_read_unlock_sched();
@@ -4510,9 +4534,8 @@ void show_workqueue_state(void)
* cpu comes back online.
*/
-static void wq_unbind_fn(struct work_struct *work)
+static void unbind_workers(int cpu)
{
- int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
@@ -4589,16 +4612,6 @@ static void rebind_workers(struct worker_pool *pool)
spin_lock_irq(&pool->lock);
- /*
- * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
- * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
- * being reworked and this can go away in time.
- */
- if (!(pool->flags & POOL_DISASSOCIATED)) {
- spin_unlock_irq(&pool->lock);
- return;
- }
-
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
@@ -4709,12 +4722,13 @@ int workqueue_online_cpu(unsigned int cpu)
int workqueue_offline_cpu(unsigned int cpu)
{
- struct work_struct unbind_work;
struct workqueue_struct *wq;
/* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
+ if (WARN_ON(cpu != smp_processor_id()))
+ return -1;
+
+ unbind_workers(cpu);
/* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4736,6 @@ int workqueue_offline_cpu(unsigned int cpu)
wq_update_unbound_numa(wq, cpu, false);
mutex_unlock(&wq_pool_mutex);
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
return 0;
}
@@ -4957,6 +4968,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
return -ENOMEM;
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
@@ -5555,7 +5570,7 @@ int __init workqueue_init_early(void)
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5638,6 +5653,8 @@ int __init workqueue_init(void)
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
+ *
+ * Also, while iterating workqueues, create rescuers if requested.
*/
wq_numa_init();
@@ -5649,8 +5666,12 @@ int __init workqueue_init(void)
}
}
- list_for_each_entry(wq, &workqueues, list)
+ list_for_each_entry(wq, &workqueues, list) {
wq_update_unbound_numa(wq, smp_processor_id(), true);
+ WARN(init_rescuer(wq),
+ "workqueue: failed to create early rescuer for %s",
+ wq->name);
+ }
mutex_unlock(&wq_pool_mutex);