Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/inode.c                |   7
-rw-r--r--  kernel/bpf/syscall.c              |  30
-rw-r--r--  kernel/bpf/verifier.c             |  78
-rw-r--r--  kernel/cgroup.c                   |   7
-rw-r--r--  kernel/cpu.c                      |  33
-rw-r--r--  kernel/cpuset.c                   |   4
-rw-r--r--  kernel/events/core.c              |  70
-rw-r--r--  kernel/events/uprobes.c           |   8
-rw-r--r--  kernel/futex.c                    |  27
-rw-r--r--  kernel/irq/ipi.c                  |   1
-rw-r--r--  kernel/kcov.c                     |   3
-rw-r--r--  kernel/kexec_core.c               |   7
-rw-r--r--  kernel/locking/lockdep.c          | 118
-rw-r--r--  kernel/locking/lockdep_proc.c     |   2
-rw-r--r--  kernel/locking/qspinlock_stat.h   |   8
-rw-r--r--  kernel/resource.c                 |  13
-rw-r--r--  kernel/sched/core.c               |  47
-rw-r--r--  kernel/time/tick-sched.c          |  61
-rw-r--r--  kernel/time/tick-sched.h          |   2
-rw-r--r--  kernel/trace/trace_events.c       |   9
-rw-r--r--  kernel/workqueue.c                |  29
21 files changed, 427 insertions(+), 137 deletions(-)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..8f94ca1860cf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ raw = bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- bpf_map_inc(raw, true);
+ raw = bpf_map_inc(raw, true);
break;
default:
WARN_ON_ONCE(1);
@@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
goto out;
raw = bpf_any_get(inode->i_private, *type);
- touch_atime(&path);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
path_put(&path);
return raw;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a2efe1bc76c..cf5e9f7ad13a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -137,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
- "max_entries:\t%u\n",
+ "max_entries:\t%u\n"
+ "map_flags:\t%#x\n",
map->map_type,
map->key_size,
map->value_size,
- map->max_entries);
+ map->max_entries,
+ map->map_flags);
}
#endif
@@ -216,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
- atomic_inc(&map->refcnt);
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
if (uref)
atomic_inc(&map->usercnt);
+ return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -232,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- bpf_map_inc(map, true);
+ map = bpf_map_inc(map, true);
fdput(f);
return map;
@@ -656,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
@@ -668,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
if (IS_ERR(prog))
return prog;
- atomic_inc(&prog->aux->refcnt);
+ prog = bpf_prog_inc(prog);
fdput(f);
return prog;
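
bpf_prog_inc() and bpf_map_inc() above follow a simple bounded-refcount idiom: increment first, then back the increment out and fail with -EBUSY if the cap was crossed. A minimal sketch of the same idiom on a hypothetical object (struct foo, foo_get() and FOO_MAX_REFCNT are illustrative names, not part of the patch):

#define FOO_MAX_REFCNT	32768

struct foo {
	atomic_t refcnt;
	/* ... payload ... */
};

/* Take a reference, or fail instead of letting the counter grow
 * unbounded; mirrors the bpf_prog_inc()/bpf_map_inc() pattern above.
 */
static struct foo *foo_get(struct foo *f)
{
	if (atomic_inc_return(&f->refcnt) > FOO_MAX_REFCNT) {
		atomic_dec(&f->refcnt);
		return ERR_PTR(-EBUSY);
	}
	return f;
}

/* Callers must now check the return value rather than assume success:
 *
 *	f = foo_get(f);
 *	if (IS_ERR(f))
 *		return PTR_ERR(f);
 */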
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..c5c17a62f509 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -239,16 +239,6 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
-static const struct {
- int map_type;
- int func_id;
-} func_limit[] = {
- {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
- {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
-};
-
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -921,27 +911,52 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
{
- bool bool_map, bool_func;
- int i;
-
if (!map)
return 0;
- for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
- bool_map = (map->map_type == func_limit[i].map_type);
- bool_func = (func_id == func_limit[i].func_id);
- /* only when map & func pair match it can continue.
- * don't allow any other map type to be passed into
- * the special func;
- */
- if (bool_func && bool_map != bool_func) {
- verbose("cannot pass map_type %d into func %d\n",
- map->map_type, func_id);
- return -EINVAL;
- }
+ /* We need a two-way check: first from the map's perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_STACK_TRACE:
+ if (func_id != BPF_FUNC_get_stackid)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_get_stackid:
+ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+ goto error;
+ break;
+ default:
+ break;
}
return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
}
static int check_call(struct verifier_env *env, int func_id)
@@ -1374,6 +1389,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
verbose("BPF_LD_ABS uses reserved fields\n");
return -EINVAL;
@@ -2029,7 +2045,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
- fdput(f);
return PTR_ERR(map);
}
@@ -2049,15 +2064,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return -E2BIG;
}
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
-
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
- bpf_map_inc(map, false);
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
fdput(f);
next_insn:
insn++;
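
Seen from the program side, the two-way check closes combinations the old one-way table never looked at, e.g. reading a prog array with a generic lookup helper. A hypothetical samples/bpf-style fragment (map and section names are illustrative, not from the patch) that the verifier now rejects:

/* A PROG_ARRAY only admits bpf_tail_call(); any other helper taking
 * this map, such as bpf_map_lookup_elem() below, now trips the
 * map-side switch and the program fails to load with
 * "cannot pass map_type ... into func ...".
 */
struct bpf_map_def SEC("maps") jump_table = {
	.type		= BPF_MAP_TYPE_PROG_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 8,
};

SEC("kprobe/sys_write")
int rejected_prog(struct pt_regs *ctx)
{
	__u32 key = 0;

	bpf_map_lookup_elem(&jump_table, &key);	/* rejected by the verifier */
	return 0;
}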
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05c0b0f..909a7d31ffd3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
+ struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
- int ret;
+ int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
@@ -2875,8 +2876,10 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ for_each_subsys(ss, ssid)
+ if (ss->post_attach)
+ ss->post_attach();
cgroup_kn_unlock(of->kn);
- cpuset_post_attach_flush();
return ret ?: nbytes;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ea42e8da861..3e3f6e49eabb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -36,6 +36,7 @@
* @target: The target state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
+ * @rollback: Perform a rollback
* @cb_stat: The state for a single callback (install/uninstall)
* @cb: Single callback function (install/uninstall)
* @result: Result of the operation
@@ -47,6 +48,7 @@ struct cpuhp_cpu_state {
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
+ bool rollback;
enum cpuhp_state cb_state;
int (*cb)(unsigned int cpu);
int result;
@@ -301,6 +303,11 @@ static int cpu_notify(unsigned long val, unsigned int cpu)
return __cpu_notify(val, cpu, -1, NULL);
}
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+ BUG_ON(cpu_notify(val, cpu));
+}
+
/* Notifier wrappers for transitioning to state machine */
static int notify_prepare(unsigned int cpu)
{
@@ -477,6 +484,16 @@ static void cpuhp_thread_fun(unsigned int cpu)
} else {
ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
}
+ } else if (st->rollback) {
+ BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+ undo_cpu_down(cpu, st, cpuhp_ap_states);
+ /*
+ * This is a momentary workaround to keep the notifier users
+ * happy. This will go away once we get rid of the notifiers.
+ */
+ cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+ st->rollback = false;
} else {
/* Cannot happen .... */
BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
@@ -636,11 +653,6 @@ static inline void check_for_tasks(int dead_cpu)
read_unlock(&tasklist_lock);
}
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
- BUG_ON(cpu_notify(val, cpu));
-}
-
static int notify_down_prepare(unsigned int cpu)
{
int err, nr_calls = 0;
@@ -721,9 +733,10 @@ static int takedown_cpu(unsigned int cpu)
*/
err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
- /* CPU didn't die: tell everyone. Can't complain. */
- cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+ /* CPU refused to die */
irq_unlock_sparse();
+ /* Unpark the hotplug thread so we can roll back there */
+ kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
return err;
}
BUG_ON(cpu_online(cpu));
@@ -832,6 +845,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+ if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+ st->target = prev_state;
+ st->rollback = true;
+ cpuhp_kick_ap_work(cpu);
+ }
hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
@@ -1249,6 +1267,7 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.name = "notify:online",
.startup = notify_online,
.teardown = notify_down_prepare,
+ .skip_onerr = true,
},
#endif
/*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00ab5c2b7c5b..1902956baba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -58,7 +58,6 @@
#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
@@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.legacy_cftypes = files,
.early_init = true,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..4e2ebf6f2f1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- if (sysctl_perf_cpu_time_max_percent == 100) {
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
printk(KERN_WARNING
"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
WRITE_ONCE(perf_sample_allowed_ns, 0);
@@ -1105,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -2417,14 +2419,24 @@ static void ctx_sched_out(struct perf_event_context *ctx,
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
-
+ /*
+ * Always update the time if it was set, not only when it changes.
+ * Otherwise we can 'forget' to update time for any but the last
+ * context we sched out. For example:
+ *
+ * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
+ * ctx_sched_out(.event_type = EVENT_PINNED)
+ *
+ * would only update time for the pinned events.
+ */
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
}
+ is_active ^= ctx->is_active; /* changed bits */
+
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
@@ -3410,7 +3422,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3424,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
@@ -8403,6 +8405,24 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -8410,7 +8430,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}
if (is_sampling_event(event)) {
@@ -8469,11 +8489,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8532,6 +8547,7 @@ SYSCALL_DEFINE5(perf_event_open,
f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
+ event_file = NULL;
goto err_context;
}
@@ -8570,6 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open,
WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point of no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8641,6 +8662,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);
+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();
mutex_lock(&current->perf_event_mutex);
@@ -8673,6 +8699,9 @@ err_alloc:
*/
if (!event_file)
free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -8957,6 +8986,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 220fc17b9718..7edc95edfaee 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -321,7 +321,7 @@ retry:
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page);
- page_cache_release(new_page);
+ put_page(new_page);
put_old:
put_page(old_page);
@@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
* see uprobe_register().
*/
if (mapping->a_ops->readpage)
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
- page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
copy_from_page(page, offset, insn, nbytes);
- page_cache_release(page);
+ put_page(page);
return 0;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74c89e0..c20f06f38ef3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1295,10 +1295,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
+ } else if (curval != uval) {
+ /*
+ * If an unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1525,8 +1535,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
hb_waiters_dec(hb1);
- plist_add(&q->list, &hb2->chain);
hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -2623,6 +2633,15 @@ retry:
if (ret == -EFAULT)
goto pi_faulted;
/*
+ * An unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+ /*
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c37f34b00a11..14777af8e097 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -94,6 +94,7 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
data = irq_get_irq_data(virq + i);
cpumask_copy(data->common->affinity, dest);
data->common->ipi_offset = offset;
+ irq_set_status_flags(virq + i, IRQ_NO_BALANCING);
}
return virq;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3efbee0834a8..a02f2dddd1d7 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "kcov: " fmt
+#define DISABLE_BRANCH_PROFILING
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/file.h>
@@ -43,7 +44,7 @@ struct kcov {
* Entry point from instrumented code.
* This is called once per basic-block/edge.
*/
-void __sanitizer_cov_trace_pc(void)
+void notrace __sanitizer_cov_trace_pc(void)
{
struct task_struct *t;
enum kcov_mode mode;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8d34308ea449..1391d3ee3b86 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1415,6 +1415,9 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_dtor);
+ VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1447,8 +1450,8 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_X86
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
#endif
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 53ab2f85d77e..78c1c0ee6dc1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1999,6 +1999,79 @@ static inline int get_first_held_lock(struct task_struct *curr,
return ++i;
}
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Returns the next chain_key iteration
+ */
+static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+{
+ u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+
+ printk(" class_idx:%d -> chain_key:%016Lx",
+ class_idx,
+ (unsigned long long)new_chain_key);
+ return new_chain_key;
+}
+
+static void
+print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
+{
+ struct held_lock *hlock;
+ u64 chain_key = 0;
+ int depth = curr->lockdep_depth;
+ int i;
+
+ printk("depth: %u\n", depth + 1);
+ for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ hlock = curr->held_locks + i;
+ chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+
+ print_lock(hlock);
+ }
+
+ print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_lock(hlock_next);
+}
+
+static void print_chain_keys_chain(struct lock_chain *chain)
+{
+ int i;
+ u64 chain_key = 0;
+ int class_id;
+
+ printk("depth: %u\n", chain->depth);
+ for (i = 0; i < chain->depth; i++) {
+ class_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+
+ print_lock_name(lock_classes + class_id);
+ printk("\n");
+ }
+}
+
+static void print_collision(struct task_struct *curr,
+ struct held_lock *hlock_next,
+ struct lock_chain *chain)
+{
+ printk("\n");
+ printk("======================\n");
+ printk("[chain_key collision ]\n");
+ print_kernel_ident();
+ printk("----------------------\n");
+ printk("%s/%d: ", current->comm, task_pid_nr(current));
+ printk("Hash chain already cached but the contents don't match!\n");
+
+ printk("Held locks:");
+ print_chain_keys_held_locks(curr, hlock_next);
+
+ printk("Locks in cached chain:");
+ print_chain_keys_chain(chain);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+#endif
+
/*
* Checks whether the chain and the current held locks are consistent
* in depth and also in content. If they are not it most likely means
@@ -2014,14 +2087,18 @@ static int check_no_collision(struct task_struct *curr,
i = get_first_held_lock(curr, hlock);
- if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+ if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) {
+ print_collision(curr, hlock, chain);
return 0;
+ }
for (j = 0; j < chain->depth - 1; j++, i++) {
id = curr->held_locks[i].class_idx - 1;
- if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+ if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
+ print_collision(curr, hlock, chain);
return 0;
+ }
}
#endif
return 1;
@@ -2099,15 +2176,37 @@ cache_hit:
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
+
+ BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+ BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
+ BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
- nr_chain_hlocks += chain->depth;
for (j = 0; j < chain->depth - 1; j++, i++) {
int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
+
+ if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+ nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * Important for check_no_collision().
+ */
+ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ if (debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
+ }
+#endif
+
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -2855,6 +2954,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 2 * !!task->hardirq_context + !!task->softirq_context;
+}
+
static int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -2863,8 +2967,6 @@ static int separate_irq_context(struct task_struct *curr,
/*
* Keep track of points where we cross into an interrupt context:
*/
- hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
- curr->softirq_context;
if (depth) {
struct held_lock *prev_hlock;
@@ -2896,6 +2998,11 @@ static inline int mark_irqflags(struct task_struct *curr,
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
static inline int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -3164,6 +3271,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
+ hlock->irq_context = task_irq_context(curr);
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dbb61a302548..a0f61effad25 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v)
int i;
if (v == SEQ_START_TOKEN) {
+ if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb2a2c9bc3fc..d734b7502001 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -136,10 +136,12 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
}
if (counter == qstat_pv_hash_hops) {
- u64 frac;
+ u64 frac = 0;
- frac = 100ULL * do_div(stat, kicks);
- frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ if (kicks) {
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ }
/*
* Return a X.XX decimal number
diff --git a/kernel/resource.c b/kernel/resource.c
index 2e78ead30934..9b5f04404152 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8465eeab8b3..d1f7149f8704 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -321,6 +321,24 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -578,17 +596,8 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * FIFO realtime policy runs the highest priority task (after DEADLINE).
- * Other runnable tasks are of a lower priority. The scheduler tick
- * isn't needed.
- */
- fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
- if (fifo_nr_running)
- return true;
-
- /*
- * Round-robin realtime tasks time slice with other tasks at the same
- * realtime priority.
+ * If there is more than one RR task, we need the tick to effect the
+ * actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
@@ -597,8 +606,20 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
}
- /* Normal multitasking need periodic preemption checks */
- if (rq->cfs.nr_running > 1)
+ /*
+ * If there are no RR tasks but there are FIFO tasks, we can skip the
+ * tick: there is no forced preemption between FIFO tasks.
+ */
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
+ return true;
+
+ /*
+ * If there are no DL, RR or FIFO tasks, only CFS tasks can be left;
+ * if there is more than one, we need the tick for involuntary
+ * preemption.
+ */
+ if (rq->nr_running > 1)
return false;
return true;
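
The fetch_or() macro added above is a cmpxchg() loop that ORs a mask into any integer type and hands back the previous value, so a caller can tell whether it was the one that flipped a bit from 0 to 1. A minimal usage sketch (flags, MY_PENDING and do_one_time_kick() are hypothetical names, not from the patch):

static unsigned long flags;
#define MY_PENDING	0x01UL

static void do_one_time_kick(void)
{
	/* hypothetical one-shot work */
}

static void mark_pending(void)
{
	unsigned long prev;

	/* Atomically set MY_PENDING and fetch the value it had before. */
	prev = fetch_or(&flags, MY_PENDING);

	/* Only the caller that actually set the bit does the one-shot
	 * work; concurrent callers see the bit already set in prev.
	 */
	if (!(prev & MY_PENDING))
		do_one_time_kick();
}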
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 084b79f5917e..58e3310c9b21 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -157,52 +157,50 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
-static unsigned long tick_dep_mask;
+static atomic_t tick_dep_mask;
-static void trace_tick_dependency(unsigned long dep)
+static bool check_tick_dependency(atomic_t *dep)
{
- if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+ int val = atomic_read(dep);
+
+ if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+ if (val & TICK_DEP_MASK_PERF_EVENTS) {
trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_SCHED) {
+ if (val & TICK_DEP_MASK_SCHED) {
trace_tick_stop(0, TICK_DEP_MASK_SCHED);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+ if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+ return true;
+ }
+
+ return false;
}
static bool can_stop_full_tick(struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
- if (tick_dep_mask) {
- trace_tick_dependency(tick_dep_mask);
+ if (check_tick_dependency(&tick_dep_mask))
return false;
- }
- if (ts->tick_dep_mask) {
- trace_tick_dependency(ts->tick_dep_mask);
+ if (check_tick_dependency(&ts->tick_dep_mask))
return false;
- }
- if (current->tick_dep_mask) {
- trace_tick_dependency(current->tick_dep_mask);
+ if (check_tick_dependency(&current->tick_dep_mask))
return false;
- }
- if (current->signal->tick_dep_mask) {
- trace_tick_dependency(current->signal->tick_dep_mask);
+ if (check_tick_dependency(&current->signal->tick_dep_mask))
return false;
- }
return true;
}
@@ -259,12 +257,12 @@ static void tick_nohz_full_kick_all(void)
preempt_enable();
}
-static void tick_nohz_dep_set_all(unsigned long *dep,
+static void tick_nohz_dep_set_all(atomic_t *dep,
enum tick_dep_bits bit)
{
- unsigned long prev;
+ int prev;
- prev = fetch_or(dep, BIT_MASK(bit));
+ prev = atomic_fetch_or(dep, BIT(bit));
if (!prev)
tick_nohz_full_kick_all();
}
@@ -280,7 +278,7 @@ void tick_nohz_dep_set(enum tick_dep_bits bit)
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
- clear_bit(bit, &tick_dep_mask);
+ atomic_andnot(BIT(bit), &tick_dep_mask);
}
/*
@@ -289,12 +287,12 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
- unsigned long prev;
+ int prev;
struct tick_sched *ts;
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+ prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit));
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
@@ -313,7 +311,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- clear_bit(bit, &ts->tick_dep_mask);
+ atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
/*
@@ -331,7 +329,7 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- clear_bit(bit, &tsk->tick_dep_mask);
+ atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
/*
@@ -345,7 +343,7 @@ void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
- clear_bit(bit, &sig->tick_dep_mask);
+ atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}
/*
@@ -366,7 +364,8 @@ void __tick_nohz_task_switch(void)
ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped) {
- if (current->tick_dep_mask || current->signal->tick_dep_mask)
+ if (atomic_read(&current->tick_dep_mask) ||
+ atomic_read(&current->signal->tick_dep_mask))
tick_nohz_full_kick();
}
out:
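
With the masks converted to atomic_t, the whole set/test/clear life cycle reduces to atomic_fetch_or()/atomic_read()/atomic_andnot(), keeping the same "kick only on the 0 -> non-zero transition" rule as before. A condensed sketch on a hypothetical standalone mask (demo_dep_mask and the demo_* helpers are illustrative, not one of the four real masks above):

static atomic_t demo_dep_mask;

static void demo_dep_set(enum tick_dep_bits bit)
{
	/* atomic_fetch_or() returns the old mask; zero means this is the
	 * first dependency, so the CPUs must be kicked out of nohz mode.
	 */
	if (!atomic_fetch_or(&demo_dep_mask, BIT(bit)))
		tick_nohz_full_kick_all();
}

static void demo_dep_clear(enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &demo_dep_mask);
}

static bool demo_dep_active(void)
{
	return atomic_read(&demo_dep_mask) != 0;
}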
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index eb4e32566a83..bf38226e5c17 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,7 +60,7 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
- unsigned long tick_dep_mask;
+ atomic_t tick_dep_mask;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..6f965864cc02 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2095,8 +2095,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("filter", 0644, file->dir, file,
&ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ /*
+ * Only event directories that can be enabled should have
+ * triggers.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2232ae3e3ad6..3bfdff06eea7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on an attempt to queue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier, the speculative LOAD on line 8 can
+ * be executed before CPU#0 does the STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit as still set and does not queue a new
+ * execution of the @work, in the hope that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see that
+ * event_indicated is set, because the speculative LOAD was executed
+ * before the actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)