aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c5
-rw-r--r--kernel/bpf/btf.c1
-rw-r--r--kernel/bpf/cgroup.c13
-rw-r--r--kernel/bpf/core.c17
-rw-r--r--kernel/bpf/local_storage.c28
-rw-r--r--kernel/bpf/tnum.c9
-rw-r--r--kernel/bpf/trampoline.c64
-rw-r--r--kernel/bpf/verifier.c89
-rw-r--r--kernel/cgroup/cgroup.c11
-rw-r--r--kernel/cgroup/rstat.c2
-rw-r--r--kernel/cpu.c143
-rw-r--r--kernel/cred.c10
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/exit.c12
-rw-r--r--kernel/fork.c26
-rw-r--r--kernel/futex.c1
-rw-r--r--kernel/irq/cpuhotplug.c21
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/irq/irqdomain.c18
-rw-r--r--kernel/irq/manage.c45
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c8
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/locking/lockdep.c7
-rw-r--r--kernel/locking/mutex.c4
-rw-r--r--kernel/locking/rwsem.c4
-rw-r--r--kernel/locking/spinlock_debug.c32
-rw-r--r--kernel/nsproxy.c41
-rw-r--r--kernel/power/Kconfig5
-rw-r--r--kernel/power/hibernate.c23
-rw-r--r--kernel/power/main.c33
-rw-r--r--kernel/power/snapshot.c48
-rw-r--r--kernel/power/suspend.c2
-rw-r--r--kernel/power/suspend_test.c6
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rseq.c2
-rw-r--r--kernel/sched/cpufreq.c18
-rw-r--r--kernel/sched/cpufreq_schedutil.c8
-rw-r--r--kernel/sched/fair.c13
-rw-r--r--kernel/sched/isolation.c6
-rw-r--r--kernel/sched/psi.c5
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/smp.c99
-rw-r--r--kernel/taskstats.c30
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c121
-rw-r--r--kernel/time/hrtimer.c14
-rw-r--r--kernel/time/namespace.c468
-rw-r--r--kernel/time/posix-clock.c39
-rw-r--r--kernel/time/posix-cpu-timers.c32
-rw-r--r--kernel/time/posix-stubs.c18
-rw-r--r--kernel/time/posix-timers.c88
-rw-r--r--kernel/time/posix-timers.h7
-rw-r--r--kernel/time/sched_clock.c7
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/time/tick-sched.c14
-rw-r--r--kernel/time/vsyscall.c37
-rw-r--r--kernel/trace/fgraph.c14
-rw-r--r--kernel/trace/ftrace.c6
-rw-r--r--kernel/trace/trace.c13
-rw-r--r--kernel/trace/trace_events.c8
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_events_hist.c84
-rw-r--r--kernel/trace/trace_events_inject.c2
-rw-r--r--kernel/trace/trace_events_trigger.c20
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_probe.c8
-rw-r--r--kernel/trace/trace_probe.h9
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_seq.c2
-rw-r--r--kernel/trace/trace_stack.c5
-rw-r--r--kernel/trace/trace_uprobe.c121
-rw-r--r--kernel/trace/tracing_map.c4
-rw-r--r--kernel/up.c12
-rw-r--r--kernel/watchdog.c31
-rw-r--r--kernel/workqueue.c2
78 files changed, 1559 insertions, 587 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 8e09f0f55b4b..17b0d523afb3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -102,12 +102,13 @@ struct audit_net {
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the associated spinlock for writing.
*/
-static struct auditd_connection {
+struct auditd_connection {
struct pid *pid;
u32 portid;
struct net *net;
struct rcu_head rcu;
-} *auditd_conn = NULL;
+};
+static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7d40da240891..ed2075884724 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3470,6 +3470,7 @@ static u8 bpf_ctx_convert_map[] = {
[_id] = __ctx_convert##_id,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
+ 0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9f90d3c92bda..9e43b72eb619 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp)
*/
static void cgroup_bpf_release(struct work_struct *work)
{
- struct cgroup *cgrp = container_of(work, struct cgroup,
- bpf.release_work);
+ struct cgroup *p, *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *old_array;
unsigned int type;
@@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_unlock(&cgroup_mutex);
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
cgroup_put(cgrp);
}
@@ -199,6 +202,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
struct bpf_prog_array *arrays[NR] = {};
+ struct cgroup *p;
int ret, i;
ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
@@ -206,6 +210,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
if (ret)
return ret;
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_get(p);
+
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -1341,7 +1348,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct bpf_sysctl_kern, write,
- FIELD_SIZEOF(struct bpf_sysctl_kern,
+ sizeof_field(struct bpf_sysctl_kern,
write),
target_size));
break;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49e32acad7d8..af6b738cf435 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2043,23 +2043,28 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
for_each_cgroup_storage_type(stype) {
if (!aux->cgroup_storage[stype])
continue;
- bpf_cgroup_storage_release(aux->prog,
- aux->cgroup_storage[stype]);
+ bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
}
}
-static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+void __bpf_free_used_maps(struct bpf_prog_aux *aux,
+ struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
- int i;
+ u32 i;
bpf_free_cgroup_storage(aux);
- for (i = 0; i < aux->used_map_cnt; i++) {
- map = aux->used_maps[i];
+ for (i = 0; i < len; i++) {
+ map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
bpf_map_put(map);
}
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
kfree(aux->used_maps);
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2ba750725cb2..33d01866bcc2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map {
struct bpf_map map;
spinlock_t lock;
- struct bpf_prog *prog;
+ struct bpf_prog_aux *aux;
struct rb_root root;
struct list_head list;
};
@@ -357,7 +357,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
* The first field must be a 64 bit integer at 0 offset.
*/
m = (struct btf_member *)(key_type + 1);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
return -EINVAL;
@@ -366,7 +366,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
*/
m++;
offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
- size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
return -EINVAL;
@@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_seq_show_elem = cgroup_storage_seq_show_elem,
};
-int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
@@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
spin_lock_bh(&map->lock);
- if (map->prog && map->prog != prog)
+ if (map->aux && map->aux != aux)
goto unlock;
- if (prog->aux->cgroup_storage[stype] &&
- prog->aux->cgroup_storage[stype] != _map)
+ if (aux->cgroup_storage[stype] &&
+ aux->cgroup_storage[stype] != _map)
goto unlock;
- map->prog = prog;
- prog->aux->cgroup_storage[stype] = _map;
+ map->aux = aux;
+ aux->cgroup_storage[stype] = _map;
ret = 0;
unlock:
spin_unlock_bh(&map->lock);
@@ -443,16 +443,16 @@ unlock:
return ret;
}
-void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
{
enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
spin_lock_bh(&map->lock);
- if (map->prog == prog) {
- WARN_ON(prog->aux->cgroup_storage[stype] != _map);
- map->prog = NULL;
- prog->aux->cgroup_storage[stype] = NULL;
+ if (map->aux == aux) {
+ WARN_ON(aux->cgroup_storage[stype] != _map);
+ map->aux = NULL;
+ aux->cgroup_storage[stype] = NULL;
}
spin_unlock_bh(&map->lock);
}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index ca52b9642943..d4f335a9a899 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
-struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
/* if a.value is negative, arithmetic shifting by minimum shift
* will have larger negative offset compared to more shifting.
* If a.value is nonnegative, arithmetic shifting by minimum shift
* will have larger positive offset compare to more shifting.
*/
- return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
}
struct tnum tnum_add(struct tnum a, struct tnum b)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7e89f1f49d77..23b0d5cfd47e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -3,6 +3,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
@@ -59,6 +60,60 @@ out:
return tr;
}
+static int is_ftrace_location(void *ip)
+{
+ long addr;
+
+ addr = ftrace_location((long)ip);
+ if (!addr)
+ return 0;
+ if (WARN_ON_ONCE(addr != (long)ip))
+ return -EFAULT;
+ return 1;
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+ return ret;
+}
+
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ return ret;
+}
+
+/* first time registering */
+static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ int ret;
+
+ ret = is_ftrace_location(ip);
+ if (ret < 0)
+ return ret;
+ tr->func.ftrace_managed = ret;
+
+ if (tr->func.ftrace_managed)
+ ret = register_ftrace_direct((long)ip, (long)new_addr);
+ else
+ ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ return ret;
+}
+
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
* bytes on x86. Pick a number to fit into PAGE_SIZE / 2
*/
@@ -77,8 +132,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
int err;
if (fentry_cnt + fexit_cnt == 0) {
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
- old_image, NULL);
+ err = unregister_fentry(tr, old_image);
tr->selector = 0;
goto out;
}
@@ -105,12 +159,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
if (tr->selector)
/* progs already running at this address */
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
- old_image, new_image);
+ err = modify_fentry(tr, old_image, new_image);
else
/* first time registering */
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL,
- new_image);
+ err = register_fentry(tr, new_image);
if (err)
goto out;
tr->selector++;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 034ef81f935b..7d530ce8719d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,7 +907,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(struct bpf_reg_state *reg);
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
/* Mark the unknown part of a register (variable offset or scalar value) as
* known to have the value @imm.
@@ -945,7 +946,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
__mark_reg_known_zero(regs + regno);
@@ -1054,7 +1055,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
}
/* Mark a register as having a completely unknown (scalar) value. */
-static void __mark_reg_unknown(struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
/*
* Clear type, id, off, and union(map_ptr, range) and
@@ -1064,6 +1066,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
+ reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
+ true : false;
__mark_reg_unbounded(reg);
}
@@ -1074,19 +1078,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- regs += regno;
- __mark_reg_unknown(regs);
- /* constant backtracking is enabled for root without bpf2bpf calls */
- regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
- true : false;
+ __mark_reg_unknown(env, regs + regno);
}
-static void __mark_reg_not_init(struct bpf_reg_state *reg)
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
reg->type = NOT_INIT;
}
@@ -1097,10 +1098,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
return;
}
- __mark_reg_not_init(regs + regno);
+ __mark_reg_not_init(env, regs + regno);
}
#define DEF_NOT_SUBREG (0)
@@ -3234,7 +3235,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
- __mark_reg_unknown(&state->stack[spi].spilled_ptr);
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
state->stack[spi].slot_type[j] = STACK_MISC;
goto mark;
@@ -3892,7 +3893,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -3920,7 +3921,7 @@ static void release_reg_references(struct bpf_verifier_env *env,
if (!reg)
continue;
if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(reg);
+ __mark_reg_unknown(env, reg);
}
}
@@ -4134,6 +4135,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_map *map = meta->map_ptr;
struct tnum range;
u64 val;
+ int err;
if (func_id != BPF_FUNC_tail_call)
return 0;
@@ -4150,6 +4152,10 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
+ err = mark_chain_precision(env, BPF_REG_3);
+ if (err)
+ return err;
+
val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
@@ -4577,7 +4583,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -4829,13 +4835,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Taint dst register if offset had invalid bounds derived from
* e.g. dead branches.
*/
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
if (!src_known &&
opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
- __mark_reg_unknown(dst_reg);
+ __mark_reg_unknown(env, dst_reg);
return 0;
}
@@ -5043,9 +5049,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* Upon reaching here, src_known is true and
* umax_val is equal to umin_val.
*/
- dst_reg->smin_value >>= umin_val;
- dst_reg->smax_value >>= umin_val;
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+ if (insn_bitness == 32) {
+ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
+ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
+ } else {
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ }
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
+ insn_bitness);
/* blow away the dst_reg umin_value/umax_value and rely on
* dst_reg var_off to refine the result.
@@ -6258,6 +6271,7 @@ static bool may_access_skb(enum bpf_prog_type type)
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -6291,7 +6305,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(env, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
if (err)
return err;
@@ -6310,7 +6324,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
@@ -6323,6 +6337,10 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6977,7 +6995,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* since the register is unused, clear its state
* to make further comparison simpler
*/
- __mark_reg_not_init(&st->regs[i]);
+ __mark_reg_not_init(env, &st->regs[i]);
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
@@ -6985,7 +7003,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
/* liveness must not touch this stack slot anymore */
st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
if (!(live & REG_LIVE_READ)) {
- __mark_reg_not_init(&st->stack[i].spilled_ptr);
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
}
@@ -8268,7 +8286,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog, map)) {
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
verbose(env, "only one cgroup storage of each type is allowed\n");
fdput(f);
return -EBUSY;
@@ -8298,18 +8316,8 @@ next_insn:
/* drop refcnt of maps used by the rejected program */
static void release_maps(struct bpf_verifier_env *env)
{
- enum bpf_cgroup_storage_type stype;
- int i;
-
- for_each_cgroup_storage_type(stype) {
- if (!env->prog->aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(env->prog,
- env->prog->aux->cgroup_storage[stype]);
- }
-
- for (i = 0; i < env->used_map_cnt; i++)
- bpf_map_put(env->used_maps[i]);
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
+ env->used_map_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -9282,7 +9290,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (prog->jit_requested && !expect_blinding &&
+ if (env->allow_ptr_leaks && !expect_blinding &&
+ prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
!bpf_map_ptr_unpriv(aux)) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..1e12e6928bca 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3055,8 +3055,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
@@ -3066,6 +3064,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp)
return PTR_ERR(css);
}
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
@@ -3101,11 +3101,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
- WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
-
if (!css)
continue;
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
@@ -3392,7 +3392,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn, false);
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b48b22d4deb6..6f87352f8219 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -33,7 +33,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
return;
/*
- * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
+ * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
* see NULL updated_next or they see our updated stat.
*/
smp_mb();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59cc980adad..4dc279ed3b2d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1909,6 +1909,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
EXPORT_SYMBOL(__cpuhp_remove_state);
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -2063,77 +2135,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static void cpuhp_offline_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = true;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
-}
-
-static void cpuhp_online_cpu_device(unsigned int cpu)
-{
- struct device *dev = get_cpu_device(cpu);
-
- dev->offline = false;
- /* Tell user space about the state change */
- kobject_uevent(&dev->kobj, KOBJ_ONLINE);
-}
-
-int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- for_each_online_cpu(cpu) {
- if (topology_is_primary_thread(cpu))
- continue;
- ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
- if (ret)
- break;
- /*
- * As this needs to hold the cpu maps lock it's impossible
- * to call device_offline() because that ends up calling
- * cpu_down() which takes cpu maps lock. cpu maps lock
- * needs to be held as this might race against in kernel
- * abusers of the hotplug machinery (thermal management).
- *
- * So nothing would update device:offline state. That would
- * leave the sysfs entry stale and prevent onlining after
- * smt control has been changed to 'off' again. This is
- * called under the sysfs hotplug lock, so it is properly
- * serialized against the regular offline usage.
- */
- cpuhp_offline_cpu_device(cpu);
- }
- if (!ret)
- cpu_smt_control = ctrlval;
- cpu_maps_update_done();
- return ret;
-}
-
-int cpuhp_smt_enable(void)
-{
- int cpu, ret = 0;
-
- cpu_maps_update_begin();
- cpu_smt_control = CPU_SMT_ENABLED;
- for_each_present_cpu(cpu) {
- /* Skip online CPUs and CPUs on offline nodes */
- if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
- continue;
- ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
- if (ret)
- break;
- /* See comment in cpuhp_smt_disable() */
- cpuhp_online_cpu_device(cpu);
- }
- cpu_maps_update_done();
- return ret;
-}
-
-
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
diff --git a/kernel/cred.c b/kernel/cred.c
index c0a4c12d38b2..809a985b1793 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk)
put_cred(cred);
#ifdef CONFIG_KEYS_REQUEST_CACHE
- key_put(current->cached_requested_key);
- current->cached_requested_key = NULL;
+ key_put(tsk->cached_requested_key);
+ tsk->cached_requested_key = NULL;
#endif
}
@@ -223,7 +223,7 @@ struct cred *cred_alloc_blank(void)
new->magic = CRED_MAGIC;
#endif
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -282,7 +282,7 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
@@ -715,7 +715,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4ff86d57f9e5..2173c23c25b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10523,7 +10523,7 @@ again:
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
goto err_locked;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
diff --git a/kernel/exit.c b/kernel/exit.c
index bcbd59888e67..2833ffb0c211 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -517,10 +517,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
- }
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -766,6 +762,14 @@ void __noreturn do_exit(long code)
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a useable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2508a4f238a3..ef82feb4bddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process(
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
u64 clone_flags = args->flags;
+ struct nsproxy *nsp = current->nsproxy;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process(
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
- (task_active_pid_ns(current) !=
- current->nsproxy->pid_ns_for_children))
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * If the new process will be in a different time namespace
+ * do not allow it to share VM or a thread group with the forking task.
+ */
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
@@ -2578,6 +2587,16 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#endif
#ifdef __ARCH_WANT_SYS_CLONE3
+
+/*
+ * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
+ * the registers containing the syscall arguments for clone. This doesn't work
+ * with clone3 since the TLS value is passed in clone_args instead.
+ */
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+#error clone3 requires copy_thread_tls support in arch
+#endif
+
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -2811,7 +2830,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWTIME))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..0cf84c8664f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1178,6 +1178,7 @@ out_error:
/**
* wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
* @exiting: Pointer to the exiting task
*
* Caller must hold a refcount on @exiting.
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c7ca2e983a5..02236b13b359 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -12,6 +12,7 @@
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/irq.h>
+#include <linux/sched/isolation.h>
#include "internals.h"
@@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void)
}
}
+static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
+{
+ const struct cpumask *hk_mask;
+
+ if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ return false;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
+ return false;
+
+ return cpumask_test_cpu(cpu, hk_mask);
+}
+
static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
@@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
/*
* If the interrupt can only be directed to a single target
* CPU then it is already assigned to a CPU in the affinity
- * mask. No point in trying to move it around.
+ * mask. No point in trying to move it around unless the
+ * isolation mechanism requests to move it to an upcoming
+ * housekeeping CPU.
*/
- if (!irqd_is_single_target(data))
+ if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
irq_set_affinity_locked(data, affinity, false);
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 5b8fdd659e54..98a5f10d1900 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -891,6 +891,7 @@ __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
}
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+ __releases(&desc->lock)
{
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (bus)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dd822fd8a7d5..7527e5ef6fe5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -987,6 +987,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
/**
+ * irq_domain_translate_onecell() - Generic translate for direct one cell
+ * bindings
+ */
+int irq_domain_translate_onecell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 1))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
+
+/**
* irq_domain_translate_twocell() - Generic translate for direct two cell
* bindings
*
@@ -1459,6 +1476,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (rv) {
/* Restore the original irq_data. */
*root_irq_data = *child_irq_data;
+ kfree(child_irq_data);
goto error;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1753486b440c..818b2802d3e7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
@@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- ret = chip->irq_set_affinity(data, mask, force);
+ /*
+ * If this is a managed interrupt and housekeeping is enabled on
+ * it check whether the requested affinity mask intersects with
+ * a housekeeping CPU. If so, then remove the isolated CPUs from
+ * the mask and just keep the housekeeping CPU(s). This prevents
+ * the affinity setter from routing the interrupt to an isolated
+ * CPU to avoid that I/O submitted from a housekeeping CPU causes
+ * interrupts on an isolated one.
+ *
+ * If the masks do not intersect or include online CPU(s) then
+ * keep the requested mask. The isolated target CPUs are only
+ * receiving interrupts when the I/O operation was submitted
+ * directly from them.
+ *
+ * If all housekeeping CPUs in the affinity mask are offline, the
+ * interrupt will be migrated by the CPU hotplug code once a
+ * housekeeping CPU which belongs to the affinity mask comes
+ * online.
+ */
+ if (irqd_affinity_is_managed(data) &&
+ housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask, *prog_mask;
+
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+
+ raw_spin_lock(&tmp_mask_lock);
+ cpumask_and(&tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
+ prog_mask = mask;
+ else
+ prog_mask = &tmp_mask;
+ ret = chip->irq_set_affinity(data, prog_mask, force);
+ raw_spin_unlock(&tmp_mask_lock);
+ } else {
+ ret = chip->irq_set_affinity(data, mask, force);
+ }
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -1500,8 +1539,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
- pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
- irq);
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
+ new->name, irq);
ret = -EINVAL;
goto out_unlock;
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2ed97a7c9b2a..f865e5f4d382 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -34,6 +34,7 @@ static atomic_t irq_poll_active;
* true and let the handler run.
*/
bool irq_wait_for_poll(struct irq_desc *desc)
+ __must_hold(&desc->lock)
{
if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
"irq poll in progress on cpu %d for irq %d\n",
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bc933c0db9bf..f977786fe498 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -159,6 +159,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/* Install the new kernel and uninstall the old */
image = xchg(dest_image, image);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 15d70a90b50d..c19c0dad1ebe 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -589,6 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->unusable_pages);
}
+
+int __weak machine_kexec_post_load(struct kimage *image)
+{
+ return 0;
+}
+
void kimage_terminate(struct kimage *image)
{
if (*image->entry != 0)
@@ -1171,7 +1177,7 @@ int kernel_kexec(void)
* CPU hotplug again; so re-enable it here.
*/
cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
+ pr_notice("Starting new kernel\n");
machine_shutdown();
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index a2df93948665..faa74d5f6941 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -441,6 +441,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
kimage_terminate(image);
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
+
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 48aaf2ac0d0d..39d30ccf8d87 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,6 +13,8 @@ void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
+int machine_kexec_post_load(struct kimage *image);
+
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 32282e7112d3..32406ef0d6a2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void)
struct lock_trace *trace, *t2;
struct hlist_head *hash_head;
u32 hash;
- unsigned int max_entries;
+ int max_entries;
BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
@@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void)
trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
LOCK_TRACE_SIZE_IN_LONGS;
- trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES -
- LOCK_TRACE_SIZE_IN_LONGS - 1) {
+ if (max_entries <= 0) {
if (!debug_locks_off_graph_unlock())
return NULL;
@@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void)
return NULL;
}
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
hash = jhash(trace->entries, trace->nr_entries *
sizeof(trace->entries[0]), 0);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 54cc5f9286e9..5352ce50a97e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -733,9 +733,6 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
void __sched mutex_unlock(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_MUTEXES
- WARN_ON(in_interrupt());
-#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
if (__mutex_unlock_fast(lock))
return;
@@ -1416,7 +1413,6 @@ int __sched mutex_trylock(struct mutex *lock)
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- WARN_ON(in_interrupt());
#endif
locked = __mutex_trylock(lock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44e68761f432..0d9b6be9ecc8 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1226,8 +1226,8 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if ((wstate == WRITER_HANDOFF) &&
- (rwsem_spin_on_owner(sem, 0) == OWNER_NULL))
+ if (wstate == WRITER_HANDOFF &&
+ rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
goto trylock_again;
/* Block until there are no active lockers. */
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 399669f7eba8..472dd462a40c 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -51,19 +51,19 @@ EXPORT_SYMBOL(__rwlock_init);
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +80,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,8 +99,8 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
/*
@@ -187,8 +187,8 @@ static inline void debug_write_lock_before(rwlock_t *lock)
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -197,8 +197,8 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
void do_raw_write_lock(rwlock_t *lock)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c815f58e6bc0..ed9882108cd2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#ifdef CONFIG_TIME_NS
+ .time_ns = &init_time_ns,
+ .time_ns_for_children = &init_time_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_net;
}
+ new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+ tsk->nsproxy->time_ns_for_children);
+ if (IS_ERR(new_nsp->time_ns_for_children)) {
+ err = PTR_ERR(new_nsp->time_ns_for_children);
+ goto out_time;
+ }
+ new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
return new_nsp;
+out_time:
+ put_net(new_nsp->net_ns);
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
@@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
+ int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
- get_nsproxy(old_ns);
- return 0;
- }
-
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+ } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
+ ret = timens_on_fork(new_ns, tsk);
+ if (ret) {
+ free_nsproxy(new_ns);
+ return ret;
+ }
+
tsk->nsproxy = new_ns;
return 0;
}
@@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->time_ns)
+ put_time_ns(ns->time_ns);
+ if (ns->time_ns_for_children)
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
@@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d3667b4075c1..7cbfbeacd68a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,7 +27,10 @@ config SUSPEND_SKIP_SYNC
Skip the kernel sys_sync() before freezing user processes.
Some systems prefer not to pay this cost on every invocation
of suspend, or they are content with invoking sync() from
- user-space before invoking suspend. Say Y if that's your case.
+ user-space before invoking suspend. There's a run-time switch
+ at '/sys/power/sync_on_suspend' to configure this behaviour.
+ This setting changes the default for the run-tim switch. Say Y
+ to change the default to disable the kernel sys_sync().
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 3c0a5a8170b0..6dbeedb7354c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,7 +9,7 @@
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/export.h>
#include <linux/suspend.h>
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
- pr_info("hibernation debug: Waiting for 5 seconds.\n");
+ pr_info("debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- pr_err("Some devices failed to power down, aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting\n");
return error;
}
@@ -295,7 +295,7 @@ static int create_image(int platform_mode)
error = syscore_suspend();
if (error) {
- pr_err("Some system devices failed to power down, aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting\n");
goto Enable_irqs;
}
@@ -310,7 +310,7 @@ static int create_image(int platform_mode)
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
- pr_err("Error %d creating hibernation image\n", error);
+ pr_err("Error %d creating image\n", error);
if (!in_suspend) {
events_check_enabled = false;
@@ -680,7 +680,7 @@ static int load_image_and_restore(void)
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
- pr_err("Failed to load hibernation image, recovering.\n");
+ pr_err("Failed to load image, recovering.\n");
swsusp_free();
free_basic_memory_bitmaps();
Unlock:
@@ -743,7 +743,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pm_pr_dbg("Writing image.\n");
+ pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -755,7 +755,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pm_pr_dbg("Image restored successfully.\n");
+ pm_pr_dbg("Hibernation image restored successfully.\n");
}
Free_bitmaps:
@@ -894,7 +894,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pm_pr_dbg("Preparing processes for restore.\n");
+ pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -903,7 +903,7 @@ static int software_resume(void)
Finish:
__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
- pr_info("resume from hibernation failed (%d)\n", error);
+ pr_info("resume failed (%d)\n", error);
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
Unlock:
@@ -1068,7 +1068,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device);
+ pm_pr_dbg("Configured hibernation resume from disk to %u\n",
+ swsusp_resume_device);
noresume = 0;
software_resume();
return n;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index e26de7af520b..69b7a8aeca3b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -190,6 +190,38 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
}
power_attr(mem_sleep);
+
+/*
+ * sync_on_suspend: invoke ksys_sync_helper() before suspend.
+ *
+ * show() returns whether ksys_sync_helper() is invoked before suspend.
+ * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it.
+ */
+bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
+
+static ssize_t sync_on_suspend_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", sync_on_suspend_enabled);
+}
+
+static ssize_t sync_on_suspend_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ sync_on_suspend_enabled = !!val;
+ return n;
+}
+
+power_attr(sync_on_suspend);
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -855,6 +887,7 @@ static struct attribute * g[] = {
&wakeup_count_attr.attr,
#ifdef CONFIG_SUSPEND
&mem_sleep_attr.attr,
+ &sync_on_suspend_attr.attr,
#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 26b9168321e7..ddade80ad276 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -8,7 +8,7 @@
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*/
-#define pr_fmt(fmt) "PM: " fmt
+#define pr_fmt(fmt) "PM: hibernation: " fmt
#include <linux/version.h>
#include <linux/module.h>
@@ -1147,24 +1147,24 @@ void free_basic_memory_bitmaps(void)
void clear_free_pages(void)
{
-#ifdef CONFIG_PAGE_POISONING_ZERO
struct memory_bitmap *bm = free_pages_map;
unsigned long pfn;
if (WARN_ON(!(free_pages_map)))
return;
- memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm);
- while (pfn != BM_END_OF_MAP) {
- if (pfn_valid(pfn))
- clear_highpage(pfn_to_page(pfn));
-
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+ memory_bm_position_reset(bm);
pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_highpage(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("free pages cleared after restore\n");
}
- memory_bm_position_reset(bm);
- pr_info("free pages cleared after restore\n");
-#endif /* PAGE_POISONING_ZERO */
}
/**
@@ -1566,9 +1566,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
- x *= multiplier;
- do_div(x, base);
- return (unsigned long)x;
+ return div64_u64(x * multiplier, base);
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
@@ -1705,16 +1703,20 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- pr_info("Preallocating image memory... ");
+ pr_info("Preallocating image memory\n");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate original bitmap\n");
goto err_out;
+ }
error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate copy bitmap\n");
goto err_out;
+ }
alloc_normal = 0;
alloc_highmem = 0;
@@ -1804,8 +1806,11 @@ int hibernate_preallocate_memory(void)
alloc -= pages;
pages += pages_highmem;
pages_highmem = preallocate_image_highmem(alloc);
- if (pages_highmem < alloc)
+ if (pages_highmem < alloc) {
+ pr_err("Image allocation is %lu pages short\n",
+ alloc - pages_highmem);
goto err_out;
+ }
pages += pages_highmem;
/*
* size is the desired number of saveable pages to leave in
@@ -1836,13 +1841,12 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- pr_cont("done (allocated %lu pages)\n", pages);
+ pr_info("Allocated %lu pages for snapshot\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- pr_cont("\n");
swsusp_free();
return -ENOMEM;
}
@@ -1976,7 +1980,7 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- pr_info("Creating hibernation image:\n");
+ pr_info("Creating image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
@@ -2010,7 +2014,7 @@ asmlinkage __visible int swsusp_save(void)
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
+ pr_info("Image created (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f3b7239f1892..2c47280fbfc7 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -564,7 +564,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_TO_IDLE)
s2idle_begin();
- if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
+ if (sync_on_suspend_enabled) {
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 60564b58de07..e1ed58adb69e 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -70,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
- unsigned long now;
+ time64_t now;
struct rtc_wkalrm alm;
int status;
@@ -81,10 +81,10 @@ repeat:
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
- rtc_tm_to_time(&alm.time, &now);
+ now = rtc_tm_to_time64(&alm.time);
memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index cb9ddcc08119..43d6179508d6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
+ unsigned int mode)
{
+ int ret;
+
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
+
+ return ret == 0;
}
/* Returns 0 on success, -errno on denial. */
@@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(tcred->user_ns, mode))
+ if (ptrace_has_cap(cred, tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -340,7 +345,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(mm->user_ns, mode)))
+ !ptrace_has_cap(cred, mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 27c48eb7de40..a4f86a9d6937 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int ret;
if (flags & RSEQ_FLAG_UNREGISTER) {
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index b5dcd1d83c7f..7c2fe50fd76d 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <linux/cpufreq.h>
+
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
@@ -57,3 +59,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 322ca8860f54..9b8916fd00a2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -82,12 +82,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_this_cpu_can_update(sg_policy->policy))
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08a233e97a01..ba749f579714 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7328,7 +7328,14 @@ static int detach_tasks(struct lb_env *env)
load < 16 && !env->sd->nr_balance_failed)
goto next;
- if (load/2 > env->imbalance)
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let relax the constraint if
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (load/2 > env->imbalance &&
+ env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
goto next;
env->imbalance -= load;
@@ -8417,6 +8424,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (!idlest)
return NULL;
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
/*
* If the local group is idler than the selected idlest group
* don't try and push the task.
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 9fcb2a695a41..008d6ac2342b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -163,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
continue;
}
+ if (!strncmp(str, "managed_irq,", 12)) {
+ str += 12;
+ flags |= HK_FLAG_MANAGED_IRQ;
+ continue;
+ }
+
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 517e3719027e..ce8f6748678a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -185,7 +185,8 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->avg_next_update = sched_clock() + psi_period;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
@@ -481,7 +482,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
u32 remaining;
remaining = win->size - elapsed;
- growth += div_u64(win->prev_growth * remaining, win->size);
+ growth += div64_u64(win->prev_growth * remaining, win->size);
}
return growth;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 12d2227e5786..b6ea3dcb57bf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1026,6 +1026,13 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
struct seccomp_notif unotif;
ssize_t ret;
+ /* Verify that we're not given garbage to keep struct extensible. */
+ ret = check_zeroed_user(buf, sizeof(unotif));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return -EINVAL;
+
memset(&unotif, 0, sizeof(unotif));
ret = down_interruptible(&filter->notif->request);
diff --git a/kernel/smp.c b/kernel/smp.c
index 7dbcb402c2fc..3b7bedc97af3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -395,22 +395,9 @@ call:
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
-/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. Preemption
- * must be disabled when calling this function.
- */
-void smp_call_function_many(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
+static void smp_call_function_many_cond(const struct cpumask *mask,
+ smp_call_func_t func, void *info,
+ bool wait, smp_cond_func_t cond_func)
{
struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
@@ -448,7 +435,8 @@ void smp_call_function_many(const struct cpumask *mask,
/* Fastpath: do that cpu by itself. */
if (next_cpu >= nr_cpu_ids) {
- smp_call_function_single(cpu, func, info, wait);
+ if (!cond_func || (cond_func && cond_func(cpu, info)))
+ smp_call_function_single(cpu, func, info, wait);
return;
}
@@ -465,6 +453,9 @@ void smp_call_function_many(const struct cpumask *mask,
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
+ if (cond_func && !cond_func(cpu, info))
+ continue;
+
csd_lock(csd);
if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS;
@@ -486,6 +477,26 @@ void smp_call_function_many(const struct cpumask *mask,
}
}
}
+
+/**
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+void smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ smp_call_function_many_cond(mask, func, info, wait, NULL);
+}
EXPORT_SYMBOL(smp_call_function_many);
/**
@@ -668,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
- * @gfp_flags: GFP flags to use when allocating the cpumask
- * used internally by the function.
- *
- * The function might sleep if the GFP flags indicates a non
- * atomic allocation is allowed.
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
@@ -680,46 +686,27 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
- cpumask_var_t cpus;
- int cpu, ret;
-
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
-
- if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info))
- __cpumask_set_cpu(cpu, cpus);
- on_each_cpu_mask(cpus, func, info, wait);
- preempt_enable();
- free_cpumask_var(cpus);
- } else {
- /*
- * No free cpumask, bother. No matter, we'll
- * just have to IPI them one by one.
- */
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info)) {
- ret = smp_call_function_single(cpu, func,
- info, wait);
- WARN_ON_ONCE(ret);
- }
- preempt_enable();
+ int cpu = get_cpu();
+
+ smp_call_function_many_cond(mask, func, info, wait, cond_func);
+ if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
}
+ put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
- cpu_online_mask);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13a0f2e6ebc2..e2ac0e37c4ae 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -554,25 +554,33 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
- struct taskstats *stats;
+ struct taskstats *stats_new, *stats;
- if (sig->stats || thread_group_empty(tsk))
- goto ret;
+ /* Pairs with smp_store_release() below. */
+ stats = smp_load_acquire(&sig->stats);
+ if (stats || thread_group_empty(tsk))
+ return stats;
/* No problem if kmem_cache_zalloc() fails */
- stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
spin_lock_irq(&tsk->sighand->siglock);
- if (!sig->stats) {
- sig->stats = stats;
- stats = NULL;
+ stats = sig->stats;
+ if (!stats) {
+ /*
+ * Pairs with smp_store_release() above and order the
+ * kmem_cache_zalloc().
+ */
+ smp_store_release(&sig->stats, stats_new);
+ stats = stats_new;
+ stats_new = NULL;
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (stats)
- kmem_cache_free(taskstats_cache, stats);
-ret:
- return sig->stats;
+ if (stats_new)
+ kmem_cache_free(taskstats_cache, stats_new);
+
+ return stats;
}
/* Send pid data out on exit */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1867044800bb..c8f00168afe8 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 451f9d05ccfe..2ffb466af77e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/module.h>
+#include <linux/time_namespace.h>
#include "posix-timers.h"
@@ -36,13 +37,15 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @gettime: Function to read the time correlating to the base
+ * @get_ktime: Function to read the time correlating to the base
+ * @get_timespec: Function to read the namespace time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- ktime_t (*gettime)(void);
+ ktime_t (*get_ktime)(void);
+ void (*get_timespec)(struct timespec64 *tp);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -55,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
#ifdef CONFIG_RTC_CLASS
-static struct wakeup_source *ws;
-
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
@@ -66,8 +67,6 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
- * If one has not already been chosen, it checks to see if a
- * functional rtc device is available.
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
@@ -87,7 +86,8 @@ static int alarmtimer_rtc_add_device(struct device *dev,
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
- struct wakeup_source *__ws;
+ struct platform_device *pdev;
+ int ret = 0;
if (rtcdev)
return -EBUSY;
@@ -97,26 +97,31 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
- __ws = wakeup_source_register(dev, "alarmtimer");
+ pdev = platform_device_register_data(dev, "alarmtimer",
+ PLATFORM_DEVID_AUTO, NULL, 0);
+ if (!IS_ERR(pdev))
+ device_init_wakeup(&pdev->dev, true);
spin_lock_irqsave(&rtcdev_lock, flags);
- if (!rtcdev) {
+ if (!IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
- spin_unlock_irqrestore(&rtcdev_lock, flags);
- return -1;
+ ret = -1;
+ goto unlock;
}
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
- ws = __ws;
- __ws = NULL;
+ pdev = NULL;
+ } else {
+ ret = -1;
}
+unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
- wakeup_source_unregister(__ws);
+ platform_device_unregister(pdev);
- return 0;
+ return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
@@ -138,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-struct rtc_device *alarmtimer_get_rtcdev(void)
-{
- return NULL;
-}
-#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
@@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
spin_unlock_irqrestore(&base->lock, flags);
if (alarm->function)
- restart = alarm->function(alarm, base->gettime());
+ restart = alarm->function(alarm, base->get_ktime());
spin_lock_irqsave(&base->lock, flags);
if (restart != ALARMTIMER_NORESTART) {
@@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_fired(alarm, base->gettime());
+ trace_alarmtimer_fired(alarm, base->get_ktime());
return ret;
}
@@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
ktime_t alarm_expires_remaining(const struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return ktime_sub(alarm->node.expires, base->gettime());
+ return ktime_sub(alarm->node.expires, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_expires_remaining);
@@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev)
spin_unlock_irqrestore(&base->lock, flags);
if (!next)
continue;
- delta = ktime_sub(next->expires, base->gettime());
+ delta = ktime_sub(next->expires, base->get_ktime());
if (!min || (delta < min)) {
expires = next->expires;
min = delta;
@@ -281,7 +281,7 @@ static int alarmtimer_suspend(struct device *dev)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
- __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
return -EBUSY;
}
@@ -296,7 +296,7 @@ static int alarmtimer_suspend(struct device *dev)
/* Set alarm, if in the past reject suspend briefly to handle */
ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
- __pm_wakeup_event(ws, MSEC_PER_SEC);
+ pm_wakeup_event(dev, MSEC_PER_SEC);
return ret;
}
@@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start)
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_start(alarm, base->gettime());
+ trace_alarmtimer_start(alarm, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add_safe(start, base->gettime());
+ start = ktime_add_safe(start, base->get_ktime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_cancel(alarm, base->gettime());
+ trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
@@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return alarm_forward(alarm, base->gettime(), interval);
+ return alarm_forward(alarm, base->get_ktime(), interval);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
return;
}
- delta = ktime_sub(absexp, base->gettime());
+ delta = ktime_sub(absexp, base->get_ktime());
spin_lock_irqsave(&freezer_delta_lock, flags);
if (!freezer_delta || (delta < freezer_delta)) {
@@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
struct alarm_base *base = &alarm_bases[alarm->type];
if (!absolute)
- expires = ktime_add_safe(expires, base->gettime());
+ expires = ktime_add_safe(expires, base->get_ktime());
if (sigev_none)
alarm->node.expires = expires;
else
@@ -657,24 +657,41 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
}
/**
- * alarm_clock_get - posix clock_get interface
+ * alarm_clock_get_timespec - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
- * Provides the underlying alarm base time.
+ * Provides the underlying alarm base time in a tasks time namespace.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
+static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec64(base->gettime());
+ base->get_timespec(tp);
+
return 0;
}
/**
+ * alarm_clock_get_ktime - posix clock_get_ktime interface
+ * @which_clock: clockid
+ *
+ * Provides the underlying alarm base time in the root namespace.
+ */
+static ktime_t alarm_clock_get_ktime(clockid_t which_clock)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ return base->get_ktime();
+}
+
+/**
* alarm_timer_create - posix timer_create interface
* @new_timer: k_itimer pointer to manage
*
@@ -747,7 +764,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
struct timespec64 rmt;
ktime_t rem;
- rem = ktime_sub(absexp, alarm_bases[type].gettime());
+ rem = ktime_sub(absexp, alarm_bases[type].get_ktime());
if (rem <= 0)
return 0;
@@ -816,9 +833,11 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
- ktime_t now = alarm_bases[type].gettime();
+ ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
+ } else {
+ exp = timens_ktime_to_host(which_clock, exp);
}
ret = alarmtimer_do_nsleep(&alarm, exp, type);
@@ -837,7 +856,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
- .clock_get = alarm_clock_get,
+ .clock_get_ktime = alarm_clock_get_ktime,
+ .clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
@@ -866,6 +886,12 @@ static struct platform_driver alarmtimer_driver = {
}
};
+static void get_boottime_timespec(struct timespec64 *tp)
+{
+ ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
+}
+
/**
* alarmtimer_init - Initialize alarm timer code
*
@@ -874,17 +900,18 @@ static struct platform_driver alarmtimer_driver = {
*/
static int __init alarmtimer_init(void)
{
- struct platform_device *pdev;
- int error = 0;
+ int error;
int i;
alarmtimer_rtc_timer_init();
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
- alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64,
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
- alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
@@ -898,15 +925,7 @@ static int __init alarmtimer_init(void)
if (error)
goto out_if;
- pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
- if (IS_ERR(pdev)) {
- error = PTR_ERR(pdev);
- goto out_drv;
- }
return 0;
-
-out_drv:
- platform_driver_unregister(&alarmtimer_driver);
out_if:
alarmtimer_rtc_interface_remove();
return error;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8de90ea31280..3a609e7344f3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
- unsigned long flags)
+ unsigned long flags) __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
return ret;
}
-long hrtimer_nanosleep(const struct timespec64 *rqtp,
- const enum hrtimer_mode mode, const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
+ const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
@@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
@@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
+ CLOCK_MONOTONIC);
}
#endif
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
new file mode 100644
index 000000000000..12858507d75a
--- /dev/null
+++ b/kernel/time/namespace.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/time_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/seq_file.h>
+#include <linux/proc_ns.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+
+#include <vdso/datapage.h>
+
+ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
+ struct timens_offsets *ns_offsets)
+{
+ ktime_t offset;
+
+ switch (clockid) {
+ case CLOCK_MONOTONIC:
+ offset = timespec64_to_ktime(ns_offsets->monotonic);
+ break;
+ case CLOCK_BOOTTIME:
+ case CLOCK_BOOTTIME_ALARM:
+ offset = timespec64_to_ktime(ns_offsets->boottime);
+ break;
+ default:
+ return tim;
+ }
+
+ /*
+ * Check that @tim value is in [offset, KTIME_MAX + offset]
+ * and subtract offset.
+ */
+ if (tim < offset) {
+ /*
+ * User can specify @tim *absolute* value - if it's lesser than
+ * the time namespace's offset - it's already expired.
+ */
+ tim = 0;
+ } else {
+ tim = ktime_sub(tim, offset);
+ if (unlikely(tim > KTIME_MAX))
+ tim = KTIME_MAX;
+ }
+
+ return tim;
+}
+
+static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
+}
+
+static void dec_time_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
+}
+
+/**
+ * clone_time_ns - Clone a time namespace
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * Clone @old_ns and set the clone refcount to 1
+ *
+ * Return: The new namespace or ERR_PTR.
+ */
+static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
+ struct time_namespace *old_ns)
+{
+ struct time_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ err = -ENOSPC;
+ ucounts = inc_time_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns)
+ goto fail_dec;
+
+ kref_init(&ns->kref);
+
+ ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!ns->vvar_page)
+ goto fail_free;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto fail_free_page;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &timens_operations;
+ ns->user_ns = get_user_ns(user_ns);
+ ns->offsets = old_ns->offsets;
+ ns->frozen_offsets = false;
+ return ns;
+
+fail_free_page:
+ __free_page(ns->vvar_page);
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_time_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
+}
+
+/**
+ * copy_time_ns - Create timens_for_children from @old_ns
+ * @flags: Cloning flags
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
+ * adds a refcounter to @old_ns otherwise.
+ *
+ * Return: timens_for_children namespace or ERR_PTR.
+ */
+struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWTIME))
+ return get_time_ns(old_ns);
+
+ return clone_time_ns(user_ns, old_ns);
+}
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_data->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces
+ * the time namespace handling path.
+ */
+static void timens_setup_vdso_data(struct vdso_data *vdata,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vdata->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vdata->seq = 1;
+ vdata->clock_mode = VCLOCK_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+static DEFINE_MUTEX(offset_lock);
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_data *vdata;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ mutex_lock(&offset_lock);
+ /* Nothing to-do: vvar_page has been already initialized. */
+ if (ns->frozen_offsets)
+ goto out;
+
+ ns->frozen_offsets = true;
+ vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_data(&vdata[i], ns);
+
+out:
+ mutex_unlock(&offset_lock);
+}
+
+void free_time_ns(struct kref *kref)
+{
+ struct time_namespace *ns;
+
+ ns = container_of(kref, struct time_namespace, kref);
+ dec_time_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ __free_page(ns->vvar_page);
+ kfree(ns);
+}
+
+static struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+
+static struct ns_common *timens_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static struct ns_common *timens_for_children_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void timens_put(struct ns_common *ns)
+{
+ put_time_ns(to_time_ns(ns));
+}
+
+static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct time_namespace *ns = to_time_ns(new);
+ int err;
+
+ if (!current_is_single_threaded())
+ return -EUSERS;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ timens_set_vvar_page(current, ns);
+
+ err = vdso_join_timens(current, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns_for_children);
+ nsproxy->time_ns_for_children = ns;
+ return 0;
+}
+
+int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+ struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
+ struct time_namespace *ns = to_time_ns(nsc);
+ int err;
+
+ /* create_new_namespaces() already incremented the ref counter */
+ if (nsproxy->time_ns == nsproxy->time_ns_for_children)
+ return 0;
+
+ timens_set_vvar_page(tsk, ns);
+
+ err = vdso_join_timens(tsk, ns);
+ if (err)
+ return err;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ return 0;
+}
+
+static struct user_namespace *timens_owner(struct ns_common *ns)
+{
+ return to_time_ns(ns)->user_ns;
+}
+
+static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
+{
+ seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec);
+}
+
+void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return;
+ time_ns = to_time_ns(ns);
+
+ show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
+ show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
+ put_time_ns(time_ns);
+}
+
+int proc_timens_set_offset(struct file *file, struct task_struct *p,
+ struct proc_timens_offset *offsets, int noffsets)
+{
+ struct ns_common *ns;
+ struct time_namespace *time_ns;
+ struct timespec64 tp;
+ int i, err;
+
+ ns = timens_for_children_get(p);
+ if (!ns)
+ return -ESRCH;
+ time_ns = to_time_ns(ns);
+
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
+ put_time_ns(time_ns);
+ return -EPERM;
+ }
+
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(&tp);
+ break;
+ case CLOCK_BOOTTIME:
+ ktime_get_boottime_ts64(&tp);
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ERANGE;
+
+ if (off->val.tv_sec > KTIME_SEC_MAX ||
+ off->val.tv_sec < -KTIME_SEC_MAX)
+ goto out;
+
+ tp = timespec64_add(tp, off->val);
+ /*
+ * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
+ * still unreachable.
+ */
+ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
+ goto out;
+ }
+
+ mutex_lock(&offset_lock);
+ if (time_ns->frozen_offsets) {
+ err = -EACCES;
+ goto out_unlock;
+ }
+
+ err = 0;
+ /* Don't report errors after this line */
+ for (i = 0; i < noffsets; i++) {
+ struct proc_timens_offset *off = &offsets[i];
+ struct timespec64 *offset = NULL;
+
+ switch (off->clockid) {
+ case CLOCK_MONOTONIC:
+ offset = &time_ns->offsets.monotonic;
+ break;
+ case CLOCK_BOOTTIME:
+ offset = &time_ns->offsets.boottime;
+ break;
+ }
+
+ *offset = off->val;
+ }
+
+out_unlock:
+ mutex_unlock(&offset_lock);
+out:
+ put_time_ns(time_ns);
+
+ return err;
+}
+
+const struct proc_ns_operations timens_operations = {
+ .name = "time",
+ .type = CLONE_NEWTIME,
+ .get = timens_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+const struct proc_ns_operations timens_for_children_operations = {
+ .name = "time_for_children",
+ .type = CLONE_NEWTIME,
+ .get = timens_for_children_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+struct time_namespace init_time_ns = {
+ .kref = KREF_INIT(3),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_TIME_INIT_INO,
+ .ns.ops = &timens_operations,
+ .frozen_offsets = true,
+};
+
+static int __init time_ns_init(void)
+{
+ return 0;
+}
+subsys_initcall(time_ns_init);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..77c0c2370b6d 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -14,8 +14,6 @@
#include "posix-timers.h"
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
@@ -315,8 +310,8 @@ out:
}
const struct k_clock clock_posix_dynamic = {
- .clock_getres = pc_clock_getres,
- .clock_set = pc_clock_settime,
- .clock_get = pc_clock_gettime,
- .clock_adj = pc_clock_adjtime,
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get_timespec = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 42d512fcfda2..8ff6da77a01f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
}
const struct k_clock clock_posix_cpu = {
- .clock_getres = posix_cpu_clock_getres,
- .clock_set = posix_cpu_clock_set,
- .clock_get = posix_cpu_clock_get,
- .timer_create = posix_cpu_timer_create,
- .nsleep = posix_cpu_nsleep,
- .timer_set = posix_cpu_timer_set,
- .timer_del = posix_cpu_timer_del,
- .timer_get = posix_cpu_timer_get,
- .timer_rearm = posix_cpu_timer_rearm,
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get_timespec = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+ .timer_rearm = posix_cpu_timer_rearm,
};
const struct k_clock clock_process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get_timespec = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
};
const struct k_clock clock_thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .timer_create = thread_cpu_timer_create,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get_timespec = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 67df65f887ac..fcb3b21d8bdc 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include <linux/timekeeping.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
#include <linux/compat.h>
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
break;
case CLOCK_MONOTONIC:
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
break;
case CLOCK_BOOTTIME:
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
break;
default:
return -EINVAL;
@@ -126,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -144,13 +148,19 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
+#endif
+
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
@@ -212,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -230,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0ec5b7a1d769..ff0eb30de346 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
+#include <linux/time_namespace.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -165,12 +166,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
}
+static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
+{
+ return ktime_get_real();
+}
+
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
@@ -187,18 +193,25 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
+static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
+{
+ return ktime_get();
+}
+
/*
* Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -213,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec64 *tp)
{
ktime_get_coarse_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -222,18 +236,29 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
+{
+ return ktime_get_boottime();
+}
+
+static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
}
+static ktime_t posix_get_tai_ktime(clockid_t which_clock)
+{
+ return ktime_get_clocktai();
+}
+
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
@@ -645,7 +670,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
const struct k_clock *kc = timr->kclock;
ktime_t now, remaining, iv;
- struct timespec64 ts64;
bool sig_none;
sig_none = timr->it_sigev_notify == SIGEV_NONE;
@@ -663,12 +687,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
return;
}
- /*
- * The timespec64 based conversion is suboptimal, but it's not
- * worth to implement yet another callback.
- */
- kc->clock_get(timr->it_clock, &ts64);
- now = timespec64_to_ktime(ts64);
+ now = kc->clock_get_ktime(timr->it_clock);
/*
* When a requeue is pending or this is a SIGEV_NONE timer move the
@@ -781,7 +800,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
- * functions which use timr->kclock->clock_get() work.
+ * functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
@@ -866,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
+ if (flags & TIMER_ABSTIME)
+ expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
@@ -1067,7 +1088,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get_timespec(which_clock, &kernel_tp);
if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
@@ -1149,7 +1170,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
if (!kc)
return -EINVAL;
- err = kc->clock_get(which_clock, &ts);
+ err = kc->clock_get_timespec(which_clock, &ts);
if (!err && put_old_timespec32(&ts, tp))
err = -EFAULT;
@@ -1200,7 +1221,22 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
- return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
+static int common_nsleep_timens(const clockid_t which_clock, int flags,
+ const struct timespec64 *rqtp)
+{
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -1261,7 +1297,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_clock_realtime_get,
+ .clock_get_timespec = posix_get_realtime_timespec,
+ .clock_get_ktime = posix_get_realtime_ktime,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1279,8 +1316,9 @@ static const struct k_clock clock_realtime = {
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_ktime_get_ts,
- .nsleep = common_nsleep,
+ .clock_get_timespec = posix_get_monotonic_timespec,
+ .clock_get_ktime = posix_get_monotonic_ktime,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
@@ -1295,22 +1333,23 @@ static const struct k_clock clock_monotonic = {
static const struct k_clock clock_monotonic_raw = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_monotonic_raw,
+ .clock_get_timespec = posix_get_monotonic_raw,
};
static const struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
+ .clock_get_timespec = posix_get_realtime_coarse,
};
static const struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
+ .clock_get_timespec = posix_get_monotonic_coarse,
};
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_tai,
+ .clock_get_ktime = posix_get_tai_ktime,
+ .clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1326,8 +1365,9 @@ static const struct k_clock clock_tai = {
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
- .nsleep = common_nsleep,
+ .clock_get_ktime = posix_get_boottime_ktime,
+ .clock_get_timespec = posix_get_boottime_timespec,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 897c29e162b9..f32a2ebba9b8 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -6,8 +6,11 @@ struct k_clock {
struct timespec64 *tp);
int (*clock_set)(const clockid_t which_clock,
const struct timespec64 *tp);
- int (*clock_get)(const clockid_t which_clock,
- struct timespec64 *tp);
+ /* Returns the clock value in the current time namespace. */
+ int (*clock_get_timespec)(const clockid_t which_clock,
+ struct timespec64 *tp);
+ /* Returns the clock value in the root time namespace. */
+ ktime_t (*clock_get_ktime)(const clockid_t which_clock);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index dbd69052eaa6..e4332e3e2d56 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -169,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- unsigned long r;
+ unsigned long r, flags;
char r_unit;
struct clock_read_data rd;
if (cd.rate > rate)
return;
- WARN_ON(!irqs_disabled());
+ /* Cannot register a sched_clock with interrupts on */
+ local_irq_save(flags);
/* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
@@ -233,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
+ local_irq_restore(flags);
+
pr_debug("Registered %pS as sched_clock source\n", read);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 59225b484e4e..7e5d3524e924 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -11,6 +11,7 @@
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
+#include <linux/nmi.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
@@ -558,6 +559,7 @@ void tick_unfreeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
+ touch_softlockup_watchdog();
tick_resume_local();
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8b192e67aabc..a792d21cac64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Do a quick check without holding jiffies_lock:
+ * The READ_ONCE() pairs with two updates done later in this function.
*/
- delta = ktime_sub(now, last_jiffies_update);
+ delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
@@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now)
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
- last_jiffies_update = ktime_add(last_jiffies_update,
- tick_period);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
@@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now)
ticks = ktime_divns(delta, incr);
- last_jiffies_update = ktime_add_ns(last_jiffies_update,
- incr * ticks);
+ /* Pairs with the lockless read in this function. */
+ WRITE_ONCE(last_jiffies_update,
+ ktime_add_ns(last_jiffies_update,
+ incr * ticks));
}
do_timer(++ticks);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 5ee0f7709410..9577c89179cd 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdata[CS_RAW].mult = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
- /* CLOCK_REALTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
- vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
/* CLOCK_MONOTONIC */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
@@ -70,12 +65,6 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
-
- /*
- * Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
- */
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
}
void update_vsyscall(struct timekeeper *tk)
@@ -84,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk)
struct vdso_timestamp *vdso_ts;
u64 nsec;
- if (__arch_update_vdso_data()) {
- /*
- * Some architectures might want to skip the update of the
- * data page.
- */
- return;
- }
-
/* copy vsyscall data */
vdso_write_begin(vdata);
vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+ /* CLOCK_REALTIME also required for time() */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
@@ -110,7 +96,18 @@ void update_vsyscall(struct timekeeper *tk)
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
- update_vdso_data(vdata, tk);
+ /*
+ * Read without the seqlock held by clock_getres().
+ * Note: No need to have a second copy.
+ */
+ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+
+ /*
+ * Architectures can opt out of updating the high resolution part
+ * of the VDSO.
+ */
+ if (__arch_update_vdso_data())
+ update_vdso_data(vdata, tk);
__arch_update_vsyscall(vdata, tk);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a2659735db73..1af321dec0f1 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -96,6 +96,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
return 0;
}
+/*
+ * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct
+ * functions. But those archs currently don't support direct functions
+ * anyway, and ftrace_find_rec_direct() is just a stub for them.
+ * Define MCOUNT_INSN_SIZE to keep those archs compiling.
+ */
+#ifndef MCOUNT_INSN_SIZE
+/* Make sure this only works without direct calls */
+# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+# error MCOUNT_INSN_SIZE not defined with direct calls enabled
+# endif
+# define MCOUNT_INSN_SIZE 0
+#endif
+
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ac99a3500076..9bf1f2cd515e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -526,8 +526,7 @@ static int function_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- avg = rec->time;
- do_div(avg, rec->counter);
+ avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
goto out;
#endif
@@ -553,7 +552,8 @@ static int function_stat_show(struct seq_file *m, void *v)
* Divide only 1000 for ns^2 -> us^2 conversion.
* trace_print_graph_duration will divide 1000 again.
*/
- do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev,
+ rec->counter * (rec->counter - 1) * 1000);
}
trace_seq_init(&s);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c75410f9698..5b6ee4aadc26 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4685,6 +4685,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ if ((mask == TRACE_ITER_RECORD_TGID) ||
+ (mask == TRACE_ITER_RECORD_CMD))
+ lockdep_assert_held(&event_mutex);
+
/* do nothing if flag is already set */
if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
@@ -4752,6 +4756,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
cmp += len;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = match_string(trace_options, -1, cmp);
@@ -4762,6 +4767,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
/*
* If the first trailing whitespace is replaced with '\0' by strstrip,
@@ -8076,9 +8082,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = set_tracer_flag(tr, 1 << index, val);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (ret < 0)
return ret;
@@ -9412,6 +9420,11 @@ __init static int tracing_set_default_clock(void)
{
/* sched_clock_stable() is determined in late_initcall */
if (!trace_boot_clock && !sched_clock_stable()) {
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
+ pr_warn("Can not set tracing clock due to lockdown\n");
+ return -EPERM;
+ }
+
printk(KERN_WARNING
"Unstable clock detected, switching default tracing clock to \"global\"\n"
"If you want to keep using the local clock, then add:\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c6de3cebc127..a5b614cc3887 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -320,7 +320,8 @@ void trace_event_enable_cmd_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
@@ -334,7 +335,6 @@ void trace_event_enable_cmd_record(bool enable)
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
void trace_event_enable_tgid_record(bool enable)
@@ -342,7 +342,8 @@ void trace_event_enable_tgid_record(bool enable)
struct trace_event_file *file;
struct trace_array *tr;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
+
do_for_each_event_file(tr, file) {
if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
@@ -356,7 +357,6 @@ void trace_event_enable_tgid_record(bool enable)
&file->flags);
}
} while_for_each_event_file();
- mutex_unlock(&event_mutex);
}
static int __ftrace_event_enable_disable(struct trace_event_file *file,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c9a74f82b14a..bf44f6bbd0c3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1662,7 +1662,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
- kfree(filter);
+ __free_filter(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
tracepoint_synchronize_unregister();
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f49d1a36d3ae..6ac35b9e195d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -116,6 +116,7 @@ struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
+ unsigned int ref;
unsigned int size;
unsigned int offset;
unsigned int is_signed;
@@ -911,7 +912,26 @@ static notrace void trace_event_raw_event_synth(void *__data,
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
- entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[var_ref_idx + i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
n_u64++;
}
}
@@ -1747,11 +1767,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data,
struct event_trigger_data *test;
struct hist_field *hist_field;
+ lockdep_assert_held(&event_mutex);
+
hist_field = find_var_field(hist_data, var_name);
if (hist_field)
return hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -1801,7 +1823,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file,
struct event_trigger_data *test;
struct hist_field *hist_field;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
test_data = test->private_data;
hist_field = find_var_field(test_data, var_name);
@@ -2404,8 +2428,16 @@ static int contains_operator(char *str)
return field_op;
}
+static void get_hist_field(struct hist_field *hist_field)
+{
+ hist_field->ref++;
+}
+
static void __destroy_hist_field(struct hist_field *hist_field)
{
+ if (--hist_field->ref > 1)
+ return;
+
kfree(hist_field->var.name);
kfree(hist_field->name);
kfree(hist_field->type);
@@ -2447,6 +2479,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field)
return NULL;
+ hist_field->ref = 1;
+
hist_field->hist_data = hist_data;
if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
@@ -2642,6 +2676,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
{
unsigned long flags = HIST_FIELD_FL_VAR_REF;
struct hist_field *ref_field;
+ int i;
+
+ /* Check if the variable already exists */
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ ref_field = hist_data->var_refs[i];
+ if (ref_field->var.idx == var_field->var.idx &&
+ ref_field->var.hist_data == var_field->hist_data) {
+ get_hist_field(ref_field);
+ return ref_field;
+ }
+ }
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
@@ -3096,7 +3141,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
{
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (test->private_data == hist_data)
return test->filter_str;
@@ -3147,9 +3194,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data,
struct event_trigger_data *test;
unsigned int n_keys;
+ lockdep_assert_held(&event_mutex);
+
n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
@@ -5509,7 +5558,7 @@ static int hist_show(struct seq_file *m, void *v)
goto out_unlock;
}
- list_for_each_entry_rcu(data, &event_file->triggers, list) {
+ list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
@@ -5902,7 +5951,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (hist_data->attrs->name && !named_data)
goto new;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -5986,10 +6037,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data,
struct event_trigger_data *test, *named_data = NULL;
bool match = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (hist_trigger_match(data, test, named_data, false)) {
match = true;
@@ -6007,10 +6060,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data,
struct hist_trigger_data *hist_data = data->private_data;
struct event_trigger_data *test, *named_data = NULL;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -6032,10 +6087,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test, *named_data = NULL;
bool unregistered = false;
+ lockdep_assert_held(&event_mutex);
+
if (hist_data->attrs->name)
named_data = find_named_trigger(hist_data->attrs->name);
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (!hist_trigger_match(data, test, named_data, false))
continue;
@@ -6061,7 +6118,9 @@ static bool hist_file_check_refs(struct trace_event_file *file)
struct hist_trigger_data *hist_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
hist_data = test->private_data;
if (check_var_refs(hist_data))
@@ -6304,7 +6363,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
- list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+ list_for_each_entry_rcu(test, &enable_data->file->triggers, list,
+ lockdep_is_held(&event_mutex)) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
if (enable_data->enable)
test->paused = false;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index d45079ee62f8..22bcf7c51d1e 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -195,7 +195,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
unsigned long irq_flags;
void *entry = NULL;
int entry_size;
- u64 val;
+ u64 val = 0;
int len;
entry = trace_alloc_entry(call, &entry_size);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 2cd53ca21b51..40106fff06a4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file)
struct event_trigger_data *data;
bool set_cond = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->filter || event_command_post_trigger(data->cmd_ops) ||
event_command_needs_rec(data->cmd_ops)) {
set_cond = true;
@@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
ret = -EEXIST;
goto out;
@@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
@@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob,
struct event_trigger_data *test;
int ret = 0;
- list_for_each_entry_rcu(test, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(test, &file->triggers, list) {
test_enable_data = test->private_data;
if (test_enable_data &&
(test->cmd_ops->trigger_type ==
@@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *data;
bool unregistered = false;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(data, &file->triggers, list) {
enable_data = data->private_data;
if (enable_data &&
(data->cmd_ops->trigger_type ==
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7f890262c8a3..3f54dc2f6e1c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group);
+ ret = trace_probe_init(&tk->tp, event, group, false);
if (ret < 0)
goto error;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 905b10af5d5c..9ae87be422f2 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -984,15 +984,19 @@ void trace_probe_cleanup(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group)
+ const char *group, bool alloc_filter)
{
struct trace_event_call *call;
+ size_t size = sizeof(struct trace_probe_event);
int ret = 0;
if (!event || !group)
return -EINVAL;
- tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL);
+ if (alloc_filter)
+ size += sizeof(struct trace_uprobe_filter);
+
+ tp->event = kzalloc(size, GFP_KERNEL);
if (!tp->event)
return -ENOMEM;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4ee703728aec..a0ff9e200ef6 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -223,6 +223,12 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
/* Event call and class holder */
struct trace_probe_event {
unsigned int flags; /* For TP_FLAG_* */
@@ -230,6 +236,7 @@ struct trace_probe_event {
struct trace_event_call call;
struct list_head files;
struct list_head probes;
+ struct trace_uprobe_filter filter[0];
};
struct trace_probe {
@@ -322,7 +329,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group);
+ const char *group, bool alloc_filter);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5e43b9664eca..617e297f46dc 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -630,7 +630,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
- return;
+ goto fail_deprobe_sched_switch;
}
wakeup_reset(tr);
@@ -648,6 +648,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
+fail_deprobe_sched_switch:
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
fail_deprobe_wake_new:
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 344e4c1aa09c..87de6edafd14 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -381,7 +381,7 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
const void *buf, size_t len, bool ascii)
{
- unsigned int save_len = s->seq.len;
+ unsigned int save_len = s->seq.len;
if (s->full)
return 0;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4df9a209f7ca..c557f42a9397 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -283,6 +283,11 @@ static void check_stack(unsigned long ip, unsigned long *stack)
local_irq_restore(flags);
}
+/* Some archs may not define MCOUNT_INSN_SIZE */
+#ifndef MCOUNT_INSN_SIZE
+# define MCOUNT_INSN_SIZE 0
+#endif
+
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 352073d36585..2619bc5ed520 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -34,12 +34,6 @@ struct uprobe_trace_entry_head {
#define DATAOF_TRACE_ENTRY(entry, is_return) \
((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
-struct trace_uprobe_filter {
- rwlock_t rwlock;
- int nr_systemwide;
- struct list_head perf_events;
-};
-
static int trace_uprobe_create(int argc, const char **argv);
static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev);
static int trace_uprobe_release(struct dyn_event *ev);
@@ -60,7 +54,6 @@ static struct dyn_event_operations trace_uprobe_ops = {
*/
struct trace_uprobe {
struct dyn_event devent;
- struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
@@ -351,7 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group);
+ ret = trace_probe_init(&tu->tp, event, group, true);
if (ret < 0)
goto error;
@@ -359,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
- init_trace_uprobe_filter(&tu->filter);
+ init_trace_uprobe_filter(tu->tp.event->filter);
return tu;
error:
@@ -1067,13 +1060,14 @@ static void __probe_event_disable(struct trace_probe *tp)
struct trace_probe *pos;
struct trace_uprobe *tu;
+ tu = container_of(tp, struct trace_uprobe, tp);
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
+
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
tu = container_of(pos, struct trace_uprobe, tp);
if (!tu->inode)
continue;
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
-
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->inode = NULL;
}
@@ -1108,7 +1102,7 @@ static int probe_event_enable(struct trace_event_call *call,
}
tu = container_of(tp, struct trace_uprobe, tp);
- WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+ WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
if (enabled)
return 0;
@@ -1205,39 +1199,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
}
static inline bool
-uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+trace_uprobe_filter_event(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
- return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
+ return __uprobe_perf_filter(filter, event->hw.target->mm);
}
-static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
list_del(&event->hw.tp_list);
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
(event->hw.target->flags & PF_EXITING) ||
- uprobe_filter_event(tu, event);
+ trace_uprobe_filter_event(filter, event);
} else {
- tu->filter.nr_systemwide--;
- done = tu->filter.nr_systemwide;
+ filter->nr_systemwide--;
+ done = filter->nr_systemwide;
}
- write_unlock(&tu->filter.rwlock);
+ write_unlock(&filter->rwlock);
- if (!done)
- return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
-
- return 0;
+ return done;
}
-static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+/* This returns true if the filter always covers target mm */
+static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter,
+ struct perf_event *event)
{
bool done;
- int err;
- write_lock(&tu->filter.rwlock);
+ write_lock(&filter->rwlock);
if (event->hw.target) {
/*
* event->parent != NULL means copy_process(), we can avoid
@@ -1247,28 +1241,21 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
* attr.enable_on_exec means that exec/mmap will install the
* breakpoints we need.
*/
- done = tu->filter.nr_systemwide ||
+ done = filter->nr_systemwide ||
event->parent || event->attr.enable_on_exec ||
- uprobe_filter_event(tu, event);
- list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ trace_uprobe_filter_event(filter, event);
+ list_add(&event->hw.tp_list, &filter->perf_events);
} else {
- done = tu->filter.nr_systemwide;
- tu->filter.nr_systemwide++;
+ done = filter->nr_systemwide;
+ filter->nr_systemwide++;
}
- write_unlock(&tu->filter.rwlock);
+ write_unlock(&filter->rwlock);
- err = 0;
- if (!done) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
- if (err)
- uprobe_perf_close(tu, event);
- }
- return err;
+ return done;
}
-static int uprobe_perf_multi_call(struct trace_event_call *call,
- struct perf_event *event,
- int (*op)(struct trace_uprobe *tu, struct perf_event *event))
+static int uprobe_perf_close(struct trace_event_call *call,
+ struct perf_event *event)
{
struct trace_probe *pos, *tp;
struct trace_uprobe *tu;
@@ -1278,25 +1265,59 @@ static int uprobe_perf_multi_call(struct trace_event_call *call,
if (WARN_ON_ONCE(!tp))
return -ENODEV;
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_remove(tu->tp.event->filter, event))
+ return 0;
+
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
tu = container_of(pos, struct trace_uprobe, tp);
- ret = op(tu, event);
+ ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
if (ret)
break;
}
return ret;
}
+
+static int uprobe_perf_open(struct trace_event_call *call,
+ struct perf_event *event)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_uprobe *tu;
+ int err = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ tu = container_of(tp, struct trace_uprobe, tp);
+ if (trace_uprobe_filter_add(tu->tp.event->filter, event))
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err) {
+ uprobe_perf_close(call, event);
+ break;
+ }
+ }
+
+ return err;
+}
+
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
+ struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
int ret;
tu = container_of(uc, struct trace_uprobe, consumer);
- read_lock(&tu->filter.rwlock);
- ret = __uprobe_perf_filter(&tu->filter, mm);
- read_unlock(&tu->filter.rwlock);
+ filter = tu->tp.event->filter;
+
+ read_lock(&filter->rwlock);
+ ret = __uprobe_perf_filter(filter, mm);
+ read_unlock(&filter->rwlock);
return ret;
}
@@ -1419,10 +1440,10 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
return 0;
case TRACE_REG_PERF_OPEN:
- return uprobe_perf_multi_call(event, data, uprobe_perf_open);
+ return uprobe_perf_open(event, data);
case TRACE_REG_PERF_CLOSE:
- return uprobe_perf_multi_call(event, data, uprobe_perf_close);
+ return uprobe_perf_close(event, data);
#endif
default:
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9a1c22310323..9e31bfc818ff 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
#define DEFINE_TRACING_MAP_CMP_FN(type) \
static int tracing_map_cmp_##type(void *val_a, void *val_b) \
{ \
- type a = *(type *)val_a; \
- type b = *(type *)val_b; \
+ type a = (type)(*(u64 *)val_a); \
+ type b = (type)(*(u64 *)val_b); \
\
return (a > b) ? 1 : ((a < b) ? -1 : 0); \
}
diff --git a/kernel/up.c b/kernel/up.c
index 862b460ab97a..53144d056252 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,8 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same condtions in UP and SMP.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
{
unsigned long flags;
@@ -84,11 +83,10 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f41334ef0971..b6b1f54a7837 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void)
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+#define SOFTLOCKUP_RESET ULONG_MAX
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -173,8 +175,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
@@ -274,7 +274,7 @@ notrace void touch_softlockup_watchdog_sched(void)
* Preemption can be enabled. It doesn't matter which CPU's timestamp
* gets zeroed here, so use the raw_ operation.
*/
- raw_cpu_write(watchdog_touch_ts, 0);
+ raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
notrace void touch_softlockup_watchdog(void)
@@ -298,14 +298,14 @@ void touch_all_softlockup_watchdogs(void)
* the softlockup check.
*/
for_each_cpu(cpu, &watchdog_allowed_mask)
- per_cpu(watchdog_touch_ts, cpu) = 0;
+ per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
wq_watchdog_touch(-1);
}
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
- __this_cpu_write(watchdog_touch_ts, 0);
+ __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
}
static int is_softlockup(unsigned long touch_ts)
@@ -350,8 +350,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
*/
static int softlockup_fn(void *data)
{
- __this_cpu_write(soft_lockup_hrtimer_cnt,
- __this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
complete(this_cpu_ptr(&softlockup_completion));
@@ -383,7 +381,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
- if (touch_ts == 0) {
+ if (touch_ts == SOFTLOCKUP_RESET) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
/*
* If the time stamp was touched atomically
@@ -416,22 +414,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true) {
- /*
- * When multiple processes are causing softlockups the
- * softlockup detector only warns on the first one
- * because the code relies on a full quiet cycle to
- * re-arm. The second process prevents the quiet cycle
- * and never gets reported. Use task pointers to detect
- * this.
- */
- if (__this_cpu_read(softlockup_task_ptr_saved) !=
- current) {
- __this_cpu_write(soft_watchdog_warn, false);
- __touch_watchdog();
- }
+ if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -447,7 +431,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
- __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cfc923558e04..4bdfa270a37b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2266,7 +2266,7 @@ __acquires(&pool->lock)
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
- trace_workqueue_execute_end(work);
+ trace_workqueue_execute_end(work, worker->current_func);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);