aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/audit.h12
-rw-r--r--kernel/auditfilter.c3
-rw-r--r--kernel/auditsc.c19
-rw-r--r--kernel/bpf/bpf_lsm.c3
-rw-r--r--kernel/bpf/disasm.c2
-rw-r--r--kernel/bpf/inode.c4
-rw-r--r--kernel/bpf/stackmap.c12
-rw-r--r--kernel/bpf/trampoline.c30
-rw-r--r--kernel/bpf/verifier.c235
-rw-r--r--kernel/cfi.c329
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-v1.c2
-rw-r--r--kernel/cgroup/cpuset.c6
-rw-r--r--kernel/cgroup/misc.c407
-rw-r--r--kernel/debug/gdbstub.c4
-rw-r--r--kernel/debug/kdb/kdb_bp.c75
-rw-r--r--kernel/debug/kdb/kdb_main.c598
-rw-r--r--kernel/debug/kdb/kdb_private.h5
-rw-r--r--kernel/debug/kdb/kdb_support.c18
-rw-r--r--kernel/entry/common.c4
-rw-r--r--kernel/gcov/clang.c29
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/ipi.c2
-rw-r--r--kernel/irq/irq_sim.c31
-rw-r--r--kernel/irq/irqdesc.c2
-rw-r--r--kernel/irq/irqdomain.c51
-rw-r--r--kernel/irq/manage.c23
-rw-r--r--kernel/irq/matrix.c11
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq/timings.c8
-rw-r--r--kernel/kallsyms.c55
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/livepatch/transition.c5
-rw-r--r--kernel/locking/lockdep.c7
-rw-r--r--kernel/locking/lockdep_internals.h8
-rw-r--r--kernel/locking/qrwlock.c7
-rw-r--r--kernel/module.c43
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/internal.h7
-rw-r--r--kernel/printk/printk.c478
-rw-r--r--kernel/printk/printk_safe.c30
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c29
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/softirq.c355
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/time/alarmtimer.c6
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-cpu-timers.c6
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/time/test_udelay.c7
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c2
-rw-r--r--kernel/time/tick-broadcast.c20
-rw-r--r--kernel/time/tick-common.c7
-rw-r--r--kernel/time/tick-oneshot.c2
-rw-r--r--kernel/time/tick-sched.c10
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c46
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/time/vsyscall.c2
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace_dynevent.c6
-rw-r--r--kernel/user_namespace.c65
-rw-r--r--kernel/watchdog.c5
-rw-r--r--kernel/workqueue.c21
82 files changed, 2298 insertions, 979 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 320f1f3941b7..e8a6715f38dc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,6 +41,9 @@ KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
+# Don't instrument error handlers
+CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
+
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -111,6 +114,7 @@ obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/audit.c b/kernel/audit.c
index 551a394bc8f4..121d37e700a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2132,7 +2132,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid(current, &sid);
+ security_task_getsecid_subj(current, &sid);
if (!sid)
return 0;
@@ -2353,7 +2353,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
+ security_task_getsecid_subj(current, &audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index 3b9c0945225a..1522e100fd17 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -292,8 +292,8 @@ extern void audit_filter_inodes(struct task_struct *tsk,
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, t, s) 0
-#define audit_put_watch(w) {}
-#define audit_get_watch(w) {}
+#define audit_put_watch(w) do { } while (0)
+#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
@@ -302,8 +302,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
-#define audit_remove_mark(m)
-#define audit_remove_mark_rule(k)
+#define audit_remove_mark(m) do { } while (0)
+#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
@@ -311,8 +311,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
-#define audit_trim_trees() (void)0
-#define audit_put_tree(tree) (void)0
+#define audit_trim_trees() do { } while (0)
+#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 333b3bcfc545..db2c6b59dfc3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1359,7 +1359,8 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
+ security_task_getsecid_subj(current,
+ &sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47fb48f42c93..175ef6f3ea4e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -667,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid(tsk, &sid);
+ security_task_getsecid_subj(tsk, &sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -805,8 +805,7 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
* (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
static void audit_filter_syscall(struct task_struct *tsk,
- struct audit_context *ctx,
- struct list_head *list)
+ struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
@@ -815,7 +814,7 @@ static void audit_filter_syscall(struct task_struct *tsk,
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
@@ -1627,8 +1626,7 @@ void __audit_free(struct task_struct *tsk)
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
- audit_filter_syscall(tsk, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_syscall(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit();
@@ -1735,8 +1733,7 @@ void __audit_syscall_exit(int success, long return_code)
else
context->return_code = return_code;
- audit_filter_syscall(current, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit();
@@ -2400,7 +2397,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &context->target_sid);
+ security_task_getsecid_obj(t, &context->target_sid);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2427,7 +2424,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &ctx->target_sid);
+ security_task_getsecid_obj(t, &ctx->target_sid);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2448,7 +2445,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 1622a44d1617..0ff58259ccf8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -209,7 +209,8 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid)
+BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..faa54d58972c 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -84,7 +84,7 @@ static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
[BPF_OR >> 4] = "or",
- [BPF_XOR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
};
static const char *const bpf_ldst_string[] = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1576ff331ee4..d2de2abec35b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -543,11 +543,11 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
return PTR_ERR(raw);
if (type == BPF_TYPE_PROG)
- ret = bpf_prog_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
ret = bpf_map_new_fd(raw, f_flags);
else if (type == BPF_TYPE_LINK)
- ret = bpf_link_new_fd(raw);
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
else
return -ENOENT;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be35bfb7fb13..6fbc2abe9c91 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -517,9 +517,17 @@ const struct bpf_func_proto bpf_get_stack_proto = {
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
- struct pt_regs *regs = task_pt_regs(task);
+ struct pt_regs *regs;
+ long res;
- return __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (!try_get_task_stack(task))
+ return -EFAULT;
+
+ regs = task_pt_regs(task);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ put_task_stack(task);
+
+ return res;
}
BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 1f3a4be4b175..4aa8b52adf25 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,6 +9,7 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
+#include <linux/module.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -87,6 +88,26 @@ out:
return tr;
}
+static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
+{
+ struct module *mod;
+ int err = 0;
+
+ preempt_disable();
+ mod = __module_text_address((unsigned long) tr->func.addr);
+ if (mod && !try_module_get(mod))
+ err = -ENOENT;
+ preempt_enable();
+ tr->mod = mod;
+ return err;
+}
+
+static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
+{
+ module_put(tr->mod);
+ tr->mod = NULL;
+}
+
static int is_ftrace_location(void *ip)
{
long addr;
@@ -108,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
ret = unregister_ftrace_direct((long)ip, (long)old_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+
+ if (!ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
@@ -134,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
return ret;
tr->func.ftrace_managed = ret;
+ if (bpf_trampoline_module_get(tr))
+ return -ENOENT;
+
if (tr->func.ftrace_managed)
ret = register_ftrace_direct((long)ip, (long)new_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+
+ if (ret)
+ bpf_trampoline_module_put(tr);
return ret;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 44e4ec1640f1..0399ac092b36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5856,40 +5856,51 @@ static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
return &env->insn_aux_data[env->insn_idx];
}
+enum {
+ REASON_BOUNDS = -1,
+ REASON_TYPE = -2,
+ REASON_PATHS = -3,
+ REASON_LIMIT = -4,
+ REASON_STACK = -5,
+};
+
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
- u32 *ptr_limit, u8 opcode, bool off_is_neg)
+ const struct bpf_reg_state *off_reg,
+ u32 *alu_limit, u8 opcode)
{
+ bool off_is_neg = off_reg->smin_value < 0;
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off, max;
+ u32 max = 0, ptr_limit = 0;
+
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
switch (ptr_reg->type) {
case PTR_TO_STACK:
/* Offset 0 is out-of-bounds, but acceptable start for the
- * left direction, see BPF_REG_FP.
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
*/
max = MAX_BPF_STACK + mask_to_left;
- /* Indirect variable offset stack access is prohibited in
- * unprivileged mode so it's not handled here.
- */
- off = ptr_reg->off + ptr_reg->var_off.value;
- if (mask_to_left)
- *ptr_limit = MAX_BPF_STACK + off;
- else
- *ptr_limit = -off - 1;
- return *ptr_limit >= max ? -ERANGE : 0;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
- if (mask_to_left) {
- *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
- }
- return *ptr_limit >= max ? -ERANGE : 0;
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
+ break;
default:
- return -EINVAL;
+ return REASON_TYPE;
}
+
+ if (ptr_limit >= max)
+ return REASON_LIMIT;
+ *alu_limit = ptr_limit;
+ return 0;
}
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
@@ -5907,7 +5918,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
if (aux->alu_state &&
(aux->alu_state != alu_state ||
aux->alu_limit != alu_limit))
- return -EACCES;
+ return REASON_PATHS;
/* Corresponding fixup done in fixup_bpf_calls(). */
aux->alu_state = alu_state;
@@ -5926,14 +5937,22 @@ static int sanitize_val_alu(struct bpf_verifier_env *env,
return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
}
+static bool sanitize_needed(u8 opcode)
+{
+ return opcode == BPF_ADD || opcode == BPF_SUB;
+}
+
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
struct bpf_reg_state *dst_reg,
- bool off_is_neg)
+ struct bpf_insn_aux_data *tmp_aux,
+ const bool commit_window)
{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
@@ -5951,18 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;
- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
- alu_state |= ptr_is_dst_reg ?
- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
-
- err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+ err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
if (err < 0)
return err;
+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = tmp_aux->alu_state;
+ alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ }
+
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
if (err < 0)
return err;
do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ */
+ if (commit_window)
+ return 0;
+
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
* masking when off was not within expected range. If off
@@ -5979,7 +6013,46 @@ do_sim:
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
- return !ret ? -EFAULT : 0;
+ return !ret ? REASON_STACK : 0;
+}
+
+static int sanitize_err(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int reason,
+ const struct bpf_reg_state *off_reg,
+ const struct bpf_reg_state *dst_reg)
+{
+ static const char *err = "pointer arithmetic with it prohibited for !root";
+ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
+ u32 dst = insn->dst_reg, src = insn->src_reg;
+
+ switch (reason) {
+ case REASON_BOUNDS:
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
+ off_reg == dst_reg ? dst : src, err);
+ break;
+ case REASON_TYPE:
+ verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
+ off_reg == dst_reg ? src : dst, err);
+ break;
+ case REASON_PATHS:
+ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
+ dst, op, err);
+ break;
+ case REASON_LIMIT:
+ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
+ dst, op, err);
+ break;
+ case REASON_STACK:
+ verbose(env, "R%d could not be pushed for speculative verification, %s\n",
+ dst, err);
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reason (%d)\n",
+ reason);
+ break;
+ }
+
+ return -EACCES;
}
/* check that stack access falls within stack limits and that 'reg' doesn't
@@ -6016,6 +6089,37 @@ static int check_stack_access_for_ptr_arithmetic(
return 0;
}
+static int sanitize_check_bounds(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ const struct bpf_reg_state *dst_reg)
+{
+ u32 dst = insn->dst_reg;
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (env->bypass_spec_v1)
+ return 0;
+
+ switch (dst_reg->type) {
+ case PTR_TO_STACK:
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
+ dst_reg->off + dst_reg->var_off.value))
+ return -EACCES;
+ break;
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
@@ -6035,8 +6139,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
- u32 dst = insn->dst_reg, src = insn->src_reg;
+ struct bpf_insn_aux_data tmp_aux = {};
u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
int ret;
dst_reg = &regs[dst];
@@ -6084,13 +6189,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
- case PTR_TO_MAP_VALUE:
- if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
- verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
- off_reg == dst_reg ? dst : src);
- return -EACCES;
- }
- fallthrough;
default:
break;
}
@@ -6108,13 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &tmp_aux, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
switch (opcode) {
case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
- return ret;
- }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -6165,11 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
- return ret;
- }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -6250,21 +6345,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
- /* For unprivileged we require that resulting offset must be in bounds
- * in order to be able to sanitize access later on.
- */
- if (!env->bypass_spec_v1) {
- if (dst_reg->type == PTR_TO_MAP_VALUE &&
- check_map_access(env, dst, dst_reg->off, 1, false)) {
- verbose(env, "R%d pointer arithmetic of map value goes out of range, "
- "prohibited for !root\n", dst);
- return -EACCES;
- } else if (dst_reg->type == PTR_TO_STACK &&
- check_stack_access_for_ptr_arithmetic(
- env, dst, dst_reg, dst_reg->off +
- dst_reg->var_off.value)) {
- return -EACCES;
- }
+ if (sanitize_check_bounds(env, insn, dst_reg) < 0)
+ return -EACCES;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &tmp_aux, true);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
return 0;
@@ -6858,9 +6945,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
s32 s32_min_val, s32_max_val;
u32 u32_min_val, u32_max_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- u32 dst = insn->dst_reg;
- int ret;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ int ret;
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
@@ -6902,6 +6988,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
return 0;
}
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, NULL, NULL);
+ }
+
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
* There are two classes of instructions: The first class we track both
* alu32 and alu64 sign/unsigned bounds independently this provides the
@@ -6918,21 +7010,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
switch (opcode) {
case BPF_ADD:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
- ret = sanitize_val_alu(env, insn);
- if (ret < 0) {
- verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
- return ret;
- }
scalar32_min_max_sub(dst_reg, &src_reg);
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
@@ -12158,6 +12240,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
u32 btf_id, member_idx;
const char *mname;
+ if (!prog->gpl_compatible) {
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
btf_id = prog->aux->attach_btf_id;
st_ops = bpf_struct_ops_find(btf_id);
if (!st_ops) {
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..e17a56639766
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ *
+ * Copyright (C) 2021 Google LLC
+ */
+
+#include <linux/hardirq.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+
+/* Compiler-defined handler names */
+#ifdef CONFIG_CFI_PERMISSIVE
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail
+#else
+#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
+#endif
+
+static inline void handle_cfi_failure(void *ptr)
+{
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+ WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ else
+ panic("CFI failure (target: %pS)\n", ptr);
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_CFI_CLANG_SHADOW
+/*
+ * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
+ * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
+ */
+typedef u16 shadow_t;
+#define SHADOW_INVALID ((shadow_t)~0UL)
+
+struct cfi_shadow {
+ /* Page index for the beginning of the shadow */
+ unsigned long base;
+ /* An array of __cfi_check locations (as indices to the shadow) */
+ shadow_t shadow[1];
+} __packed;
+
+/*
+ * The shadow covers ~128M from the beginning of the module region. If
+ * the region is larger, we fall back to __module_address for the rest.
+ */
+#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
+
+/* The in-memory size of struct cfi_shadow, always at least one page */
+#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
+#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
+#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
+
+/* The actual size of the shadow array, minus metadata */
+#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
+#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
+
+static DEFINE_MUTEX(shadow_update_lock);
+static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
+
+/* Returns the index in the shadow for the given address */
+static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
+{
+ unsigned long index;
+ unsigned long page = ptr >> PAGE_SHIFT;
+
+ if (unlikely(page < s->base))
+ return -1; /* Outside of module area */
+
+ index = page - s->base;
+
+ if (index >= SHADOW_ARR_SLOTS)
+ return -1; /* Cannot be addressed with shadow */
+
+ return (int)index;
+}
+
+/* Returns the page address for an index in the shadow */
+static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
+ int index)
+{
+ if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+ return 0;
+
+ return (s->base + index) << PAGE_SHIFT;
+}
+
+/* Returns the __cfi_check function address for the given shadow location */
+static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
+ int index)
+{
+ if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+ return 0;
+
+ if (unlikely(s->shadow[index] == SHADOW_INVALID))
+ return 0;
+
+ /* __cfi_check is always page aligned */
+ return (s->base + s->shadow[index]) << PAGE_SHIFT;
+}
+
+static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
+ struct cfi_shadow *next)
+{
+ int i, index, check;
+
+ /* Mark everything invalid */
+ memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
+
+ if (!prev)
+ return; /* No previous shadow */
+
+ /* If the base address didn't change, an update is not needed */
+ if (prev->base == next->base) {
+ memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
+ return;
+ }
+
+ /* Convert the previous shadow to the new address range */
+ for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
+ if (prev->shadow[i] == SHADOW_INVALID)
+ continue;
+
+ index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
+ if (index < 0)
+ continue;
+
+ check = ptr_to_shadow(next,
+ shadow_to_check_fn(prev, prev->shadow[i]));
+ if (check < 0)
+ continue;
+
+ next->shadow[index] = (shadow_t)check;
+ }
+}
+
+static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
+ unsigned long min_addr, unsigned long max_addr)
+{
+ int check_index;
+ unsigned long check = (unsigned long)mod->cfi_check;
+ unsigned long ptr;
+
+ if (unlikely(!PAGE_ALIGNED(check))) {
+ pr_warn("cfi: not using shadow for module %s\n", mod->name);
+ return;
+ }
+
+ check_index = ptr_to_shadow(s, check);
+ if (check_index < 0)
+ return; /* Module not addressable with shadow */
+
+ /* For each page, store the check function index in the shadow */
+ for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+
+ if (index >= 0) {
+ /* Each page must only contain one module */
+ WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
+ s->shadow[index] = (shadow_t)check_index;
+ }
+ }
+}
+
+static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
+ unsigned long min_addr, unsigned long max_addr)
+{
+ unsigned long ptr;
+
+ for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+ int index = ptr_to_shadow(s, ptr);
+
+ if (index >= 0)
+ s->shadow[index] = SHADOW_INVALID;
+ }
+}
+
+typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
+ unsigned long min_addr, unsigned long max_addr);
+
+static void update_shadow(struct module *mod, unsigned long base_addr,
+ update_shadow_fn fn)
+{
+ struct cfi_shadow *prev;
+ struct cfi_shadow *next;
+ unsigned long min_addr, max_addr;
+
+ next = vmalloc(SHADOW_SIZE);
+
+ mutex_lock(&shadow_update_lock);
+ prev = rcu_dereference_protected(cfi_shadow,
+ mutex_is_locked(&shadow_update_lock));
+
+ if (next) {
+ next->base = base_addr >> PAGE_SHIFT;
+ prepare_next_shadow(prev, next);
+
+ min_addr = (unsigned long)mod->core_layout.base;
+ max_addr = min_addr + mod->core_layout.text_size;
+ fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
+
+ set_memory_ro((unsigned long)next, SHADOW_PAGES);
+ }
+
+ rcu_assign_pointer(cfi_shadow, next);
+ mutex_unlock(&shadow_update_lock);
+ synchronize_rcu();
+
+ if (prev) {
+ set_memory_rw((unsigned long)prev, SHADOW_PAGES);
+ vfree(prev);
+ }
+}
+
+void cfi_module_add(struct module *mod, unsigned long base_addr)
+{
+ update_shadow(mod, base_addr, add_module_to_shadow);
+}
+
+void cfi_module_remove(struct module *mod, unsigned long base_addr)
+{
+ update_shadow(mod, base_addr, remove_module_from_shadow);
+}
+
+static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
+ unsigned long ptr)
+{
+ int index;
+
+ if (unlikely(!s))
+ return NULL; /* No shadow available */
+
+ index = ptr_to_shadow(s, ptr);
+ if (index < 0)
+ return NULL; /* Cannot be addressed with shadow */
+
+ return (cfi_check_fn)shadow_to_check_fn(s, index);
+}
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn;
+
+ rcu_read_lock_sched();
+ fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
+ rcu_read_unlock_sched();
+
+ return fn;
+}
+
+#else /* !CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn = NULL;
+ struct module *mod;
+
+ rcu_read_lock_sched();
+ mod = __module_address(ptr);
+ if (mod)
+ fn = mod->cfi_check;
+ rcu_read_unlock_sched();
+
+ return fn;
+}
+
+static inline cfi_check_fn find_check_fn(unsigned long ptr)
+{
+ cfi_check_fn fn = NULL;
+
+ if (is_kernel_text(ptr))
+ return __cfi_check;
+
+ /*
+ * Indirect call checks can happen when RCU is not watching. Both
+ * the shadow and __module_address use RCU, so we need to wake it
+ * up if necessary.
+ */
+ RCU_NONIDLE({
+ if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
+ fn = find_shadow_check_fn(ptr);
+
+ if (!fn)
+ fn = find_module_check_fn(ptr);
+ });
+
+ return fn;
+}
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+ cfi_check_fn fn = find_check_fn((unsigned long)ptr);
+
+ if (likely(fn))
+ fn(id, ptr, diag);
+ else /* Don't allow unchecked modules */
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#else /* !CONFIG_MODULES */
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+ handle_cfi_failure(ptr); /* No modules */
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#endif /* CONFIG_MODULES */
+
+void cfi_failure_handler(void *data, void *ptr, void *vtable)
+{
+ handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(cfi_failure_handler);
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..12f8457ad1f9 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a5751784ad74..391aa570369b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -727,7 +727,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
stats->nr_stopped++;
break;
default:
- if (delayacct_is_task_waiting_on_io(tsk))
+ if (tsk->in_iowait)
stats->nr_io_wait++;
break;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5258b68153e0..a945504c0ae7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -585,7 +585,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
- /* On legacy hiearchy, we must be a subset of our parent cpuset. */
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
@@ -1726,7 +1726,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* When configured nodemask is changed, the effective nodemasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hiearchy, effective_mems will be the same with mems_allowed.
+ * On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
@@ -2500,7 +2500,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
BUG();
}
- /* Unrechable but makes gcc happy */
+ /* Unreachable but makes gcc happy */
return 0;
}
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644
index 000000000000..ec02d963cad1
--- /dev/null
+++ b/kernel/cgroup/misc.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM ULONG_MAX
+
+/* Miscellaneous res name, keep it in sync with enum misc_res_type */
+static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+ /* AMD SEV ASIDs resource */
+ "sev",
+ /* AMD SEV-ES ASIDs resource */
+ "sev_es",
+#endif
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+ return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+ return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Get the current total usage of the resource.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return: Current total usage of the resource.
+ */
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+ if (valid_type(type))
+ return atomic_long_read(&root_cg.res[type].usage);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then the charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+{
+ if (!valid_type(type))
+ return -EINVAL;
+
+ WRITE_ONCE(misc_res_capacity[type], capacity);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+ "misc cgroup resource %s became less than 0",
+ misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ * capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ struct misc_cg *i, *j;
+ int ret;
+ struct misc_res *res;
+ int new_usage;
+
+ if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+ return -EINVAL;
+
+ if (!amount)
+ return 0;
+
+ for (i = cg; i; i = parent_misc(i)) {
+ res = &i->res[type];
+
+ new_usage = atomic_long_add_return(amount, &res->usage);
+ if (new_usage > READ_ONCE(res->max) ||
+ new_usage > READ_ONCE(misc_res_capacity[type])) {
+ if (!res->failed) {
+ pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
+ misc_res_name[type]);
+ pr_cont_cgroup_path(i->css.cgroup);
+ pr_cont("\n");
+ res->failed = true;
+ }
+ ret = -EBUSY;
+ goto err_charge;
+ }
+ }
+ return 0;
+
+err_charge:
+ for (j = cg; j != i; j = parent_misc(j))
+ misc_cg_cancel_charge(type, j, amount);
+ misc_cg_cancel_charge(type, i, amount);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ struct misc_cg *i;
+
+ if (!(amount && valid_type(type) && cg))
+ return;
+
+ for (i = cg; i; i = parent_misc(i))
+ misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+ int i;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long max;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (READ_ONCE(misc_res_capacity[i])) {
+ max = READ_ONCE(cg->res[i].max);
+ if (max == MAX_NUM)
+ seq_printf(sf, "%s max\n", misc_res_name[i]);
+ else
+ seq_printf(sf, "%s %lu\n", misc_res_name[i],
+ max);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ * integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than the unsigned long capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ unsigned long max;
+ int ret = 0, i;
+ enum misc_res_type type = MISC_CG_RES_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (!strcmp(misc_res_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_RES_TYPES)
+ return -EINVAL;
+
+ if (!strcmp(MAX_STR, buf)) {
+ max = MAX_NUM;
+ } else {
+ ret = kstrtoul(buf, 0, &max);
+ if (ret)
+ return ret;
+ }
+
+ cg = css_misc(of_css(of));
+
+ if (READ_ONCE(misc_res_capacity[type]))
+ WRITE_ONCE(cg->res[type].max, max);
+ else
+ ret = -EINVAL;
+
+ return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+ int i;
+ unsigned long usage;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ usage = atomic_long_read(&cg->res[i].usage);
+ if (READ_ONCE(misc_res_capacity[i]) || usage)
+ seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+ int i;
+ unsigned long cap;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ cap = READ_ONCE(misc_res_capacity[i]);
+ if (cap)
+ seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+ }
+
+ return 0;
+}
+
+/* Misc cgroup interface files */
+static struct cftype misc_cg_files[] = {
+ {
+ .name = "max",
+ .write = misc_cg_max_write,
+ .seq_show = misc_cg_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = misc_cg_current_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "capacity",
+ .seq_show = misc_cg_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+ enum misc_res_type i;
+ struct misc_cg *cg;
+
+ if (!parent_css) {
+ cg = &root_cg;
+ } else {
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ WRITE_ONCE(cg->res[i].max, MAX_NUM);
+ atomic_long_set(&cg->res[i].usage, 0);
+ }
+
+ return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Free the misc cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_misc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys misc_cgrp_subsys = {
+ .css_alloc = misc_cg_alloc,
+ .css_free = misc_cg_free,
+ .legacy_cftypes = misc_cg_files,
+ .dfl_cftypes = misc_cg_files,
+};
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e149a0ac9e9e..8372897402f4 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -321,7 +321,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
/*
* Copy the binary array pointed to by buf into mem. Fix $, #, and
* 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * The input buf is overwitten with the result to write to mem.
+ * The input buf is overwritten with the result to write to mem.
*/
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
@@ -952,7 +952,7 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
}
/*
- * This function performs all gdbserial command procesing
+ * This function performs all gdbserial command processing
*/
int gdb_serial_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index ec4940146612..2168f8dacb99 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -522,6 +522,54 @@ static int kdb_ss(int argc, const char **argv)
return KDB_CMD_SS;
}
+static kdbtab_t bptab[] = {
+ { .cmd_name = "bp",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Set/Display breakpoints",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "bl",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Display breakpoints",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "bc",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Clear Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "be",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Enable Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "bd",
+ .cmd_func = kdb_bc,
+ .cmd_usage = "<bpnum>",
+ .cmd_help = "Disable Breakpoint",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .cmd_name = "ss",
+ .cmd_func = kdb_ss,
+ .cmd_usage = "",
+ .cmd_help = "Single Step",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+};
+
+static kdbtab_t bphcmd = {
+ .cmd_name = "bph",
+ .cmd_func = kdb_bp,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "[datar [length]|dataw [length]] Set hw brk",
+ .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+};
+
/* Initialize the breakpoint table and register breakpoint commands. */
void __init kdb_initbptab(void)
@@ -537,30 +585,7 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_table(bptab, ARRAY_SIZE(bptab));
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
-
- kdb_register_flags("ss", kdb_ss, "",
- "Single Step", 1,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- /*
- * Architecture dependent initialization.
- */
+ kdb_register_table(&bphcmd, 1);
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 930ac1b25ec7..1baa96a2ecb8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -33,6 +33,7 @@
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
@@ -84,15 +85,8 @@ static unsigned int kdb_continue_catastrophic =
static unsigned int kdb_continue_catastrophic;
#endif
-/* kdb_commands describes the available commands. */
-static kdbtab_t *kdb_commands;
-#define KDB_BASE_CMD_MAX 50
-static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
-#define for_each_kdbcmd(cmd, num) \
- for ((cmd) = kdb_base_commands, (num) = 0; \
- num < kdb_max_commands; \
- num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+/* kdb_cmds_head describes the available commands. */
+static LIST_HEAD(kdb_cmds_head);
typedef struct _kdbmsg {
int km_diag; /* kdb diagnostic */
@@ -146,42 +140,18 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
* KDB_ENVBUFSIZE if required).
*/
-static char *__env[] = {
+static char *__env[31] = {
#if defined(CONFIG_SMP)
- "PROMPT=[%d]kdb> ",
+ "PROMPT=[%d]kdb> ",
#else
- "PROMPT=kdb> ",
+ "PROMPT=kdb> ",
#endif
- "MOREPROMPT=more> ",
- "RADIX=16",
- "MDCOUNT=8", /* lines of md output */
- KDB_PLATFORM_ENV,
- "DTABCOUNT=30",
- "NOSECT=1",
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
};
static const int __nenv = ARRAY_SIZE(__env);
@@ -324,6 +294,63 @@ int kdbgetintenv(const char *match, int *value)
}
/*
+ * kdb_setenv() - Alter an existing environment variable or create a new one.
+ * @var: Name of the variable
+ * @val: Value of the variable
+ *
+ * Return: Zero on success, a kdb diagnostic on failure.
+ */
+static int kdb_setenv(const char *var, const char *val)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ varlen = strlen(var);
+ vallen = strlen(val);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", var, val);
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], var, varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+/*
+ * kdb_printenv() - Display the current environment variables.
+ */
+static void kdb_printenv(void)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+}
+
+/*
* kdbgetularg - This function will convert a numeric string into an
* unsigned long value.
* Parameters:
@@ -380,10 +407,6 @@ int kdbgetu64arg(const char *arg, u64 *value)
*/
int kdb_set(int argc, const char **argv)
{
- int i;
- char *ep;
- size_t varlen, vallen;
-
/*
* we can be invoked two ways:
* set var=value argv[1]="var", argv[2]="value"
@@ -428,37 +451,7 @@ int kdb_set(int argc, const char **argv)
* Tokenizer squashed the '=' sign. argv[1] is variable
* name, argv[2] = value.
*/
- varlen = strlen(argv[1]);
- vallen = strlen(argv[2]);
- ep = kdballocenv(varlen + vallen + 2);
- if (ep == (char *)0)
- return KDB_ENVBUFFULL;
-
- sprintf(ep, "%s=%s", argv[1], argv[2]);
-
- ep[varlen+vallen+1] = '\0';
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i]
- && ((strncmp(__env[i], argv[1], varlen) == 0)
- && ((__env[i][varlen] == '\0')
- || (__env[i][varlen] == '=')))) {
- __env[i] = ep;
- return 0;
- }
- }
-
- /*
- * Wasn't existing variable. Fit into slot.
- */
- for (i = 0; i < __nenv-1; i++) {
- if (__env[i] == (char *)0) {
- __env[i] = ep;
- return 0;
- }
- }
-
- return KDB_ENVFULL;
+ return kdb_setenv(argv[1], argv[2]);
}
static int kdb_check_regs(void)
@@ -921,7 +914,7 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep = 0;
+ int escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
@@ -1011,25 +1004,17 @@ int kdb_parse(const char *cmdstr)
++argv[0];
}
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- /*
- * If this command is allowed to be abbreviated,
- * check to see if this is it.
- */
-
- if (tp->cmd_minlen
- && (strlen(argv[0]) <= tp->cmd_minlen)) {
- if (strncmp(argv[0],
- tp->cmd_name,
- tp->cmd_minlen) == 0) {
- break;
- }
- }
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+ if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) &&
+ (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0))
+ break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
- break;
- }
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
}
/*
@@ -1037,19 +1022,15 @@ int kdb_parse(const char *cmdstr)
* few characters of this match any of the known commands.
* e.g., md1c20 should match md.
*/
- if (i == kdb_max_commands) {
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- if (strncmp(argv[0],
- tp->cmd_name,
- strlen(tp->cmd_name)) == 0) {
- break;
- }
- }
+ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ if (strncmp(argv[0], tp->cmd_name,
+ strlen(tp->cmd_name)) == 0)
+ break;
}
}
- if (i < kdb_max_commands) {
+ if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
@@ -2073,12 +2054,7 @@ static int kdb_lsmod(int argc, const char **argv)
static int kdb_env(int argc, const char **argv)
{
- int i;
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i])
- kdb_printf("%s\n", __env[i]);
- }
+ kdb_printenv();
if (KDB_DEBUG(MASK))
kdb_printf("KDBDEBUG=0x%x\n",
@@ -2101,7 +2077,7 @@ static int kdb_dmesg(int argc, const char **argv)
int adjust = 0;
int n = 0;
int skip = 0;
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dump_iter iter;
size_t len;
char buf[201];
@@ -2126,8 +2102,8 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2159,8 +2135,8 @@ static int kdb_dmesg(int argc, const char **argv)
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
@@ -2428,17 +2404,14 @@ static int kdb_kgdb(int argc, const char **argv)
static int kdb_help(int argc, const char **argv)
{
kdbtab_t *kt;
- int i;
kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
kdb_printf("-----------------------------"
"-----------------------------\n");
- for_each_kdbcmd(kt, i) {
+ list_for_each_entry(kt, &kdb_cmds_head, list_node) {
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kt->cmd_name)
- continue;
if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
continue;
if (strlen(kt->cmd_usage) > 20)
@@ -2659,7 +2632,6 @@ static int kdb_grep_help(int argc, const char **argv)
* Returns:
* zero for success, one if a duplicate command.
*/
-#define kdb_command_extend 50 /* arbitrary */
int kdb_register_flags(char *cmd,
kdb_func_t func,
char *usage,
@@ -2667,49 +2639,20 @@ int kdb_register_flags(char *cmd,
short minlen,
kdb_cmdflags_t flags)
{
- int i;
kdbtab_t *kp;
- /*
- * Brute force method to determine duplicates
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->cmd_name, cmd) == 0) {
kdb_printf("Duplicate kdb command registered: "
"%s, func %px help %s\n", cmd, func, help);
return 1;
}
}
- /*
- * Insert command into first available location in table
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name == NULL)
- break;
- }
-
- if (i >= kdb_max_commands) {
- kdbtab_t *new = kmalloc_array(kdb_max_commands -
- KDB_BASE_CMD_MAX +
- kdb_command_extend,
- sizeof(*new),
- GFP_KDB);
- if (!new) {
- kdb_printf("Could not allocate new kdb_command "
- "table\n");
- return 1;
- }
- if (kdb_commands) {
- memcpy(new, kdb_commands,
- (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
- kfree(kdb_commands);
- }
- memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
- kdb_command_extend * sizeof(*new));
- kdb_commands = new;
- kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
- kdb_max_commands += kdb_command_extend;
+ kp = kmalloc(sizeof(*kp), GFP_KDB);
+ if (!kp) {
+ kdb_printf("Could not allocate new kdb_command table\n");
+ return 1;
}
kp->cmd_name = cmd;
@@ -2718,11 +2661,27 @@ int kdb_register_flags(char *cmd,
kp->cmd_help = help;
kp->cmd_minlen = minlen;
kp->cmd_flags = flags;
+ kp->is_dynamic = true;
+
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
return 0;
}
EXPORT_SYMBOL_GPL(kdb_register_flags);
+/*
+ * kdb_register_table() - This function is used to register a kdb command
+ * table.
+ * @kp: pointer to kdb command table
+ * @len: length of kdb command table
+ */
+void kdb_register_table(kdbtab_t *kp, size_t len)
+{
+ while (len--) {
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
+ kp++;
+ }
+}
/*
* kdb_register - Compatibility register function for commands that do
@@ -2757,15 +2716,16 @@ EXPORT_SYMBOL_GPL(kdb_register);
*/
int kdb_unregister(char *cmd)
{
- int i;
kdbtab_t *kp;
/*
* find the command.
*/
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kp->cmd_name = NULL;
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->cmd_name, cmd) == 0) {
+ list_del(&kp->list_node);
+ if (kp->is_dynamic)
+ kfree(kp);
return 0;
}
}
@@ -2775,118 +2735,222 @@ int kdb_unregister(char *cmd)
}
EXPORT_SYMBOL_GPL(kdb_unregister);
-/* Initialize the kdb command table. */
-static void __init kdb_inittab(void)
-{
- int i;
- kdbtab_t *kp;
-
- for_each_kdbcmd(kp, i)
- kp->cmd_name = NULL;
-
- kdb_register_flags("md", kdb_md, "<vaddr>",
- "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0,
- KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1,
- KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("rd", kdb_rd, "",
- "Display Registers", 0,
- KDB_ENABLE_REG_READ);
- kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0,
- KDB_ENABLE_REG_WRITE);
- kdb_register_flags("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btt", kdb_bt, "<vaddr>",
- "Backtrace process given its struct task address", 0,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("env", kdb_env, "",
- "Show environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("set", kdb_set, "",
- "Set environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("help", kdb_help, "",
- "Display Help Message", 1,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("?", kdb_help, "",
- "Display Help Message", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0,
- KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, 0);
- kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0,
- KDB_ENABLE_REBOOT);
+static kdbtab_t maintab[] = {
+ { .cmd_name = "md",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mdr",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr> <bytes>",
+ .cmd_help = "Display Raw Memory",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mdp",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<paddr> <bytes>",
+ .cmd_help = "Display Physical Memory",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mds",
+ .cmd_func = kdb_md,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display Memory Symbolically",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "mm",
+ .cmd_func = kdb_mm,
+ .cmd_usage = "<vaddr> <contents>",
+ .cmd_help = "Modify Memory Contents",
+ .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ },
+ { .cmd_name = "go",
+ .cmd_func = kdb_go,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Continue Execution",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .cmd_name = "rd",
+ .cmd_func = kdb_rd,
+ .cmd_usage = "",
+ .cmd_help = "Display Registers",
+ .cmd_flags = KDB_ENABLE_REG_READ,
+ },
+ { .cmd_name = "rm",
+ .cmd_func = kdb_rm,
+ .cmd_usage = "<reg> <contents>",
+ .cmd_help = "Modify Registers",
+ .cmd_flags = KDB_ENABLE_REG_WRITE,
+ },
+ { .cmd_name = "ef",
+ .cmd_func = kdb_ef,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Display exception frame",
+ .cmd_flags = KDB_ENABLE_MEM_READ,
+ },
+ { .cmd_name = "bt",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "[<vaddr>]",
+ .cmd_help = "Stack traceback",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .cmd_name = "btp",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "<pid>",
+ .cmd_help = "Display stack for process <pid>",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "bta",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
+ .cmd_help = "Backtrace all processes matching state flag",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "btc",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "",
+ .cmd_help = "Backtrace current process on each cpu",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "btt",
+ .cmd_func = kdb_bt,
+ .cmd_usage = "<vaddr>",
+ .cmd_help = "Backtrace process given its struct task address",
+ .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .cmd_name = "env",
+ .cmd_func = kdb_env,
+ .cmd_usage = "",
+ .cmd_help = "Show environment variables",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "set",
+ .cmd_func = kdb_set,
+ .cmd_usage = "",
+ .cmd_help = "Set environment variables",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "help",
+ .cmd_func = kdb_help,
+ .cmd_usage = "",
+ .cmd_help = "Display Help Message",
+ .cmd_minlen = 1,
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "?",
+ .cmd_func = kdb_help,
+ .cmd_usage = "",
+ .cmd_help = "Display Help Message",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "cpu",
+ .cmd_func = kdb_cpu,
+ .cmd_usage = "<cpunum>",
+ .cmd_help = "Switch to new cpu",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .cmd_name = "kgdb",
+ .cmd_func = kdb_kgdb,
+ .cmd_usage = "",
+ .cmd_help = "Enter kgdb mode",
+ .cmd_flags = 0,
+ },
+ { .cmd_name = "ps",
+ .cmd_func = kdb_ps,
+ .cmd_usage = "[<flags>|A]",
+ .cmd_help = "Display active task list",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "pid",
+ .cmd_func = kdb_pid,
+ .cmd_usage = "<pidnum>",
+ .cmd_help = "Switch to another task",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
+ { .cmd_name = "reboot",
+ .cmd_func = kdb_reboot,
+ .cmd_usage = "",
+ .cmd_help = "Reboot the machine immediately",
+ .cmd_flags = KDB_ENABLE_REBOOT,
+ },
#if defined(CONFIG_MODULES)
- kdb_register_flags("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0,
- KDB_ENABLE_INSPECT);
+ { .cmd_name = "lsmod",
+ .cmd_func = kdb_lsmod,
+ .cmd_usage = "",
+ .cmd_help = "List loaded kernel modules",
+ .cmd_flags = KDB_ENABLE_INSPECT,
+ },
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_flags("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "sr",
+ .cmd_func = kdb_sr,
+ .cmd_usage = "<key>",
+ .cmd_help = "Magic SysRq key",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "dmesg",
+ .cmd_func = kdb_dmesg,
+ .cmd_usage = "[lines]",
+ .cmd_help = "Display syslog buffer",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
- if (arch_kgdb_ops.enable_nmi) {
- kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- }
- kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0,
- KDB_ENABLE_SIGNAL);
- kdb_register_flags("summary", kdb_summary, "",
- "Summarize the system", 4,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .cmd_name = "defcmd",
+ .cmd_func = kdb_defcmd,
+ .cmd_usage = "name \"usage\" \"help\"",
+ .cmd_help = "Define a set of commands, down to endefcmd",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "kill",
+ .cmd_func = kdb_kill,
+ .cmd_usage = "<-signal> <pid>",
+ .cmd_help = "Send a signal to a process",
+ .cmd_flags = KDB_ENABLE_SIGNAL,
+ },
+ { .cmd_name = "summary",
+ .cmd_func = kdb_summary,
+ .cmd_usage = "",
+ .cmd_help = "Summarize the system",
+ .cmd_minlen = 4,
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .cmd_name = "per_cpu",
+ .cmd_func = kdb_per_cpu,
+ .cmd_usage = "<sym> [<bytes>] [<cpu>]",
+ .cmd_help = "Display per_cpu variables",
+ .cmd_minlen = 3,
+ .cmd_flags = KDB_ENABLE_MEM_READ,
+ },
+ { .cmd_name = "grephelp",
+ .cmd_func = kdb_grep_help,
+ .cmd_usage = "",
+ .cmd_help = "Display help on | grep",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+};
+
+static kdbtab_t nmicmd = {
+ .cmd_name = "disable_nmi",
+ .cmd_func = kdb_disable_nmi,
+ .cmd_usage = "",
+ .cmd_help = "Disable NMI entry to KDB",
+ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ kdb_register_table(maintab, ARRAY_SIZE(maintab));
+ if (arch_kgdb_ops.enable_nmi)
+ kdb_register_table(&nmicmd, 1);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 6cb92f7bbbd0..ccbed9089808 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -174,8 +174,11 @@ typedef struct _kdbtab {
short cmd_minlen; /* Minimum legal # command
* chars required */
kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
+ struct list_head list_node; /* Command list */
+ bool is_dynamic; /* Command table allocation type */
} kdbtab_t;
+extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
/* KDB breakpoint management functions */
@@ -207,9 +210,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
unsigned long mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig(struct task_struct *p, int sig);
-extern void kdb_meminfo_proc_show(void);
extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index f7c1885abeb6..91bb666d7c03 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -654,24 +654,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
return (mask & kdb_task_state_string(state)) != 0;
}
-/*
- * kdb_print_nameval - Print a name and its value, converting the
- * value to a symbol lookup if possible.
- * Inputs:
- * name field name to print
- * val value of field
- */
-void kdb_print_nameval(const char *name, unsigned long val)
-{
- kdb_symtab_t symtab;
- kdb_printf(" %-11.11s ", name);
- if (kdbnearsym(val, &symtab))
- kdb_symbol_print(val, &symtab,
- KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
- else
- kdb_printf("0x%lx\n", val);
-}
-
/* Last ditch allocator for debugging, so we can still debug even when
* the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
* for space usage, not for speed. One smallish memory pool, the free
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 8442e5c9cfa2..a0b3b04fb596 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -341,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
- * assume that it is the first interupt and eventually claim
+ * assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
@@ -422,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMT_DYNAMIC
+#ifdef CONFIG_PREEMPT_DYNAMIC
static_call(irqentry_exit_cond_resched)();
#else
irqentry_exit_cond_resched();
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index 8743150db2ac..c466c7fbdece 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -70,7 +70,9 @@ struct gcov_fn_info {
u32 ident;
u32 checksum;
+#if CONFIG_CLANG_VERSION < 110000
u8 use_extra_checksum;
+#endif
u32 cfg_checksum;
u32 num_counters;
@@ -145,10 +147,8 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
list_add_tail(&info->head, &current_info->functions);
}
-EXPORT_SYMBOL(llvm_gcda_emit_function);
#else
-void llvm_gcda_emit_function(u32 ident, u32 func_checksum,
- u8 use_extra_checksum, u32 cfg_checksum)
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
{
struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -158,12 +158,11 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum,
INIT_LIST_HEAD(&info->head);
info->ident = ident;
info->checksum = func_checksum;
- info->use_extra_checksum = use_extra_checksum;
info->cfg_checksum = cfg_checksum;
list_add_tail(&info->head, &current_info->functions);
}
-EXPORT_SYMBOL(llvm_gcda_emit_function);
#endif
+EXPORT_SYMBOL(llvm_gcda_emit_function);
void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
{
@@ -293,11 +292,16 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
!list_is_last(&fn_ptr2->head, &info2->functions)) {
if (fn_ptr1->checksum != fn_ptr2->checksum)
return false;
+#if CONFIG_CLANG_VERSION < 110000
if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
return false;
if (fn_ptr1->use_extra_checksum &&
fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
return false;
+#else
+ if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+#endif
fn_ptr1 = list_next_entry(fn_ptr1, head);
fn_ptr2 = list_next_entry(fn_ptr2, head);
}
@@ -529,17 +533,22 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
list_for_each_entry(fi_ptr, &info->functions, head) {
u32 i;
- u32 len = 2;
-
- if (fi_ptr->use_extra_checksum)
- len++;
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, len);
+#if CONFIG_CLANG_VERSION < 110000
+ pos += store_gcov_u32(buffer, pos,
+ fi_ptr->use_extra_checksum ? 3 : 2);
+#else
+ pos += store_gcov_u32(buffer, pos, 3);
+#endif
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+#if CONFIG_CLANG_VERSION < 110000
if (fi_ptr->use_extra_checksum)
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#else
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+#endif
pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6d89e33fe3aa..8cc8e5713287 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -761,7 +761,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
- * Interrupt occures on the falling and/or rising edge of a hardware
+ * Interrupt occurs on the falling and/or rising edge of a hardware
* signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
@@ -808,7 +808,7 @@ void handle_edge_irq(struct irq_desc *desc)
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
- * Renable it, if it was not disabled in meantime.
+ * Reenable it, if it was not disabled in meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
@@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
* @dest: The affinity mask to set
* @force: Flag to enforce setting (disable online checks)
*
- * Conditinal, as the underlying parent chip might not implement it.
+ * Conditional, as the underlying parent chip might not implement it.
*/
int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest, bool force)
@@ -1531,7 +1531,7 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
- * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * irq_chip_compose_msi_msg - Compose msi message for a irq chip
* @data: Pointer to interrupt specific data
* @msg: Pointer to the MSI message
*
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 0b0cdf206dc4..7fe6cffe7d0d 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -13,7 +13,7 @@
/*
* What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
+ * Each architecture has to answer this themselves.
*/
static void ack_bad(struct irq_data *data)
{
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 43e3d1be622c..52f11c791bf8 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -107,7 +107,7 @@ free_descs:
* @irq: linux irq number to be destroyed
* @dest: cpumask of cpus which should have the IPI removed
*
- * The IPIs allocated with irq_reserve_ipi() are retuerned to the system
+ * The IPIs allocated with irq_reserve_ipi() are returned to the system
* destroying all virqs associated with them.
*
* Return 0 on success or error code on failure.
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 40880c350b95..0cd02efa3a74 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -24,10 +24,6 @@ struct irq_sim_irq_ctx {
struct irq_sim_work_ctx *work_ctx;
};
-struct irq_sim_devres {
- struct irq_domain *domain;
-};
-
static void irq_sim_irqmask(struct irq_data *data)
{
struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
@@ -216,11 +212,11 @@ void irq_domain_remove_sim(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
-static void devm_irq_domain_release_sim(struct device *dev, void *res)
+static void devm_irq_domain_remove_sim(void *data)
{
- struct irq_sim_devres *this = res;
+ struct irq_domain *domain = data;
- irq_domain_remove_sim(this->domain);
+ irq_domain_remove_sim(domain);
}
/**
@@ -238,20 +234,17 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
- struct irq_sim_devres *dr;
+ struct irq_domain *domain;
+ int ret;
- dr = devres_alloc(devm_irq_domain_release_sim,
- sizeof(*dr), GFP_KERNEL);
- if (!dr)
- return ERR_PTR(-ENOMEM);
+ domain = irq_domain_create_sim(fwnode, num_irqs);
+ if (IS_ERR(domain))
+ return domain;
- dr->domain = irq_domain_create_sim(fwnode, num_irqs);
- if (IS_ERR(dr->domain)) {
- devres_free(dr);
- return dr->domain;
- }
+ ret = devm_add_action_or_reset(dev, devm_irq_domain_remove_sim, domain);
+ if (ret)
+ return ERR_PTR(ret);
- devres_add(dev, dr);
- return dr->domain;
+ return domain;
}
EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cc1a09406c6e..4a617d7312a4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -31,7 +31,7 @@ static int __init irq_affinity_setup(char *str)
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
- * bugreports caused by random comandline masks
+ * bugreports caused by random commandline masks
*/
cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
return 1;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..f42ef868efd3 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* @name: Optional user provided domain name
* @pa: Optional user-provided physical address
*
- * Allocate a struct irqchip_fwid, and return a poiner to the embedded
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
* fwnode_handle (or NULL on failure).
*
* Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
@@ -665,7 +665,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL) {
@@ -703,41 +703,6 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
-/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
- *
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
- */
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
- irq_hw_number_t hwirq_base, int count)
-{
- struct device_node *of_node;
- int ret;
-
- of_node = irq_domain_get_of_node(domain);
- ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(of_node));
- if (unlikely(ret < 0))
- return ret;
-
- irq_domain_associate_many(domain, irq_base, hwirq_base, count);
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-
static int irq_domain_translate(struct irq_domain *d,
struct irq_fwspec *fwspec,
irq_hw_number_t *hwirq, unsigned int *type)
@@ -906,7 +871,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
{
struct irq_data *data;
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
@@ -1436,7 +1401,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
* The whole process to setup an IRQ has been split into two steps.
* The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
* descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
* resources. In this way, it's easier to rollback when failing to
* allocate resources.
*/
@@ -1694,12 +1659,10 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @domain: Domain below which interrupts must be allocated
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to allocate
* @arg: Allocation data (arch/domain specific)
- *
- * Check whether the domain has been setup recursive. If not allocate
- * through the parent domain.
*/
int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs,
@@ -1715,11 +1678,9 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @domain: Domain below which interrupts must be freed
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to free
- *
- * Check whether the domain has been setup recursive. If not free
- * through the parent domain.
*/
void irq_domain_free_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 21ea370fccda..4c14356543d9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -179,7 +179,7 @@ bool irq_can_set_affinity_usr(unsigned int irq)
/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affitnity changed
+ * @desc: irq descriptor which has affinity changed
*
* We just set IRQTF_AFFINITY and delegate the affinity setting
* to the interrupt thread itself. We can not call
@@ -326,7 +326,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
* If the interrupt is not yet activated, just store the affinity
* mask and do not call the chip driver at all. On activation the
* driver has to make sure anyway that the interrupt is in a
- * useable state so startup works.
+ * usable state so startup works.
*/
if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
irqd_is_activated(data) || !irqd_affinity_on_activate(data))
@@ -1054,7 +1054,7 @@ again:
* to IRQS_INPROGRESS and the irq line is masked forever.
*
* This also serializes the state of shared oneshot handlers
- * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * versus "desc->threads_oneshot |= action->thread_mask;" in
* irq_wake_thread(). See the comment there which explains the
* serialization.
*/
@@ -1157,7 +1157,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
/*
* Interrupts explicitly requested as threaded interrupts want to be
- * preemtible - many of them need to sleep and wait for slow busses to
+ * preemptible - many of them need to sleep and wait for slow busses to
* complete.
*/
static irqreturn_t irq_thread_fn(struct irq_desc *desc,
@@ -1697,7 +1697,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
- if (irq_settings_can_autoenable(desc)) {
+ if (!(new->flags & IRQF_NO_AUTOEN) &&
+ irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
} else {
/*
@@ -1912,7 +1913,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
/* Last action releases resources */
if (!desc->action) {
/*
- * Reaquire bus lock as irq_release_resources() might
+ * Reacquire bus lock as irq_release_resources() might
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
@@ -2090,10 +2091,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*
+ * Also shared interrupts do not go well with disabling auto enable.
+ * The sharing interrupt might request it while it's still disabled
+ * and then wait for interrupts forever.
+ *
* Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
* it cannot be set along with IRQF_NO_SUSPEND.
*/
if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) ||
(!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
@@ -2249,7 +2255,8 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
desc = irq_to_desc(irq);
- if (!desc || irq_settings_can_autoenable(desc) ||
+ if (!desc || (irq_settings_can_autoenable(desc) &&
+ !(irqflags & IRQF_NO_AUTOEN)) ||
!irq_settings_can_request(desc) ||
WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
!irq_supports_nmi(desc))
@@ -2746,7 +2753,7 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be storeed
+ * @state: a pointer to a boolean where the state is to be stored
*
* This call snapshots the internal irqchip state of an
* interrupt, returning into @state the bit corresponding to
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 651a4ad6d711..578596e41cb6 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -337,15 +337,14 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
* irq_matrix_reserve - Reserve interrupts
* @m: Matrix pointer
*
- * This is merily a book keeping call. It increments the number of globally
+ * This is merely a book keeping call. It increments the number of globally
* reserved interrupt bits w/o actually allocating them. This allows to
* setup interrupt descriptors w/o assigning low level resources to it.
* The actual allocation happens when the interrupt gets activated.
*/
void irq_matrix_reserve(struct irq_matrix *m)
{
- if (m->global_reserved <= m->global_available &&
- m->global_reserved + 1 > m->global_available)
+ if (m->global_reserved == m->global_available)
pr_warn("Interrupt reservation exceeds available resources\n");
m->global_reserved++;
@@ -356,7 +355,7 @@ void irq_matrix_reserve(struct irq_matrix *m)
* irq_matrix_remove_reserved - Remove interrupt reservation
* @m: Matrix pointer
*
- * This is merily a book keeping call. It decrements the number of globally
+ * This is merely a book keeping call. It decrements the number of globally
* reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
* interrupt was never in use and a real vector allocated, which undid the
* reservation.
@@ -423,7 +422,9 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
return;
- clear_bit(bit, cm->alloc_map);
+ if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map)))
+ return;
+
cm->allocated--;
if(managed)
cm->managed_allocated--;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index def48589ea48..61ca924ef4b4 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,7 +7,7 @@
/**
* irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
- * @desc: Interrupt descpriptor to clean up
+ * @desc: Interrupt descriptor to clean up
* @force_clear: If set clear the move pending bit unconditionally.
* If not set, clear it only when the dying CPU is the
* last one in the pending mask.
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b338d622f26e..c41965e348b5 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -5,7 +5,7 @@
*
* This file is licensed under GPLv2.
*
- * This file contains common code to support Message Signalled Interrupt for
+ * This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
#include <linux/types.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 98138788cb04..7c5cd42df3b9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -144,7 +144,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
@@ -238,7 +238,7 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index bd1d85c610aa..0c46e9fe3a89 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -128,7 +128,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
if (!try_retrigger(desc))
err = irq_sw_resend(desc);
- /* If the retrigger was successfull, mark it with the REPLAY bit */
+ /* If the retrigger was successful, mark it with the REPLAY bit */
if (!err)
desc->istate |= IRQS_REPLAY;
return err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f865e5f4d382..c481d8458325 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -403,6 +403,10 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
desc->irqs_unhandled -= ok;
}
+ if (likely(!desc->irqs_unhandled))
+ return;
+
+ /* Now getting into unhandled irq detection */
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 773b6105c4ae..d309d6fbf5bd 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -84,7 +84,7 @@ void irq_timings_disable(void)
* 2. Log interval
*
* We saw the irq timings allow to compute the interval of the
- * occurrences for a specific interrupt. We can reasonibly assume the
+ * occurrences for a specific interrupt. We can reasonably assume the
* longer is the interval, the higher is the error for the next event
* and we can consider storing those interval values into an array
* where each slot in the array correspond to an interval at the power
@@ -416,7 +416,7 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
* Copy the content of the circular buffer into another buffer
* in order to linearize the buffer instead of dealing with
* wrapping indexes and shifted array which will be prone to
- * error and extremelly difficult to debug.
+ * error and extremely difficult to debug.
*/
for (i = 0; i < count; i++) {
int index = (start + i) & IRQ_TIMINGS_MASK;
@@ -485,7 +485,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
/*
* The interrupt triggered more than one second apart, that
- * ends the sequence as predictible for our purpose. In this
+ * ends the sequence as predictable for our purpose. In this
* case, assume we have the beginning of a sequence and the
* timestamp is the first value. As it is impossible to
* predict anything at this point, return.
@@ -514,7 +514,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
* If more than the array size interrupts happened during the
* last busy/idle cycle, the index wrapped up and we have to
* begin with the next element in the array which is the last one
- * in the sequence, otherwise it is a the index 0.
+ * in the sequence, otherwise it is at the index 0.
*
* - have an indication of the interrupts activity on this CPU
* (eg. irq/sec)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8043a90aa50e..c851ca0ed357 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,27 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
+#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
+/*
+ * LLVM appends a hash to static function names when ThinLTO and CFI are
+ * both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
+ * This causes confusion and potentially breaks user space tools, so we
+ * strip the suffix from expanded symbol names.
+ */
+static inline bool cleanup_symbol_name(char *s)
+{
+ char *res;
+
+ res = strrchr(s, '$');
+ if (res)
+ *res = '\0';
+
+ return res != NULL;
+}
+#else
+static inline bool cleanup_symbol_name(char *s) { return false; }
+#endif
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
@@ -173,6 +194,9 @@ unsigned long kallsyms_lookup_name(const char *name)
if (strcmp(namebuf, name) == 0)
return kallsyms_sym_address(i);
+
+ if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
+ return kallsyms_sym_address(i);
}
return module_kallsyms_lookup_name(name);
}
@@ -303,7 +327,9 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
- return namebuf;
+
+ ret = namebuf;
+ goto found;
}
/* See if it's in a module or a BPF JITed image. */
@@ -316,11 +342,16 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = ftrace_mod_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+found:
+ cleanup_symbol_name(namebuf);
return ret;
}
int lookup_symbol_name(unsigned long addr, char *symname)
{
+ int res;
+
symname[0] = '\0';
symname[KSYM_NAME_LEN - 1] = '\0';
@@ -331,15 +362,23 @@ int lookup_symbol_name(unsigned long addr, char *symname)
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
symname, KSYM_NAME_LEN);
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_name(addr, symname);
+ res = lookup_module_symbol_name(addr, symname);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(symname);
+ return 0;
}
int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name)
{
+ int res;
+
name[0] = '\0';
name[KSYM_NAME_LEN - 1] = '\0';
@@ -351,10 +390,16 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
kallsyms_expand_symbol(get_symbol_offset(pos),
name, KSYM_NAME_LEN);
modname[0] = '\0';
- return 0;
+ goto found;
}
/* See if it's in a module. */
- return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ res = lookup_module_symbol_attrs(addr, size, offset, modname, name);
+ if (res)
+ return res;
+
+found:
+ cleanup_symbol_name(name);
+ return 0;
}
/* Look up a kernel symbol and return it in a text buffer. */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1578973c5740..a1972eba2917 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -963,7 +963,8 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
+ WARN_ON_FUNCTION_MISMATCH(timer->function,
+ kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f6310f848f34..3a4beb9395c4 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,6 +9,7 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/tracehook.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
@@ -369,9 +370,7 @@ static void klp_send_signals(void)
* Send fake signal to all non-kthread tasks which are
* still not migrated.
*/
- spin_lock_irq(&task->sighand->siglock);
- signal_wake_up(task, 0);
- spin_unlock_irq(&task->sighand->siglock);
+ set_notify_signal(task);
}
}
read_unlock(&tasklist_lock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c6d0c1dc6253..ef28a0b9cf1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -705,7 +705,7 @@ static void print_lock_name(struct lock_class *class)
printk(KERN_CONT " (");
__print_lock_name(class);
- printk(KERN_CONT "){%s}-{%hd:%hd}", usage,
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
class->wait_type_outer ?: class->wait_type_inner,
class->wait_type_inner);
}
@@ -930,7 +930,8 @@ static bool assign_lock_key(struct lockdep_map *lock)
/* Debug-check: all keys must be persistent! */
debug_locks_off();
pr_err("INFO: trying to register non-static key.\n");
- pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
pr_err("turning off the locking correctness validator.\n");
dump_stack();
return false;
@@ -1392,7 +1393,7 @@ static int add_lock_to_list(struct lock_class *this,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index de49f9e1c11b..ecb8662e7a4e 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -99,16 +99,16 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_STACK_TRACE_ENTRIES 262144UL
#define STACK_TRACE_HASH_SIZE 8192
#else
-#define MAX_LOCKDEP_ENTRIES 32768UL
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
-#define STACK_TRACE_HASH_SIZE 16384
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
#endif
/*
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 4786dd271b45..b94f3831e963 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
+ int cnts;
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
/* When no more readers or writers, set the locked flag */
do {
- atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
- } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) != _QW_WAITING);
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
}
diff --git a/kernel/module.c b/kernel/module.c
index 30479355ab85..20fb004e7d8d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2146,6 +2146,8 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+static void cfi_cleanup(struct module *mod);
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -2187,6 +2189,9 @@ static void free_module(struct module *mod)
synchronize_rcu();
mutex_unlock(&module_mutex);
+ /* Clean up CFI for the module. */
+ cfi_cleanup(mod);
+
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
@@ -3866,6 +3871,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
+static void cfi_init(struct module *mod);
+
/*
* Allocate and load the module: note that size of section 0 is always
* zero, and we rely on this for optional sections.
@@ -3997,6 +4004,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
flush_module_icache(mod);
+ /* Setup CFI for the module. */
+ cfi_init(mod);
+
/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
@@ -4070,6 +4080,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
+ cfi_cleanup(mod);
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
@@ -4415,6 +4426,38 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
#endif /* CONFIG_LIVEPATCH */
#endif /* CONFIG_KALLSYMS */
+static void cfi_init(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ initcall_t *init;
+ exitcall_t *exit;
+
+ rcu_read_lock_sched();
+ mod->cfi_check = (cfi_check_fn)
+ find_kallsyms_symbol_value(mod, "__cfi_check");
+ init = (initcall_t *)
+ find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
+ exit = (exitcall_t *)
+ find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
+ rcu_read_unlock_sched();
+
+ /* Fix init/exit functions to point to the CFI jump table */
+ if (init)
+ mod->init = *init;
+ if (exit)
+ mod->exit = *exit;
+
+ cfi_module_add(mod, module_addr_min);
+#endif
+}
+
+static void cfi_cleanup(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+ cfi_module_remove(mod, module_addr_min);
+#endif
+}
+
/* Maximum number of characters written by module_flags() */
#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9af5a50d3489..b29c8aca7486 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work)
goto out;
/*
- * If the wakeup occured for an unknown reason, wait to prevent the
+ * If the wakeup occurred for an unknown reason, wait to prevent the
* system from trying to suspend and waking up in a tight loop.
*/
if (final_count == initial_count)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d63560e1cf87..1a221dcb3c01 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -329,7 +329,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
/**
* Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
+ * Memory bitmap is a structure consisting of many linked lists of
* objects. The main list's elements are of type struct zone_bitmap
* and each of them corresonds to one zone. For each zone bitmap
* object there is a list of objects of type struct bm_block that
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 72e33054a2e1..bea3cb8afa11 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -884,7 +884,7 @@ out_clean:
* enough_swap - Make sure we have enough swap to save the image.
*
* Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
+ * space available from the resume partition.
*/
static int enough_swap(unsigned int nr_pages)
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 3a8fd491758c..51615c909b2f 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -12,8 +12,6 @@
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-extern raw_spinlock_t logbuf_lock;
-
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -21,7 +19,6 @@ int vprintk_store(int facility, int level,
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
@@ -56,10 +53,8 @@ void defer_console_output(void);
#else
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
-
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 575a34b88936..421c35571797 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -262,7 +262,7 @@ static void __up_console_sem(unsigned long ip)
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
- * locked without the console sempahore held).
+ * locked without the console semaphore held).
*/
static int console_locked, console_suspended;
@@ -355,62 +355,50 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_RAW_SPINLOCK(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
+/* All 3 protected by @console_sem. */
/* the next printk record to write to the console */
static u64 console_seq;
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
+
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
#else
#define PREFIX_MAX 32
#endif
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+
+/* the maximum size of a formatted record (i.e. with prefix added per line) */
+#define CONSOLE_LOG_MAX 1024
+
+/* the maximum size allowed to be reserved for a record */
+#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -452,6 +440,31 @@ bool printk_percpu_data_ready(void)
return __printk_percpu_data_ready;
}
+/* Must be called under syslog_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
+{
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[0] = val;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+}
+
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
+{
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
+
+ do {
+ seq = raw_read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
+
+ return val;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -619,7 +632,7 @@ out:
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -719,27 +732,27 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
- logbuf_lock_irq();
- if (!prb_read_valid(prb, user->seq, r)) {
+ printk_safe_enter_irq();
+ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, user->seq, r));
+ prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
}
- if (r->info->seq != user->seq) {
+ if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
- user->seq = r->info->seq;
+ atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
@@ -748,8 +761,8 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
&r->text_buf[0], r->info->text_len,
&r->info->dev_info);
- user->seq = r->info->seq + 1;
- logbuf_unlock_irq();
+ atomic64_set(&user->seq, r->info->seq + 1);
+ printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -784,11 +797,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -796,16 +809,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->seq = clear_seq;
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->seq = prb_next_seq(prb);
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -820,15 +833,15 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
- if (prb_read_valid_info(prb, user->seq, &info, NULL)) {
+ printk_safe_enter_irq();
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
- if (info.seq != user->seq)
+ if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -861,9 +874,9 @@ static int devkmsg_open(struct inode *inode, struct file *file)
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- logbuf_lock_irq();
- user->seq = prb_first_valid_seq(prb);
- logbuf_unlock_irq();
+ printk_safe_enter_irq();
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
+ printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -955,6 +968,9 @@ void log_buf_vmcoreinfo_setup(void)
VMCOREINFO_SIZE(atomic_long_t);
VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -1421,6 +1437,50 @@ static size_t get_record_print_text_size(struct printk_info *info,
return ((prefix_len * line_count) + info->text_len + 1);
}
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is a best effort
+ * that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
+
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
@@ -1428,19 +1488,21 @@ static int syslog_print(char __user *buf, int size)
char *text;
int len = 0;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
while (size > 0) {
size_t n;
size_t skip;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
break;
}
if (r.info->seq != syslog_seq) {
@@ -1469,7 +1531,8 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
if (!n)
break;
@@ -1492,34 +1555,26 @@ static int syslog_print(char __user *buf, int size)
static int syslog_print_all(char __user *buf, int size, bool clear)
{
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
char *text;
int len = 0;
u64 seq;
bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
- len += get_record_print_text_size(&info, line_count, true, time);
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
- /* move first record forward until length fits into the buffer */
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
- if (len <= size)
- break;
- len -= get_record_print_text_size(&info, line_count, true, time);
- }
-
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -1532,20 +1587,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
break;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
if (len < 0)
break;
}
- if (clear)
- clear_seq = seq;
- logbuf_unlock_irq();
+ if (clear) {
+ raw_spin_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ raw_spin_unlock(&syslog_lock);
+ }
+ printk_safe_exit_irq();
kfree(text);
return len;
@@ -1553,9 +1611,23 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- logbuf_lock_irq();
- clear_seq = prb_next_seq(prb);
- logbuf_unlock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
+}
+
+/* Return a consistent copy of @syslog_seq. */
+static u64 read_syslog_seq_irq(void)
+{
+ u64 seq;
+
+ raw_spin_lock_irq(&syslog_lock);
+ seq = syslog_seq;
+ raw_spin_unlock_irq(&syslog_lock);
+
+ return seq;
}
int do_syslog(int type, char __user *buf, int len, int source)
@@ -1581,8 +1653,9 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
+
error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, syslog_seq, NULL));
+ prb_read_valid(prb, read_syslog_seq_irq(), NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1630,10 +1703,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
+ printk_safe_enter_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
return 0;
}
if (info.seq != syslog_seq) {
@@ -1661,7 +1736,8 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- logbuf_unlock_irq();
+ raw_spin_unlock(&syslog_lock);
+ printk_safe_exit_irq();
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -2104,12 +2180,6 @@ asmlinkage int vprintk_emit(int facility, int level,
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_func(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
@@ -2143,7 +2213,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_func(fmt, args);
+ r = vprintk(fmt, args);
va_end(args);
return r;
@@ -2152,8 +2222,7 @@ EXPORT_SYMBOL(printk);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
+#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
@@ -2262,7 +2331,7 @@ static int __init console_setup(char *str)
/*
* console="" or console=null have been suggested as a way to
* disable console output. Use ttynull that has been created
- * for exacly this purpose.
+ * for exactly this purpose.
*/
if (str[0] == 0 || strcmp(str, "null") == 0) {
__add_preferred_console("ttynull", 0, NULL, NULL, true);
@@ -2471,7 +2540,7 @@ static inline int can_use_console(void)
void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
+ static char text[CONSOLE_LOG_MAX];
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
@@ -2518,7 +2587,6 @@ again:
size_t len;
printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2562,7 +2630,6 @@ skip:
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
console_seq++;
- raw_spin_unlock(&logbuf_lock);
/*
* While actively printing out messages, if another printk()
@@ -2589,8 +2656,6 @@ skip:
console_locked = 0;
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
/*
@@ -2599,9 +2664,7 @@ skip:
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- raw_spin_lock(&logbuf_lock);
retry = prb_read_valid(prb, console_seq, NULL);
- raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
@@ -2668,9 +2731,9 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
unsigned long flags;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
console_seq = prb_first_valid_seq(prb);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
}
console_unlock();
}
@@ -2898,9 +2961,7 @@ void register_console(struct console *newcon)
/*
* console_unlock(); will print out the buffered messages
* for us.
- */
- logbuf_lock_irqsave(flags);
- /*
+ *
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
* the already-registered consoles.
@@ -2911,8 +2972,11 @@ void register_console(struct console *newcon)
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+
+ /* Get a consistent copy of @syslog_seq. */
+ raw_spin_lock_irqsave(&syslog_lock, flags);
console_seq = syslog_seq;
- logbuf_unlock_irqrestore(flags);
+ raw_spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();
console_sysfs_notify();
@@ -3042,7 +3106,7 @@ void __init console_init(void)
*
* To mitigate this problem somewhat, only unregister consoles whose memory
* intersects with the init section. Note that all other boot consoles will
- * get unregistred when the real preferred console is registered.
+ * get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
@@ -3276,7 +3340,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3293,26 +3356,15 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (reason > max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
-
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->next_seq = prb_next_seq(prb);
- logbuf_unlock_irqrestore(flags);
-
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
}
rcu_read_unlock();
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3326,30 +3378,31 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
unsigned int line_count;
struct printk_record r;
+ unsigned long flags;
size_t l = 0;
bool ret = false;
- prb_rec_init_rd(&r, &info, line, size);
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
- if (!dumper->active)
- goto out;
+ printk_safe_enter_irqsave(flags);
+ prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
if (line) {
- if (!prb_read_valid(prb, dumper->cur_seq, &r))
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
goto out;
l = record_print_text(&r, syslog, printk_time);
} else {
- if (!prb_read_valid_info(prb, dumper->cur_seq,
+ if (!prb_read_valid_info(prb, iter->cur_seq,
&info, &line_count)) {
goto out;
}
@@ -3358,52 +3411,23 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
}
- dumper->cur_seq = r.info->seq + 1;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
+ printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
* with as many of the *youngest* kmsg records that fit into it.
@@ -3416,115 +3440,93 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
+ char *buf, size_t size, size_t *len_out)
{
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
unsigned long flags;
u64 seq;
u64 next_seq;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
bool time = printk_time;
- prb_rec_init_rd(&r, &info, buf, size);
-
- if (!dumper->active || !buf || !size)
+ if (!buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
- if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) {
- if (info.seq != dumper->cur_seq) {
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
+
+ printk_safe_enter_irqsave(flags);
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
- dumper->cur_seq = info.seq;
+ iter->cur_seq = info.seq;
}
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq) {
+ printk_safe_exit_irqrestore(flags);
goto out;
}
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- while (prb_read_valid_info(prb, seq, &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
- break;
- l += get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- while (l >= size && prb_read_valid_info(prb, seq,
- &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
- break;
- l -= get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
- }
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
+ */
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
+ size - 1, syslog, time);
- /* last message in next interation */
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump block of
+ * older records stored right before this one.
+ */
next_seq = seq;
- /* actually read text into the buffer now */
- l = 0;
- while (prb_read_valid(prb, seq, &r)) {
- if (r.info->seq >= dumper->next_seq)
- break;
+ prb_rec_init_rd(&r, &info, buf, size);
- l += record_print_text(&r, syslog, time);
+ len = 0;
+ prb_for_each_record(seq, prb, seq, &r) {
+ if (r.info->seq >= iter->next_seq)
+ break;
- /* adjust record to store to remaining buffer space */
- prb_rec_init_rd(&r, &info, buf + l, size - l);
+ len += record_print_text(&r, syslog, time);
- seq = r.info->seq + 1;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
+ iter->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
- *
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
- */
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
-{
- dumper->cur_seq = clear_seq;
- dumper->next_seq = prb_next_seq(prb);
-}
-
-/**
* kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
unsigned long flags;
- logbuf_lock_irqsave(flags);
- kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_enter_irqsave(flags);
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
+ printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 2e9e3ed7d63e..7a1414622051 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -16,7 +16,7 @@
#include "internal.h"
/*
- * printk() could not take logbuf_lock in NMI context. Instead,
+ * In NMI and safe mode, printk() avoids taking locks. Instead,
* it uses an alternative implementation that temporary stores
* the strings into a per-CPU buffer. The content of the buffer
* is later flushed into the main ring buffer via IRQ work.
@@ -267,17 +267,9 @@ void printk_safe_flush(void)
void printk_safe_flush_on_panic(void)
{
/*
- * Make sure that we could access the main ring buffer.
+ * Make sure that we could access the safe buffers.
* Do not risk a double release when more CPUs are up.
*/
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
if (raw_spin_is_locked(&safe_read_lock)) {
if (num_online_cpus() > 1)
return;
@@ -319,9 +311,7 @@ void noinstr printk_nmi_exit(void)
* reordering.
*
* It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
+ * will store the messages into the main logbuf directly.
*/
void printk_nmi_direct_enter(void)
{
@@ -367,7 +357,7 @@ void __printk_safe_exit(void)
this_cpu_dec(printk_context);
}
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
/* Allow to pass printk() to kdb but avoid a recursion. */
@@ -376,20 +366,21 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
#endif
/*
- * Try to use the main logbuf even in NMI. But avoid calling console
+ * Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
+ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
+ unsigned long flags;
int len;
+ printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- raw_spin_unlock(&logbuf_lock);
+ printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
+ /* Use extra buffer in NMI. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
@@ -420,3 +411,4 @@ void __init printk_safe_init(void)
/* Flush pending messages that did not have scheduled IRQ works. */
printk_safe_flush();
}
+EXPORT_SYMBOL(vprintk);
diff --git a/kernel/profile.c b/kernel/profile.c
index 6f69a4195d56..c2ebddb5e974 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -430,7 +430,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..b2890f6e6d6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6384,6 +6384,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
return __sched_setscheduler(p, attr, false, true);
}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 50cbad89f7fa..6ee9c9bbe505 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -114,19 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
- if (sugov_update_next_freq(sg_policy, time, next_freq))
- cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
-}
-
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
-{
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
-
if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
@@ -366,16 +355,19 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
sg_policy->cached_raw_freq = cached_freq;
}
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ return;
+
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
raw_spin_unlock(&sg_policy->update_lock);
}
}
@@ -454,12 +446,15 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ goto unlock;
+
if (sg_policy->policy->fast_switch_enabled)
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
else
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
}
-
+unlock:
raw_spin_unlock(&sg_policy->update_lock);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5f611658eeab..2c36a5fad589 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- pc = preempt_count() - offset;
+ pc = irq_count() - offset;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struct *prev)
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- unsigned int pc = preempt_count() - offset;
+ unsigned int pc = irq_count() - offset;
if (pc & HARDIRQ_OFFSET) {
vtime_account_hardirq(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d60fc2c9987..1e63db4dbd9a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -817,7 +817,7 @@ static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
}
/**
- * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
* @sfilter: The seccomp filter
*
* Returns 0 if successful or -errno if error occurred.
diff --git a/kernel/signal.c b/kernel/signal.c
index f2718350bf4b..e528f96eebc8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -43,7 +43,6 @@
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
-#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
@@ -181,8 +180,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (!recalc_sigpending_tsk(current) && !freezing(current) &&
- !klp_patch_pending(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bad14ca2b520..4992853ef53d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -13,6 +13,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/local_lock.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
@@ -25,6 +26,7 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/wait_bit.h>
#include <asm/softirq_stack.h>
@@ -102,20 +104,204 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
#endif
/*
- * preempt_count and SOFTIRQ_OFFSET usage:
- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
- * softirq processing.
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirqs_disabled_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ * processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
+ *
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and
+ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a
+ * softirq disabled section to be preempted.
+ *
+ * The per task counter is used for softirq_count(), in_softirq() and
+ * in_serving_softirqs() because these counts are only valid when the task
+ * holding softirq_ctrl::lock is running.
+ *
+ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that
+ * the task which is in a softirq disabled section is preempted or blocks.
+ */
+struct softirq_ctrl {
+ local_lock_t lock;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
+};
+
+/**
+ * local_bh_blocked() - Check for idle whether BH processing is blocked
+ *
+ * Returns false if the per CPU softirq::cnt is 0 otherwise true.
+ *
+ * This is invoked from the idle task to guard against false positive
+ * softirq pending warnings, which would happen when the task which holds
+ * softirq_ctrl::lock was the only running task on the CPU and blocks on
+ * some other lock.
+ */
+bool local_bh_blocked(void)
+{
+ return __this_cpu_read(softirq_ctrl.cnt) != 0;
+}
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long flags;
+ int newcnt;
+
+ WARN_ON_ONCE(in_hardirq());
+
+ /* First entry of a task into a BH disabled section? */
+ if (!current->softirq_disable_cnt) {
+ if (preemptible()) {
+ local_lock(&softirq_ctrl.lock);
+ /* Required to meet the RCU bottomhalf requirements. */
+ rcu_read_lock();
+ } else {
+ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+ }
+ }
+
+ /*
+ * Track the per CPU softirq disabled state. On RT this is per CPU
+ * state to allow preemption of bottom half disabled sections.
+ */
+ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void __local_bh_enable(unsigned int cnt, bool unlock)
+{
+ unsigned long flags;
+ int newcnt;
+
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_on(_RET_IP_);
+ raw_local_irq_restore(flags);
+ }
+
+ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
+
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ bool preempt_on = preemptible();
+ unsigned long flags;
+ u32 pending;
+ int curcnt;
+
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
+
+ local_irq_save(flags);
+ curcnt = __this_cpu_read(softirq_ctrl.cnt);
+
+ /*
+ * If this is not reenabling soft interrupts, no point in trying to
+ * run pending ones.
+ */
+ if (curcnt != cnt)
+ goto out;
+
+ pending = local_softirq_pending();
+ if (!pending || ksoftirqd_running(pending))
+ goto out;
+
+ /*
+ * If this was called from non preemptible context, wake up the
+ * softirq daemon.
+ */
+ if (!preempt_on) {
+ wakeup_softirqd();
+ goto out;
+ }
+
+ /*
+ * Adjust softirq count to SOFTIRQ_OFFSET which makes
+ * in_serving_softirq() become true.
+ */
+ cnt = SOFTIRQ_OFFSET;
+ __local_bh_enable(cnt, false);
+ __do_softirq();
+
+out:
+ __local_bh_enable(cnt, preempt_on);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+/*
+ * Invoked from ksoftirqd_run() outside of the interrupt disabled section
+ * to acquire the per CPU local lock for reentrancy protection.
+ */
+static inline void ksoftirqd_run_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ local_irq_disable();
+}
+
+/* Counterpart to ksoftirqd_run_begin() */
+static inline void ksoftirqd_run_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET, true);
+ WARN_ON_ONCE(in_interrupt());
+ local_irq_enable();
+}
+
+static inline void softirq_handle_begin(void) { }
+static inline void softirq_handle_end(void) { }
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+static inline void invoke_softirq(void)
+{
+ if (should_wake_ksoftirqd())
+ wakeup_softirqd();
+}
+
+#else /* CONFIG_PREEMPT_RT */
-#ifdef CONFIG_TRACE_IRQFLAGS
/*
- * This is for softirq.c-internal use, where hardirqs are disabled
+ * This one is for softirq.c-internal use, where hardirqs are disabled
* legitimately:
*/
+#ifdef CONFIG_TRACE_IRQFLAGS
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -206,6 +392,32 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+static inline void softirq_handle_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+}
+
+static inline void softirq_handle_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+}
+
+static inline void ksoftirqd_run_begin(void)
+{
+ local_irq_disable();
+}
+
+static inline void ksoftirqd_run_end(void)
+{
+ local_irq_enable();
+}
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return true;
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -250,6 +462,8 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
}
+#endif /* !CONFIG_PREEMPT_RT */
+
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
* but break the loop if need_resched() is set or after 2 ms.
@@ -318,7 +532,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending = local_softirq_pending();
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ softirq_handle_begin();
in_hardirq = lockdep_softirq_start();
account_softirq_enter(current);
@@ -354,8 +568,10 @@ restart:
pending >>= softirq_bit;
}
- if (__this_cpu_read(ksoftirqd) == current)
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ __this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
+
local_irq_disable();
pending = local_softirq_pending();
@@ -369,8 +585,7 @@ restart:
account_softirq_exit(current);
lockdep_softirq_end(in_hardirq);
- __local_bh_enable(SOFTIRQ_OFFSET);
- WARN_ON_ONCE(in_interrupt());
+ softirq_handle_end();
current_restore_flags(old_flags, PF_MEMALLOC);
}
@@ -465,7 +680,7 @@ inline void raise_softirq_irqoff(unsigned int nr)
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
- if (!in_interrupt())
+ if (!in_interrupt() && should_wake_ksoftirqd())
wakeup_softirqd();
}
@@ -531,6 +746,20 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
+static bool tasklet_clear_sched(struct tasklet_struct *t)
+{
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) {
+ wake_up_var(&t->state);
+ return true;
+ }
+
+ WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n",
+ t->use_callback ? "callback" : "func",
+ t->use_callback ? (void *)t->callback : (void *)t->func);
+
+ return false;
+}
+
static void tasklet_action_common(struct softirq_action *a,
struct tasklet_head *tl_head,
unsigned int softirq_nr)
@@ -550,13 +779,12 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- if (t->use_callback)
- t->callback(t);
- else
- t->func(t->data);
+ if (tasklet_clear_sched(t)) {
+ if (t->use_callback)
+ t->callback(t);
+ else
+ t->func(t->data);
+ }
tasklet_unlock(t);
continue;
}
@@ -606,21 +834,62 @@ void tasklet_init(struct tasklet_struct *t,
}
EXPORT_SYMBOL(tasklet_init);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * Prevent a live lock when current preempted soft
+ * interrupt processing or prevents ksoftirqd from
+ * running. If the tasklet runs on a different CPU
+ * then this has no effect other than doing the BH
+ * disable/enable dance for nothing.
+ */
+ local_bh_disable();
+ local_bh_enable();
+ } else {
+ cpu_relax();
+ }
+ }
+}
+EXPORT_SYMBOL(tasklet_unlock_spin_wait);
+#endif
+
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
pr_notice("Attempt to kill tasklet from interrupt\n");
- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- do {
- yield();
- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
- }
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state));
+
tasklet_unlock_wait(t);
- clear_bit(TASKLET_STATE_SCHED, &t->state);
+ tasklet_clear_sched(t);
}
EXPORT_SYMBOL(tasklet_kill);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+void tasklet_unlock(struct tasklet_struct *t)
+{
+ smp_mb__before_atomic();
+ clear_bit(TASKLET_STATE_RUN, &t->state);
+ smp_mb__after_atomic();
+ wake_up_var(&t->state);
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock);
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock_wait);
+#endif
+
void __init softirq_init(void)
{
int cpu;
@@ -643,53 +912,21 @@ static int ksoftirqd_should_run(unsigned int cpu)
static void run_ksoftirqd(unsigned int cpu)
{
- local_irq_disable();
+ ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
- local_irq_enable();
+ ksoftirqd_run_end();
cond_resched();
return;
}
- local_irq_enable();
+ ksoftirqd_run_end();
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
- struct tasklet_struct **i;
-
- BUG_ON(cpu_online(cpu));
- BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
- if (!test_bit(TASKLET_STATE_SCHED, &t->state))
- return;
-
- /* CPU is dead, so no lock needed. */
- for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
- if (*i == t) {
- *i = t->next;
- /* If this was the tail element, move the tail ptr */
- if (*i == NULL)
- per_cpu(tasklet_vec, cpu).tail = i;
- return;
- }
- }
- BUG();
-}
-
static int takeover_tasklets(unsigned int cpu)
{
/* CPU is dead, so no lock needed. */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2e2e3f378d97..3d62c9599dc0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -119,6 +119,12 @@
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef PAC_SET_ENABLED_KEYS
+# define PAC_SET_ENABLED_KEYS(a, b, c) (-EINVAL)
+#endif
+#ifndef PAC_GET_ENABLED_KEYS
+# define PAC_GET_ENABLED_KEYS(a) (-EINVAL)
+#endif
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
#endif
@@ -2497,6 +2503,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_PAC_SET_ENABLED_KEYS:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
+ break;
+ case PR_PAC_GET_ENABLED_KEYS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = PAC_GET_ENABLED_KEYS(me);
+ break;
case PR_SET_TAGGED_ADDR_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4d94e2b5499d..bea9d08b1698 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -2,13 +2,13 @@
/*
* Alarmtimer interface
*
- * This interface provides a timer which is similarto hrtimers,
+ * This interface provides a timer which is similar to hrtimers,
* but triggers a RTC alarm if the box is suspend.
*
* This interface is influenced by the Android RTC Alarm timer
* interface.
*
- * Copyright (C) 2010 IBM Corperation
+ * Copyright (C) 2010 IBM Corporation
*
* Author: John Stultz <john.stultz@linaro.org>
*/
@@ -811,7 +811,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
/**
* alarm_timer_nsleep - alarmtimer nanosleep
* @which_clock: clockid
- * @flags: determins abstime or relative
+ * @flags: determines abstime or relative
* @tsreq: requested sleep time (abs or rel)
*
* Handles clock_nanosleep calls against _ALARM clockids
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cce484a2cc7c..1d1a61371b5a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -38,7 +38,7 @@
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
* multiplied with the calculated mult factor. Larger ranges may
- * reduce the conversion accuracy by chosing smaller mult and shift
+ * reduce the conversion accuracy by choosing smaller mult and shift
* factors.
*/
void
@@ -518,7 +518,7 @@ static void clocksource_suspend_select(bool fallback)
* the suspend time when resuming system.
*
* This function is called late in the suspend process from timekeeping_suspend(),
- * that means processes are freezed, non-boot cpus and interrupts are disabled
+ * that means processes are frozen, non-boot cpus and interrupts are disabled
* now. It is therefore possible to start the suspend timer without taking the
* clocksource mutex.
*/
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..4a66725b1d4a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -683,7 +683,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* T1 is removed, so this code is called and would reprogram
* the hardware to 5s from now. Any hrtimer_start after that
* will not reprogram the hardware due to hang_detected being
- * set. So we'd effectivly block all timers until the T2 event
+ * set. So we'd effectively block all timers until the T2 event
* fires.
*/
if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
@@ -1019,7 +1019,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
* cpu_base->next_timer. This happens when we remove the first
* timer on a remote cpu. No harm as we never dereference
* cpu_base->next_timer. So the worst thing what can happen is
- * an superflous call to hrtimer_force_reprogram() on the
+ * an superfluous call to hrtimer_force_reprogram() on the
* remote cpu later on if the same timer gets enqueued again.
*/
if (reprogram && timer == cpu_base->next_timer)
@@ -1212,7 +1212,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* The counterpart to hrtimer_cancel_wait_running().
*
* If there is a waiter for cpu_base->expiry_lock, then it was waiting for
- * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
@@ -1398,7 +1398,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
int base;
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context for latency reasons and because the callbacks
* can invoke functions which might sleep on RT, e.g. spin_lock().
@@ -1430,7 +1430,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: The modes which are relevant for intitialization:
+ * @mode: The modes which are relevant for initialization:
* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
* HRTIMER_MODE_REL_SOFT
*
@@ -1487,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* insufficient for that.
*
* The sequence numbers are required because otherwise we could still observe
- * a false negative if the read side got smeared over multiple consequtive
+ * a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
@@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
* minimizing wakeups, not running timers at the
* earliest interrupt after their soft expiration.
* This allows us to avoid using a Priority Search
- * Tree, which can answer a stabbing querry for
+ * Tree, which can answer a stabbing query for
* overlapping intervals and instead use the simple
* BST we already have.
* We don't add extra wakeups by delaying timers that
@@ -1822,7 +1822,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode)
{
/*
- * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context either for latency reasons or because the
* hrtimer callback takes regular spinlocks or invokes other
@@ -1835,7 +1835,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* the same CPU. That causes a latency spike due to the wakeup of
* a gazillion threads.
*
- * OTOH, priviledged real-time user space applications rely on the
+ * OTOH, privileged real-time user space applications rely on the
* low latency of hard interrupt wakeups. If the current task is in
* a real-time scheduling class, mark the mode for hard interrupt
* expiry.
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a5cffe2a1770..a492e4da69ba 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -44,7 +44,7 @@ static u64 jiffies_read(struct clocksource *cs)
* the timer interrupt frequency HZ and it suffers
* inaccuracies caused by missed or lost timer
* interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
+ * interrupt hardware to accurately tick at the
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5247afd7f345..406dccb79c2b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -544,7 +544,7 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec,
struct timespec64 *to_set,
const struct timespec64 *now)
{
- /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */
+ /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */
const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
struct timespec64 delay = {.tv_sec = -1,
.tv_nsec = set_offset_nsec};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 9abe15255bc4..3bb96a8b49c9 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -279,7 +279,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
* @tsk: Task for which cputime needs to be started
* @samples: Storage for time samples
*
- * The thread group cputime accouting is avoided when there are no posix
+ * The thread group cputime accounting is avoided when there are no posix
* CPU timers armed. Before starting a timer it's required to check whether
* the time accounting is active. If not, a full update of the atomic
* accounting store needs to be done and the accounting enabled.
@@ -390,7 +390,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
/*
* If posix timer expiry is handled in task work context then
* timer::it_lock can be taken without disabling interrupts as all
- * other locking happens in task context. This requires a seperate
+ * other locking happens in task context. This requires a separate
* lock class key otherwise regular posix timer expiry would record
* the lock class being taken in interrupt context and generate a
* false positive warning.
@@ -1216,7 +1216,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
check_process_timers(tsk, &firing);
/*
- * The above timer checks have updated the exipry cache and
+ * The above timer checks have updated the expiry cache and
* because nothing can have queued or modified timers after
* sighand lock was taken above it is guaranteed to be
* consistent. So the next timer interrupt fastpath check
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bf540f5a4115..dd5697d7347b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1191,8 +1191,8 @@ SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
err = do_clock_adjtime(which_clock, &ktx);
- if (err >= 0)
- err = put_old_timex32(utp, &ktx);
+ if (err >= 0 && put_old_timex32(utp, &ktx))
+ return -EFAULT;
return err;
}
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 77c63005dc4e..13b11eb62685 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -21,7 +21,6 @@
#define DEBUGFS_FILENAME "udelay_test"
static DEFINE_MUTEX(udelay_test_lock);
-static struct dentry *udelay_test_debugfs_file;
static int udelay_test_usecs;
static int udelay_test_iterations = DEFAULT_ITERATIONS;
@@ -138,8 +137,8 @@ static const struct file_operations udelay_test_debugfs_ops = {
static int __init udelay_test_init(void)
{
mutex_lock(&udelay_test_lock);
- udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
- S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ debugfs_create_file(DEBUGFS_FILENAME, S_IRUSR, NULL, NULL,
+ &udelay_test_debugfs_ops);
mutex_unlock(&udelay_test_lock);
return 0;
@@ -150,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(udelay_test_debugfs_file);
+ debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index b5a65e212df2..797eb93103ad 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -53,7 +53,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* reasons.
*
* Each caller tries to arm the hrtimer on its own CPU, but if the
- * hrtimer callbback function is currently running, then
+ * hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
*
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5a23829372c7..a44055228796 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -107,6 +107,19 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(dev);
+
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return;
+
+ /*
+ * If the system already runs in oneshot mode, switch the newly
+ * registered broadcast device to oneshot mode explicitly.
+ */
+ if (tick_broadcast_oneshot_active()) {
+ tick_broadcast_switch_to_oneshot();
+ return;
+ }
+
/*
* Inform all cpus about this. We might be in a situation
* where we did not switch to oneshot mode because the per cpu
@@ -115,8 +128,7 @@ void tick_install_broadcast_device(struct clock_event_device *dev)
* notification the systems stays stuck in periodic mode
* forever.
*/
- if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
- tick_clock_notify();
+ tick_clock_notify();
}
/*
@@ -157,7 +169,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
}
/*
- * Check, if the device is disfunctional and a place holder, which
+ * Check, if the device is dysfunctional and a placeholder, which
* needs to be handled by the broadcast device.
*/
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
@@ -391,7 +403,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
* - the broadcast device exists
* - the broadcast device is not a hrtimer based one
* - the broadcast device is in periodic mode to
- * avoid a hickup during switch to oneshot mode
+ * avoid a hiccup during switch to oneshot mode
*/
if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 9d3a22510bab..e15bc0ef1912 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -348,12 +348,7 @@ void tick_check_new_device(struct clock_event_device *newdev)
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
- /* cpu local device ? */
- if (!tick_check_percpu(curdev, newdev, cpu))
- goto out_bc;
-
- /* Preference decision */
- if (!tick_check_preferred(curdev, newdev))
+ if (!tick_check_replacement(curdev, newdev))
goto out_bc;
if (!try_module_get(newdev->owner))
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f9745d47425a..475ecceda768 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force)
}
/**
- * tick_resume_onshot - resume oneshot mode
+ * tick_resume_oneshot - resume oneshot mode
*/
void tick_resume_oneshot(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e10a4af88737..828b091501ca 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,7 +751,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* Aside of that check whether the local timer softirq is
* pending. If so its a bad idea to call get_next_timer_interrupt()
* because there is an already expired timer, so it will request
- * immeditate expiry, which rearms the hardware timer with a
+ * immediate expiry, which rearms the hardware timer with a
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
@@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(local_softirq_pending())) {
static int ratelimit;
- if (ratelimit < 10 &&
+ if (ratelimit < 10 && !local_bh_blocked() &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
(unsigned int) local_softirq_pending());
@@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
*
- * Called from power state control code with interrupts disabled
+ * Called from power state control code with interrupts disabled.
+ *
+ * The return value of this function and/or the value returned by it through the
+ * @delta_next pointer can be negative which must be taken into account by its
+ * callers.
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 4fb06527cf64..d952ae393423 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -29,7 +29,7 @@ enum tick_nohz_mode {
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
- * it is resetted during irq handling phases.
+ * it is reset during irq handling phases.
* @do_timer_lst: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
* @last_tick: Store the last tick expiry time when the tick
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3985b2b32d08..29923b20e0e4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -571,7 +571,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
- * resolution values don't fall on second boundries. I.e. the line:
+ * resolution values don't fall on second boundaries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
* Note that due to the small error in the multiplier here, this
* rounding is incorrect for sufficiently large values of tv_nsec, but
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 85b98e727306..e6285288d765 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -76,7 +76,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
return ns;
}
-u64 timecounter_cyc2time(struct timecounter *tc,
+u64 timecounter_cyc2time(const struct timecounter *tc,
u64 cycle_tstamp)
{
u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..81fe2a33b80c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,14 +596,14 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
* careful cache layout of the timekeeper because the sequence count and
* struct tk_read_base would then need two cache lines instead of one.
*
- * Access to the time keeper clock source is disabled accross the innermost
+ * Access to the time keeper clock source is disabled across the innermost
* steps of suspend/resume. The accessors still work, but the timestamps
* are frozen until time keeping is resumed which happens very early.
*
* For regular suspend/resume there is no observable difference vs. sched
* clock, but it might affect some of the nasty low level debug printks.
*
- * OTOH, access to sched clock is not guaranteed accross suspend/resume on
+ * OTOH, access to sched clock is not guaranteed across suspend/resume on
* all systems either so it depends on the hardware in use.
*
* If that turns out to be a real problem then this could be mitigated by
@@ -899,7 +899,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
/**
- * ktime_mono_to_any() - convert mononotic time to any other time
+ * ktime_mono_to_any() - convert monotonic time to any other time
* @tmono: time to convert.
* @offs: which offset to use
*/
@@ -1427,35 +1427,45 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
static int change_clocksource(void *data)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *new, *old;
+ struct clocksource *new, *old = NULL;
unsigned long flags;
+ bool change = false;
new = (struct clocksource *) data;
- raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&tk_core.seq);
-
- timekeeping_forward_now(tk);
/*
* If the cs is in module, get a module reference. Succeeds
* for built-in code (owner == NULL) as well.
*/
if (try_module_get(new->owner)) {
- if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr_mono.clock;
- tk_setup_internals(tk, new);
- if (old->disable)
- old->disable(old);
- module_put(old->owner);
- } else {
+ if (!new->enable || new->enable(new) == 0)
+ change = true;
+ else
module_put(new->owner);
- }
}
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ if (change) {
+ old = tk->tkr_mono.clock;
+ tk_setup_internals(tk, new);
+ }
+
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (old) {
+ if (old->disable)
+ old->disable(old);
+
+ module_put(old->owner);
+ }
+
return 0;
}
@@ -1948,7 +1958,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us:
* xtime_nsec_2 = xtime_nsec_1 - offset
- * Which simplfies to:
+ * Which simplifies to:
* xtime_nsec -= offset
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
@@ -2336,7 +2346,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
/*
* Validate if a timespec/timeval used to inject a time
- * offset is valid. Offsets can be postive or negative, so
+ * offset is valid. Offsets can be positive or negative, so
* we don't check tv_sec. The value of the timeval/timespec
* is the sum of its fields,but *NOTE*:
* The field tv_usec/tv_nsec must always be non-negative and
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f475f1a027c8..d111adf4a0cb 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base)
/*
* No need to forward if we are close enough below jiffies.
* Also while executing timers, base->clk is 1 offset ahead
- * of jiffies to avoid endless requeuing to current jffies.
+ * of jiffies to avoid endless requeuing to current jiffies.
*/
if ((long)(jnow - base->clk) < 1)
return;
@@ -1271,7 +1271,7 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
* The counterpart to del_timer_wait_running().
*
* If there is a waiter for base->expiry_lock, then it was waiting for the
- * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 88e6b8ed6ca5..f0d5062d9cbc 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -108,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk)
/*
* If the current clocksource is not VDSO capable, then spare the
- * update of the high reolution parts.
+ * update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_data(vdata, tk);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c777627212f..915fe8790f04 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3545,7 +3545,11 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
{
char *tmp;
- if (iter->fmt == static_fmt_buf)
+ /*
+ * iter->tr is NULL when used with tp_printk, which makes
+ * this get called where it is not safe to call krealloc().
+ */
+ if (!iter->tr || iter->fmt == static_fmt_buf)
return NULL;
tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
@@ -3566,7 +3570,7 @@ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
if (WARN_ON_ONCE(!fmt))
return fmt;
- if (iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
+ if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
return fmt;
p = fmt;
@@ -4828,7 +4832,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
- if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -9692,7 +9696,7 @@ void __init early_trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
- kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
if (MEM_FAIL(!tracepoint_print_iter,
"Failed to allocate trace iterator\n"))
tracepoint_printk = 0;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dc971a68dda4..e57cc0870892 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -63,8 +63,10 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (event[0] == '\0')
- return -EINVAL;
+ if (event[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..9a4b980d695b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
+ ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
}
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+ struct user_namespace *map_ns,
+ struct uid_gid_map *new_map)
+{
+ int idx;
+ const struct user_namespace *file_ns = file->f_cred->user_ns;
+ struct uid_gid_extent *extent0 = NULL;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent0 = &new_map->extent[idx];
+ else
+ extent0 = &new_map->forward[idx];
+ if (extent0->lower_first == 0)
+ break;
+
+ extent0 = NULL;
+ }
+
+ if (!extent0)
+ return true;
+
+ if (map_ns == file_ns) {
+ /* The process unshared its ns and is writing to its own
+ * /proc/self/uid_map. User already has full capabilites in
+ * the new namespace. Verify that the parent had CAP_SETFCAP
+ * when it unshared.
+ * */
+ if (!file_ns->parent_could_setfcap)
+ return false;
+ } else {
+ /* Process p1 is writing to uid_map of p2, who is in a child
+ * user namespace to p1's. Verify that the opener of the map
+ * file has CAP_SETFCAP against the parent of the new map
+ * namespace */
+ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+ return false;
+ }
+
+ return true;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map *parent_map)
{
struct seq_file *seq = file->private_data;
- struct user_namespace *ns = seq->private;
+ struct user_namespace *map_ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/*
* Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
goto out;
/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
goto out;
ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
struct uid_gid_map *new_map)
{
const struct cred *cred = file->f_cred;
+
+ if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+ return false;
+
/* Don't allow mappings that would allow anything that wouldn't
* be allowed without the establishment of unprivileged mappings.
*/
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 71109065bd8e..107bc38b1945 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void)
* update as well, the only side effect might be a cycle delay for
* the softlockup check.
*/
- for_each_cpu(cpu, &watchdog_allowed_mask)
+ for_each_cpu(cpu, &watchdog_allowed_mask) {
per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
- wq_watchdog_touch(-1);
+ wq_watchdog_touch(cpu);
+ }
}
void touch_softlockup_watchdog_sync(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0d150da252e8..b19d759e55a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
lockdep_assert_irqs_disabled();
- debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
@@ -1494,6 +1493,7 @@ retry:
worklist = &pwq->delayed_works;
}
+ debug_work_activate(work);
insert_work(pwq, work, worklist, work_flags);
out:
@@ -1630,7 +1630,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
+ WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
continue;
/* get the latest of pool and touched timestamps */
+ if (pool->cpu >= 0)
+ touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
+ else
+ touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
- touched = READ_ONCE(wq_watchdog_touched);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
- if (pool->cpu >= 0) {
- unsigned long cpu_touched =
- READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
- pool->cpu));
- if (time_after(cpu_touched, ts))
- ts = cpu_touched;
- }
-
/* did we stall? */
if (time_after(jiffies, ts + thresh)) {
lockup_detected = true;
@@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
- else
- wq_watchdog_touched = jiffies;
+
+ wq_watchdog_touched = jiffies;
}
static void wq_watchdog_set_thresh(unsigned long thresh)