aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt2
-rw-r--r--kernel/audit.c15
-rw-r--r--kernel/bpf/cgroup.c4
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/compat.c24
-rw-r--r--kernel/context_tracking.c6
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/debug/debug_core.c34
-rw-r--r--kernel/debug/debug_core.h3
-rw-r--r--kernel/debug/kdb/kdb_bt.c116
-rw-r--r--kernel/debug/kdb/kdb_io.c231
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/dma/Kconfig12
-rw-r--r--kernel/dma/coherent.c16
-rw-r--r--kernel/dma/contiguous.c9
-rw-r--r--kernel/dma/debug.c39
-rw-r--r--kernel/dma/direct.c177
-rw-r--r--kernel/dma/mapping.c45
-rw-r--r--kernel/dma/remap.c55
-rw-r--r--kernel/dma/swiotlb.c2
-rw-r--r--kernel/events/core.c328
-rw-r--r--kernel/events/internal.h1
-rw-r--r--kernel/events/ring_buffer.c60
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/exit.c32
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/futex.c326
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/livepatch/patch.c3
-rw-r--r--kernel/locking/lockdep.c7
-rw-r--r--kernel/locking/locktorture.c9
-rw-r--r--kernel/locking/mutex.c8
-rw-r--r--kernel/locking/rtmutex.c6
-rw-r--r--kernel/locking/rwsem.c10
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/panic.c11
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/printk/printk.c10
-rw-r--r--kernel/rcu/rcu.h4
-rw-r--r--kernel/rcu/rcu_segcblist.c6
-rw-r--r--kernel/rcu/rcuperf.c16
-rw-r--r--kernel/rcu/rcutorture.c44
-rw-r--r--kernel/rcu/tree.c73
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/sched/core.c20
-rw-r--r--kernel/sched/cputime.c288
-rw-r--r--kernel/sched/deadline.c12
-rw-r--r--kernel/sched/fair.c1427
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c34
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h25
-rw-r--r--kernel/sched/stop_task.c9
-rw-r--r--kernel/sched/topology.c9
-rw-r--r--kernel/sched/wait.c37
-rw-r--r--kernel/seccomp.c28
-rw-r--r--kernel/stacktrace.c4
-rw-r--r--kernel/stop_machine.c1
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sys_ni.c23
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/sysctl_binary.c1305
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/itimer.c189
-rw-r--r--kernel/time/tick-sched.c13
-rw-r--r--kernel/time/time.c32
-rw-r--r--kernel/trace/Kconfig27
-rw-r--r--kernel/trace/fgraph.c11
-rw-r--r--kernel/trace/ftrace.c613
-rw-r--r--kernel/trace/preemptirq_delay_test.c144
-rw-r--r--kernel/trace/ring_buffer_benchmark.c4
-rw-r--r--kernel/trace/trace.c214
-rw-r--r--kernel/trace/trace.h25
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_event_perf.c15
-rw-r--r--kernel/trace/trace_events.c29
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_export.c4
-rw-r--r--kernel/trace/trace_hwlat.c15
-rw-r--r--kernel/trace/trace_kprobe.c27
-rw-r--r--kernel/trace/trace_output.c15
-rw-r--r--kernel/trace/trace_seq.c30
-rw-r--r--kernel/trace/trace_stat.c6
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c32
-rw-r--r--kernel/workqueue.c9
88 files changed, 3688 insertions, 2841 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index deff97217496..bf82259cff96 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -65,7 +65,7 @@ config PREEMPT_RT
preemptible priority-inheritance aware variants, enforcing
interrupt threading and introducing mechanisms to break up long
non-preemptible sections. This makes the kernel, except for very
- low level and critical code pathes (entry code, scheduler, low
+ low level and critical code paths (entry code, scheduler, low
level interrupt handling) fully preemptible and brings most
execution contexts under scheduler control.
diff --git a/kernel/audit.c b/kernel/audit.c
index da8dc0db5bd3..8e09f0f55b4b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -830,7 +830,7 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
- if (ac && rc < 0) {
+ if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
@@ -840,7 +840,7 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
- if (ac && rc < 0) {
+ if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
@@ -2155,18 +2155,19 @@ void audit_log_task_info(struct audit_buffer *ab)
EXPORT_SYMBOL(audit_log_task_info);
/**
- * audit_log_link_denied - report a link restriction denial
- * @operation: specific link operation
+ * audit_log_path_denied - report a path restriction denial
+ * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
+ * @operation: specific operation name
*/
-void audit_log_link_denied(const char *operation)
+void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
if (!audit_enabled || audit_dummy_context())
return;
- /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
- ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK);
+ /* Generate log with subject, operation, outcome. */
+ ab = audit_log_start(audit_context(), GFP_KERNEL, type);
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a3eaf08e7dd3..9f90d3c92bda 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -180,8 +180,8 @@ static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
struct bpf_prog_array *old_array)
{
- rcu_swap_protected(cgrp->bpf.effective[type], old_array,
- lockdep_is_held(&cgroup_mutex));
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 173e983619d7..caca752ee5e6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -339,7 +339,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* up_read_non_owner(). The rwsem_release() is called
* here to release the lock from lockdep's perspective.
*/
- rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
+ rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
}
}
diff --git a/kernel/compat.c b/kernel/compat.c
index a2bc1d6ceb57..95005f849c68 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
-int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
-{
- struct compat_itimerval v32;
-
- if (copy_from_user(&v32, i, sizeof(struct compat_itimerval)))
- return -EFAULT;
- o->it_interval.tv_sec = v32.it_interval.tv_sec;
- o->it_interval.tv_usec = v32.it_interval.tv_usec;
- o->it_value.tv_sec = v32.it_value.tv_sec;
- o->it_value.tv_usec = v32.it_value.tv_usec;
- return 0;
-}
-
-int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i)
-{
- struct compat_itimerval v32;
-
- v32.it_interval.tv_sec = i->it_interval.tv_sec;
- v32.it_interval.tv_usec = i->it_interval.tv_usec;
- v32.it_value.tv_sec = i->it_value.tv_sec;
- v32.it_value.tv_usec = i->it_value.tv_usec;
- return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
-}
-
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index be01a4d627c9..0296b4bda8f1 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -25,8 +25,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
-EXPORT_SYMBOL_GPL(context_tracking_enabled);
+DEFINE_STATIC_KEY_FALSE(context_tracking_key);
+EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
@@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
- static_branch_inc(&context_tracking_enabled);
+ static_branch_inc(&context_tracking_key);
}
if (initialized)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e2cad3ee2ead..a59cc980adad 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -336,7 +336,7 @@ static void lockdep_acquire_cpus_lock(void)
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_);
}
/*
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index f76d6f77dd5e..2b7c9b67931d 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -441,6 +441,37 @@ setundefined:
return 0;
}
+#ifdef CONFIG_KGDB_KDB
+void kdb_dump_stack_on_cpu(int cpu)
+{
+ if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) {
+ dump_stack();
+ return;
+ }
+
+ if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) {
+ kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n",
+ cpu);
+ return;
+ }
+
+ /*
+ * In general, architectures don't support dumping the stack of a
+ * "running" process that's not the current one. From the point of
+ * view of Linux, kernel processes that are looping in the kgdb
+ * slave loop are still "running". There's also no API (that actually
+ * works across all architectures) that can do a stack crawl based
+ * on registers passed as a parameter.
+ *
+ * Solve this conundrum by asking slave CPUs to do the backtrace
+ * themselves.
+ */
+ kgdb_info[cpu].exception_state |= DCPU_WANT_BT;
+ while (kgdb_info[cpu].exception_state & DCPU_WANT_BT)
+ cpu_relax();
+}
+#endif
+
/*
* Return true if there is a valid kgdb I/O module. Also if no
* debugger is attached a message can be printed to the console about
@@ -580,6 +611,9 @@ cpu_loop:
atomic_xchg(&kgdb_active, cpu);
break;
}
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) {
+ dump_stack();
+ kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT;
} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
if (!raw_spin_is_locked(&dbg_slave_lock))
goto return_normal;
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index b4a7c326d546..cd22b5f68831 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -33,7 +33,7 @@ struct kgdb_state {
#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
-#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */
struct debuggerinfo_struct {
void *debuggerinfo;
@@ -76,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
+extern void kdb_dump_stack_on_cpu(int cpu);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7e2379aa0a1e..4af48ac53625 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -22,20 +22,15 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
+
console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
- kdb_set_current_task(p);
- if (addr) {
- show_stack((struct task_struct *)p, addr);
- } else if (kdb_current_regs) {
-#ifdef CONFIG_X86
- show_stack(p, &kdb_current_regs->sp);
-#else
- show_stack(p, NULL);
-#endif
- } else {
- show_stack(p, NULL);
- }
+
+ if (!addr && kdb_task_has_cpu(p))
+ kdb_dump_stack_on_cpu(kdb_process_cpu(p));
+ else
+ show_stack(p, addr);
+
console_loglevel = old_lvl;
kdb_trap_printk--;
}
@@ -78,12 +73,12 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask,
- int argcount, int btaprompt)
+kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
{
- char buffer[2];
- if (kdb_getarea(buffer[0], (unsigned long)p) ||
- kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ char ch;
+
+ if (kdb_getarea(ch, (unsigned long)p) ||
+ kdb_getarea(ch, (unsigned long)(p+1)-1))
return KDB_BADADDR;
if (!kdb_task_state(p, mask))
return 0;
@@ -91,22 +86,47 @@ kdb_bt1(struct task_struct *p, unsigned long mask,
kdb_ps1(p);
kdb_show_stack(p, NULL);
if (btaprompt) {
- kdb_getstr(buffer, sizeof(buffer),
- "Enter <q> to end, <cr> to continue:");
- if (buffer[0] == 'q') {
- kdb_printf("\n");
+ kdb_printf("Enter <q> to end, <cr> or <space> to continue:");
+ do {
+ ch = kdb_getchar();
+ } while (!strchr("\r\n q", ch));
+ kdb_printf("\n");
+
+ /* reset the pager */
+ kdb_nextline = 1;
+
+ if (ch == 'q')
return 1;
- }
}
touch_nmi_watchdog();
return 0;
}
+static void
+kdb_bt_cpu(unsigned long cpu)
+{
+ struct task_struct *kdb_tsk;
+
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("WARNING: no process for cpu %ld\n", cpu);
+ return;
+ }
+
+ /* If a CPU failed to round up we could be here */
+ kdb_tsk = KDB_TSK(cpu);
+ if (!kdb_tsk) {
+ kdb_printf("WARNING: no task for cpu %ld\n", cpu);
+ return;
+ }
+
+ kdb_set_current_task(kdb_tsk);
+ kdb_bt1(kdb_tsk, ~0UL, false);
+}
+
int
kdb_bt(int argc, const char **argv)
{
int diag;
- int argcount = 5;
int btaprompt = 1;
int nextarg;
unsigned long addr;
@@ -125,7 +145,7 @@ kdb_bt(int argc, const char **argv)
/* Run the active tasks first */
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
}
/* Now the inactive tasks */
@@ -134,7 +154,7 @@ kdb_bt(int argc, const char **argv)
return 0;
if (task_curr(p))
continue;
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
} kdb_while_each_thread(g, p);
} else if (strcmp(argv[0], "btp") == 0) {
@@ -148,7 +168,7 @@ kdb_bt(int argc, const char **argv)
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p) {
kdb_set_current_task(p);
- return kdb_bt1(p, ~0UL, argcount, 0);
+ return kdb_bt1(p, ~0UL, false);
}
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
@@ -159,11 +179,10 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
kdb_set_current_task((struct task_struct *)addr);
- return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
struct task_struct *save_current_task = kdb_current_task;
- char buf[80];
if (argc > 1)
return KDB_ARGCOUNT;
if (argc == 1) {
@@ -171,35 +190,22 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
}
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
- argv = NULL;
if (cpu != ~0) {
- if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
- kdb_printf("no process for cpu %ld\n", cpu);
- return 0;
- }
- sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
- kdb_parse(buf);
- return 0;
- }
- kdb_printf("btc: cpu status: ");
- kdb_parse("cpu\n");
- for_each_online_cpu(cpu) {
- void *kdb_tsk = KDB_TSK(cpu);
-
- /* If a CPU failed to round up we could be here */
- if (!kdb_tsk) {
- kdb_printf("WARNING: no task for cpu %ld\n",
- cpu);
- continue;
+ kdb_bt_cpu(cpu);
+ } else {
+ /*
+ * Recursive use of kdb_parse, do not use argv after
+ * this point.
+ */
+ argv = NULL;
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ kdb_bt_cpu(cpu);
+ touch_nmi_watchdog();
}
-
- sprintf(buf, "btt 0x%px\n", kdb_tsk);
- kdb_parse(buf);
- touch_nmi_watchdog();
+ kdb_set_current_task(save_current_task);
}
- kdb_set_current_task(save_current_task);
return 0;
} else {
if (argc) {
@@ -211,7 +217,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ return kdb_bt1(kdb_current_task, ~0UL, false);
}
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 3a5184eb6977..8bcdded5d61f 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -49,14 +49,88 @@ static int kgdb_transition_check(char *buffer)
return 0;
}
-static int kdb_read_get_key(char *buffer, size_t bufsize)
+/**
+ * kdb_handle_escape() - validity check on an accumulated escape sequence.
+ * @buf: Accumulated escape characters to be examined. Note that buf
+ * is not a string, it is an array of characters and need not be
+ * nil terminated.
+ * @sz: Number of accumulated escape characters.
+ *
+ * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete,
+ * otherwise it returns a mapped key value to pass to the upper layers.
+ */
+static int kdb_handle_escape(char *buf, size_t sz)
+{
+ char *lastkey = buf + sz - 1;
+
+ switch (sz) {
+ case 1:
+ if (*lastkey == '\e')
+ return 0;
+ break;
+
+ case 2: /* \e<something> */
+ if (*lastkey == '[')
+ return 0;
+ break;
+
+ case 3:
+ switch (*lastkey) {
+ case 'A': /* \e[A, up arrow */
+ return 16;
+ case 'B': /* \e[B, down arrow */
+ return 14;
+ case 'C': /* \e[C, right arrow */
+ return 6;
+ case 'D': /* \e[D, left arrow */
+ return 2;
+ case '1': /* \e[<1,3,4>], may be home, del, end */
+ case '3':
+ case '4':
+ return 0;
+ }
+ break;
+
+ case 4:
+ if (*lastkey == '~') {
+ switch (buf[2]) {
+ case '1': /* \e[1~, home */
+ return 1;
+ case '3': /* \e[3~, del */
+ return 4;
+ case '4': /* \e[4~, end */
+ return 5;
+ }
+ }
+ break;
+ }
+
+ return -1;
+}
+
+/**
+ * kdb_getchar() - Read a single character from a kdb console (or consoles).
+ *
+ * Other than polling the various consoles that are currently enabled,
+ * most of the work done in this function is dealing with escape sequences.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off) and by multiple input sources. Escape sequence processing
+ * has to be done as states in the polling loop.
+ *
+ * Return: The key pressed or a control code derived from an escape sequence.
+ */
+char kdb_getchar(void)
{
#define ESCAPE_UDELAY 1000
#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
- char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
- char *ped = escape_data;
+ char buf[4]; /* longest vt100 escape sequence is 4 bytes */
+ char *pbuf = buf;
int escape_delay = 0;
- get_char_func *f, *f_escape = NULL;
+ get_char_func *f, *f_prev = NULL;
int key;
for (f = &kdb_poll_funcs[0]; ; ++f) {
@@ -65,109 +139,37 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
touch_nmi_watchdog();
f = &kdb_poll_funcs[0];
}
- if (escape_delay == 2) {
- *ped = '\0';
- ped = escape_data;
- --escape_delay;
- }
- if (escape_delay == 1) {
- key = *ped++;
- if (!*ped)
- --escape_delay;
- break;
- }
+
key = (*f)();
if (key == -1) {
if (escape_delay) {
udelay(ESCAPE_UDELAY);
- --escape_delay;
+ if (--escape_delay == 0)
+ return '\e';
}
continue;
}
- if (bufsize <= 2) {
- if (key == '\r')
- key = '\n';
- *buffer++ = key;
- *buffer = '\0';
- return -1;
- }
- if (escape_delay == 0 && key == '\e') {
+
+ /*
+ * When the first character is received (or the input source
+ * changes) we set ourselves up to handle an escape
+ * sequence (just in case).
+ */
+ if (f_prev != f) {
+ f_prev = f;
+ pbuf = buf;
escape_delay = ESCAPE_DELAY;
- ped = escape_data;
- f_escape = f;
- }
- if (escape_delay) {
- *ped++ = key;
- if (f_escape != f) {
- escape_delay = 2;
- continue;
- }
- if (ped - escape_data == 1) {
- /* \e */
- continue;
- } else if (ped - escape_data == 2) {
- /* \e<something> */
- if (key != '[')
- escape_delay = 2;
- continue;
- } else if (ped - escape_data == 3) {
- /* \e[<something> */
- int mapkey = 0;
- switch (key) {
- case 'A': /* \e[A, up arrow */
- mapkey = 16;
- break;
- case 'B': /* \e[B, down arrow */
- mapkey = 14;
- break;
- case 'C': /* \e[C, right arrow */
- mapkey = 6;
- break;
- case 'D': /* \e[D, left arrow */
- mapkey = 2;
- break;
- case '1': /* dropthrough */
- case '3': /* dropthrough */
- /* \e[<1,3,4>], may be home, del, end */
- case '4':
- mapkey = -1;
- break;
- }
- if (mapkey != -1) {
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- }
- continue;
- } else if (ped - escape_data == 4) {
- /* \e[<1,3,4><something> */
- int mapkey = 0;
- if (key == '~') {
- switch (escape_data[2]) {
- case '1': /* \e[1~, home */
- mapkey = 1;
- break;
- case '3': /* \e[3~, del */
- mapkey = 4;
- break;
- case '4': /* \e[4~, end */
- mapkey = 5;
- break;
- }
- }
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- continue;
- }
}
- break; /* A key to process */
+
+ *pbuf++ = key;
+ key = kdb_handle_escape(buf, pbuf - buf);
+ if (key < 0) /* no escape sequence; return best character */
+ return buf[pbuf - buf == 2 ? 1 : 0];
+ if (key > 0)
+ return key;
}
- return key;
+
+ unreachable();
}
/*
@@ -188,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
* function. It is not reentrant - it relies on the fact
* that while kdb is running on only one "master debug" cpu.
* Remarks:
- *
- * The buffer size must be >= 2. A buffer size of 2 means that the caller only
- * wants a single key.
- *
- * An escape key could be the start of a vt100 control sequence such as \e[D
- * (left arrow) or it could be a character in its own right. The standard
- * method for detecting the difference is to wait for 2 seconds to see if there
- * are any other characters. kdb is complicated by the lack of a timer service
- * (interrupts are off), by multiple input sources and by the need to sometimes
- * return after just one key. Escape sequence processing has to be done as
- * states in the polling loop.
+ * The buffer size must be >= 2.
*/
static char *kdb_read(char *buffer, size_t bufsize)
@@ -233,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
*cp = '\0';
kdb_printf("%s", buffer);
poll_again:
- key = kdb_read_get_key(buffer, bufsize);
- if (key == -1)
- return buffer;
+ key = kdb_getchar();
if (key != 9)
tab = 0;
switch (key) {
@@ -746,7 +736,7 @@ kdb_printit:
/* check for having reached the LINES number of printed lines */
if (kdb_nextline >= linecount) {
- char buf1[16] = "";
+ char ch;
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -781,39 +771,38 @@ kdb_printit:
if (logging)
printk("%s", moreprompt);
- kdb_read(buf1, 2); /* '2' indicates to return
- * immediately after getting one key. */
+ ch = kdb_getchar();
kdb_nextline = 1; /* Really set output line 1 */
/* empty and reset the buffer: */
kdb_buffer[0] = '\0';
next_avail = kdb_buffer;
size_avail = sizeof(kdb_buffer);
- if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ if ((ch == 'q') || (ch == 'Q')) {
/* user hit q or Q */
KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
KDB_STATE_CLEAR(PAGER);
/* end of command output; back to normal mode */
kdb_grepping_flag = 0;
kdb_printf("\n");
- } else if (buf1[0] == ' ') {
+ } else if (ch == ' ') {
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '\n') {
+ } else if (ch == '\n' || ch == '\r') {
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ } else if (ch == '/' && !kdb_grepping_flag) {
kdb_printf("\r");
kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
kdbgetenv("SEARCHPROMPT") ?: "search> ");
*strchrnul(kdb_grep_string, '\n') = '\0';
kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] && buf1[0] != '\n') {
- /* user hit something other than enter */
+ } else if (ch) {
+ /* user hit something unexpected */
suspend_grep = 1; /* for this recursion */
- if (buf1[0] != '/')
+ if (ch != '/')
kdb_printf(
"\nOnly 'q', 'Q' or '/' are processed at "
"more prompt, input ignored\n");
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 2118d8258b7c..55d052061ef9 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig(struct task_struct *p, int sig);
extern void kdb_meminfo_proc_show(void);
+extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 73c5c2b8e824..4c103a24e380 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -51,9 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
config ARCH_HAS_DMA_PREP_COHERENT
bool
-config ARCH_HAS_DMA_COHERENT_TO_PFN
- bool
-
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
@@ -68,9 +65,18 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
+#
+# Should be selected if we can mmap non-coherent mappings to userspace.
+# The only thing that is really required is a way to set an uncached bit
+# in the pagetables
+#
+config DMA_NONCOHERENT_MMAP
+ bool
+
config DMA_REMAP
depends on MMU
select GENERIC_ALLOCATOR
+ select DMA_NONCOHERENT_MMAP
bool
config DMA_DIRECT_REMAP
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 545e3869b0e3..551b0eb7028a 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -123,8 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
return ret;
}
-static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
- ssize_t size, dma_addr_t *dma_handle)
+static void *__dma_alloc_from_coherent(struct device *dev,
+ struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
{
int order = get_order(size);
unsigned long flags;
@@ -143,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
/*
* Memory was found in the coherent area.
*/
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT);
ret = mem->virt_base + (pageno << PAGE_SHIFT);
spin_unlock_irqrestore(&mem->spinlock, flags);
memset(ret, 0, size);
@@ -175,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
if (!mem)
return 0;
- *ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+ *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
return 1;
}
-void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
{
if (!dma_coherent_default_memory)
return NULL;
- return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
- dma_handle);
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
}
static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 69cfb4345388..daa4e6eefdde 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area;
* Users, who want to set the size of global CMA area for their system
* should use cma= kernel parameter.
*/
-static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
-static phys_addr_t size_cmdline = -1;
-static phys_addr_t base_cmdline;
-static phys_addr_t limit_cmdline;
+static const phys_addr_t size_bytes __initconst =
+ (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M;
+static phys_addr_t size_cmdline __initdata = -1;
+static phys_addr_t base_cmdline __initdata;
+static phys_addr_t limit_cmdline __initdata;
static int __init early_cma(char *p)
{
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index a26170469543..2031ed1ad7fa 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -27,7 +27,7 @@
#include <asm/sections.h>
-#define HASH_SIZE 1024ULL
+#define HASH_SIZE 16384ULL
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
@@ -54,40 +54,40 @@ enum map_err_types {
* struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
* @list: node on pre-allocated free_entries list
* @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
- * @type: single, page, sg, coherent
- * @pfn: page frame of the start address
- * @offset: offset of mapping relative to pfn
* @size: length of the mapping
+ * @type: single, page, sg, coherent
* @direction: enum dma_data_direction
* @sg_call_ents: 'nents' from dma_map_sg
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
+ * @pfn: page frame of the start address
+ * @offset: offset of mapping relative to pfn
* @map_err_type: track whether dma_mapping_error() was checked
* @stacktrace: support backtraces when a violation is detected
*/
struct dma_debug_entry {
struct list_head list;
struct device *dev;
- int type;
- unsigned long pfn;
- size_t offset;
u64 dev_addr;
u64 size;
+ int type;
int direction;
int sg_call_ents;
int sg_mapped_ents;
+ unsigned long pfn;
+ size_t offset;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
#endif
-};
+} ____cacheline_aligned_in_smp;
typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *);
struct hash_bucket {
struct list_head list;
spinlock_t lock;
-} ____cacheline_aligned_in_smp;
+};
/* Hash list to save the allocated dma addresses */
static struct hash_bucket dma_entry_hash[HASH_SIZE];
@@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
* Give up exclusive access to the hash bucket
*/
static void put_hash_bucket(struct hash_bucket *bucket,
- unsigned long *flags)
+ unsigned long flags)
__releases(&bucket->lock)
{
- unsigned long __flags = *flags;
-
- spin_unlock_irqrestore(&bucket->lock, __flags);
+ spin_unlock_irqrestore(&bucket->lock, flags);
}
static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
@@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
/*
* Nothing found, go back a hash bucket
*/
- put_hash_bucket(*bucket, flags);
+ put_hash_bucket(*bucket, *flags);
range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
@@ -420,6 +418,7 @@ void debug_dma_dump_mappings(struct device *dev)
}
spin_unlock_irqrestore(&bucket->lock, flags);
+ cond_resched();
}
}
@@ -608,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry)
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
@@ -1001,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref)
if (!entry) {
/* must drop lock before calling dma_mapping_error */
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
if (dma_mapping_error(ref->dev, ref->dev_addr)) {
err_printk(ref->dev, NULL,
@@ -1083,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref)
hash_bucket_del(entry);
dma_entry_free(entry);
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
static void check_for_stack(struct device *dev,
@@ -1203,7 +1202,7 @@ static void check_sync(struct device *dev,
}
out:
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
static void check_sg_segment(struct device *dev, struct scatterlist *sg)
@@ -1318,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
}
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
}
EXPORT_SYMBOL(debug_dma_mapping_error);
@@ -1391,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev,
if (entry)
mapped_ents = entry->sg_mapped_ents;
- put_hash_bucket(bucket, &flags);
+ put_hash_bucket(bucket, flags);
return mapped_ents;
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 0b67c04e531b..6af7ae83c4ad 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -12,6 +12,7 @@
#include <linux/dma-contiguous.h>
#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
+#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/swiotlb.h>
@@ -26,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
if (!dev->dma_mask) {
dev_err_once(dev, "DMA map on device without dma_mask\n");
- } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+ } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) {
dev_err_once(dev,
- "overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
+ "overflow %pad+%zu of DMA mask %llx bus limit %llx\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
}
WARN_ON_ONCE(1);
}
@@ -42,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
return phys_to_dma(dev, phys);
}
+static inline struct page *dma_direct_to_page(struct device *dev,
+ dma_addr_t dma_addr)
+{
+ return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr)));
+}
+
u64 dma_direct_get_required_mask(struct device *dev)
{
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
@@ -50,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev)
}
static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
- u64 *phys_mask)
+ u64 *phys_limit)
{
- if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
- dma_mask = dev->bus_dma_mask;
+ u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
if (force_dma_unencrypted(dev))
- *phys_mask = __dma_to_phys(dev, dma_mask);
+ *phys_limit = __dma_to_phys(dev, dma_limit);
else
- *phys_mask = dma_to_phys(dev, dma_mask);
+ *phys_limit = dma_to_phys(dev, dma_limit);
/*
* Optimistically try the zone that the physical address mask falls
@@ -68,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
* Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
* zones.
*/
- if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits))
+ if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
return GFP_DMA;
- if (*phys_mask <= DMA_BIT_MASK(32))
+ if (*phys_limit <= DMA_BIT_MASK(32))
return GFP_DMA32;
return 0;
}
@@ -78,16 +84,16 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
return phys_to_dma_direct(dev, phys) + size - 1 <=
- min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+ gfp_t gfp, unsigned long attrs)
{
size_t alloc_size = PAGE_ALIGN(size);
int node = dev_to_node(dev);
struct page *page = NULL;
- u64 phys_mask;
+ u64 phys_limit;
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
@@ -95,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
+ &phys_limit);
page = dma_alloc_contiguous(dev, alloc_size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, alloc_size);
@@ -109,7 +115,7 @@ again:
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
- phys_mask < DMA_BIT_MASK(64) &&
+ phys_limit < DMA_BIT_MASK(64) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;
@@ -130,7 +136,16 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_alloc_need_uncached(dev, attrs) &&
+ !gfpflags_allow_blocking(gfp)) {
+ ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp);
+ if (!ret)
+ return NULL;
+ goto done;
+ }
+
+ page = __dma_direct_alloc_pages(dev, size, gfp, attrs);
if (!page)
return NULL;
@@ -139,9 +154,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
- return page;
+ ret = page;
+ goto done;
+ }
+
+ if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_alloc_need_uncached(dev, attrs)) ||
+ (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ /* remove any dirty cache lines on the kernel alias */
+ arch_dma_prep_coherent(page, PAGE_ALIGN(size));
+
+ /* create a coherent mapping */
+ ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size),
+ dma_pgprot(dev, PAGE_KERNEL, attrs),
+ __builtin_return_address(0));
+ if (!ret) {
+ dma_free_contiguous(dev, page, size);
+ return ret;
+ }
+
+ memset(ret, 0, size);
+ goto done;
}
if (PageHighMem(page)) {
@@ -152,17 +186,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
* so log an error and fail.
*/
dev_info(dev, "Rejecting highmem page from CMA.\n");
- __dma_direct_free_pages(dev, size, page);
+ dma_free_contiguous(dev, page, size);
return NULL;
}
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
+ if (force_dma_unencrypted(dev))
set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
- *dma_handle = __phys_to_dma(dev, page_to_phys(page));
- } else {
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
- }
+
memset(ret, 0, size);
if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
@@ -170,15 +201,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
arch_dma_prep_coherent(page, size);
ret = uncached_kernel_address(ret);
}
-
+done:
+ if (force_dma_unencrypted(dev))
+ *dma_handle = __phys_to_dma(dev, page_to_phys(page));
+ else
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
return ret;
}
-void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
-{
- dma_free_contiguous(dev, page, size);
-}
-
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
{
@@ -187,23 +217,28 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
- __dma_direct_free_pages(dev, size, cpu_addr);
+ dma_free_contiguous(dev, cpu_addr, size);
return;
}
+ if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ dma_free_from_pool(cpu_addr, PAGE_ALIGN(size)))
+ return;
+
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
- if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
- dma_alloc_need_uncached(dev, attrs))
- cpu_addr = cached_kernel_address(cpu_addr);
- __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ vunmap(cpu_addr);
+
+ dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
dma_alloc_need_uncached(dev, attrs))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
@@ -213,6 +248,7 @@ void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
dma_alloc_need_uncached(dev, attrs))
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
else
@@ -230,7 +266,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, paddr, size, dir);
+ arch_sync_dma_for_device(paddr, size, dir);
}
EXPORT_SYMBOL(dma_direct_sync_single_for_device);
@@ -248,7 +284,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, paddr, sg->length,
+ arch_sync_dma_for_device(paddr, sg->length,
dir);
}
}
@@ -264,8 +300,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(dev, paddr, size, dir);
- arch_sync_dma_for_cpu_all(dev);
+ arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_all();
}
if (unlikely(is_swiotlb_buffer(paddr)))
@@ -283,7 +319,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(dev, paddr, sg->length, dir);
+ arch_sync_dma_for_cpu(paddr, sg->length, dir);
if (unlikely(is_swiotlb_buffer(paddr)))
swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
@@ -291,7 +327,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
}
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu_all(dev);
+ arch_sync_dma_for_cpu_all();
}
EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
@@ -325,7 +361,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
size_t size)
{
return swiotlb_force != SWIOTLB_FORCE &&
- dma_capable(dev, dma_addr, size);
+ dma_capable(dev, dma_addr, size, true);
}
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
@@ -342,7 +378,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
}
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(dev, phys, size, dir);
+ arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
}
EXPORT_SYMBOL(dma_direct_map_page);
@@ -374,7 +410,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
{
dma_addr_t dma_addr = paddr;
- if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
report_addr(dev, dma_addr, size);
return DMA_MAPPING_ERROR;
}
@@ -383,6 +419,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
}
EXPORT_SYMBOL(dma_direct_map_resource);
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = dma_direct_to_page(dev, dma_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+#ifdef CONFIG_MMU
+bool dma_direct_can_mmap(struct device *dev)
+{
+ return dev_is_dma_coherent(dev) ||
+ IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP);
+}
+
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr));
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+}
+#else /* CONFIG_MMU */
+bool dma_direct_can_mmap(struct device *dev)
+{
+ return false;
+}
+
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ return -ENXIO;
+}
+#endif /* CONFIG_MMU */
+
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
* able to satisfy them - either by not supporting more physical memory, or by
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index d9334f31a5af..12ff766ec1fa 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -112,24 +112,9 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page;
+ struct page *page = virt_to_page(cpu_addr);
int ret;
- if (!dev_is_dma_coherent(dev)) {
- unsigned long pfn;
-
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
- return -ENXIO;
-
- /* If the PFN is not valid, we do not have a struct page */
- pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
- if (!pfn_valid(pfn))
- return -ENXIO;
- page = pfn_to_page(pfn);
- } else {
- page = virt_to_page(cpu_addr);
- }
-
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
if (!ret)
sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
@@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
const struct dma_map_ops *ops = get_dma_ops(dev);
if (dma_is_direct(ops))
- return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
size, attrs);
if (!ops->get_sgtable)
return -ENXIO;
@@ -192,7 +177,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
- unsigned long pfn;
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -203,19 +187,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
if (off >= count || user_count > count - off)
return -ENXIO;
- if (!dev_is_dma_coherent(dev)) {
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
- return -ENXIO;
-
- /* If the PFN is not valid, we do not have a struct page */
- pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
- if (!pfn_valid(pfn))
- return -ENXIO;
- } else {
- pfn = page_to_pfn(virt_to_page(cpu_addr));
- }
-
- return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
@@ -233,12 +206,8 @@ bool dma_can_mmap(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (dma_is_direct(ops)) {
- return IS_ENABLED(CONFIG_MMU) &&
- (dev_is_dma_coherent(dev) ||
- IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN));
- }
-
+ if (dma_is_direct(ops))
+ return dma_direct_can_mmap(dev);
return ops->mmap != NULL;
}
EXPORT_SYMBOL_GPL(dma_can_mmap);
@@ -263,7 +232,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
const struct dma_map_ops *ops = get_dma_ops(dev);
if (dma_is_direct(ops))
- return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size,
+ return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
attrs);
if (!ops->mmap)
return -ENXIO;
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index c00b9258fa6a..d47bd40fc0f5 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -210,59 +210,4 @@ bool dma_free_from_pool(void *start, size_t size)
gen_pool_free(atomic_pool, (unsigned long)start, size);
return true;
}
-
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t flags, unsigned long attrs)
-{
- struct page *page = NULL;
- void *ret;
-
- size = PAGE_ALIGN(size);
-
- if (!gfpflags_allow_blocking(flags)) {
- ret = dma_alloc_from_pool(size, &page, flags);
- if (!ret)
- return NULL;
- goto done;
- }
-
- page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
- if (!page)
- return NULL;
-
- /* remove any dirty cache lines on the kernel alias */
- arch_dma_prep_coherent(page, size);
-
- /* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
- __builtin_return_address(0));
- if (!ret) {
- __dma_direct_free_pages(dev, size, page);
- return ret;
- }
-
- memset(ret, 0, size);
-done:
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
- return ret;
-}
-
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_handle, unsigned long attrs)
-{
- if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
- phys_addr_t phys = dma_to_phys(dev, dma_handle);
- struct page *page = pfn_to_page(__phys_to_pfn(phys));
-
- vunmap(vaddr);
- __dma_direct_free_pages(dev, size, page);
- }
-}
-
-long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
- dma_addr_t dma_addr)
-{
- return __phys_to_pfn(dma_to_phys(dev, dma_addr));
-}
#endif /* CONFIG_DMA_DIRECT_REMAP */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 673a2cdb2656..9280d6f8271e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
/* Ensure that the address returned is DMA'ble */
*dma_addr = __phys_to_dma(dev, *phys);
- if (unlikely(!dma_capable(dev, *dma_addr, size))) {
+ if (unlikely(!dma_capable(dev, *dma_addr, size, true))) {
swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
return false;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5de0b801bc7b..4ff86d57f9e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event)
}
}
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
static int perf_get_aux_event(struct perf_event *event,
struct perf_event *group_leader)
{
@@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event,
if (!group_leader)
return 0;
- if (!perf_aux_output_match(event, group_leader))
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -2666,6 +2681,25 @@ perf_install_in_context(struct perf_event_context *ctx,
*/
smp_store_release(&event->ctx, ctx);
+ /*
+ * perf_event_attr::disabled events will not run and can be initialized
+ * without IPI. Except when this is the first event for the context, in
+ * that case we need the magic of the IPI to set ctx->is_active.
+ *
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
+ * event will issue the IPI and reprogram the hardware.
+ */
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
return;
@@ -3204,10 +3238,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ struct pmu *pmu = ctx->pmu;
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (pmu->swap_task_ctx)
+ pmu->swap_task_ctx(ctx, next_ctx);
+ else
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -4229,8 +4274,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ err = perf_allow_cpu(&event->attr);
+ if (err)
+ return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4539,6 +4585,8 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4992,6 +5040,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -5288,6 +5340,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5639,10 +5696,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- if (!rb->aux_mmap_locked)
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- else
- atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
@@ -5753,6 +5808,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5859,13 +5918,7 @@ accounting:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
- if (user_locked <= user_lock_limit) {
- /* charge all to locked_vm */
- } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) {
- /* charge all to pinned_vm */
- extra = user_extra;
- user_extra = 0;
- } else {
+ if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
@@ -5878,7 +5931,7 @@ accounting:
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -6208,6 +6261,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct ring_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ unsigned long pad;
+ struct ring_buffer *rb;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6527,6 +6696,13 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6715,6 +6891,35 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -10066,7 +10271,7 @@ static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- int cpu, ret;
+ int cpu, ret, max = PERF_TYPE_MAX;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
@@ -10079,12 +10284,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto skip_type;
pmu->name = name;
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
+ if (type != PERF_TYPE_SOFTWARE) {
+ if (type >= 0)
+ max = type;
+
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
goto free_pdc;
- }
+
+ WARN_ON(type >= 0 && ret != type);
+
+ type = ret;
}
pmu->type = type;
@@ -10161,7 +10371,16 @@ got_cpu_context:
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- list_add_rcu(&pmu->entry, &pmus);
+ /*
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
+ * since these cannot be in the IDR. This way the linear search
+ * is fast, provided a valid software event is provided.
+ */
+ if (type == PERF_TYPE_SOFTWARE || !name)
+ list_add_rcu(&pmu->entry, &pmus);
+ else
+ list_add_tail_rcu(&pmu->entry, &pmus);
+
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -10174,7 +10393,7 @@ free_dev:
put_device(pmu->dev);
free_idr:
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
@@ -10196,7 +10415,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
if (pmu->nr_addr_filters)
@@ -10266,9 +10485,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
+ int idx, type, ret;
struct pmu *pmu;
- int idx;
- int ret;
idx = srcu_read_lock(&pmus_srcu);
@@ -10280,13 +10498,28 @@ static struct pmu *perf_init_event(struct perf_event *event)
goto unlock;
}
+ /*
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * are often aliases for PERF_TYPE_RAW.
+ */
+ type = event->attr.type;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
+ type = PERF_TYPE_RAW;
+
+again:
rcu_read_lock();
- pmu = idr_find(&pmu_idr, event->attr.type);
+ pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type) {
+ type = event->attr.type;
+ goto again;
+ }
+
if (ret)
pmu = ERR_PTR(ret);
+
goto unlock;
}
@@ -10618,11 +10851,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ err = security_perf_event_alloc(event);
+ if (err)
+ goto err_callchain_buffer;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10673,7 +10915,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->size = size;
- if (attr->__reserved_1 || attr->__reserved_2)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10711,9 +10953,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel(attr);
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10926,13 +11170,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
}
if (attr.namespaces) {
@@ -10949,9 +11199,11 @@ SYSCALL_DEFINE5(perf_event_open,
}
/* Only privileged users can get physical addresses */
- if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
err = security_locked_down(LOCKDOWN_PERF);
if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
@@ -11213,7 +11465,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
goto err_locked;
/*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 3aef4191798c..747d67f130cb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -50,6 +50,7 @@ struct ring_buffer {
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
+ int aux_in_sampling;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ffb59a4ef4ff..7ffd5c763f93 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
}
EXPORT_SYMBOL_GPL(perf_get_aux);
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to)
+{
+ unsigned long tocopy, remainder, len = 0;
+ struct ring_buffer *rb = aux_handle->rb;
+ void *addr;
+
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+ do {
+ tocopy = PAGE_SIZE - offset_in_page(from);
+ if (to > from)
+ tocopy = min(tocopy, to - from);
+ if (!tocopy)
+ break;
+
+ addr = rb->aux_pages[from >> PAGE_SHIFT];
+ addr += offset_in_page(from);
+
+ remainder = perf_output_copy(handle, addr, tocopy);
+ if (remainder)
+ return -EFAULT;
+
+ len += tocopy;
+ from += tocopy;
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ } while (to != from);
+
+ return len;
+}
+
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
@@ -754,6 +790,14 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
+static void perf_mmap_free_page(void *addr)
+{
+ struct page *page = virt_to_page(addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
struct ring_buffer *rb;
@@ -788,9 +832,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
fail_data_pages:
for (i--; i >= 0; i--)
- free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
- free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
@@ -799,21 +843,13 @@ fail:
return NULL;
}
-static void perf_mmap_free_page(unsigned long addr)
-{
- struct page *page = virt_to_page((void *)addr);
-
- page->mapping = NULL;
- __free_page(page);
-}
-
void rb_free(struct ring_buffer *rb)
{
int i;
- perf_mmap_free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
- perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c74761004ee5..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
+ if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index f2d20ab74422..bcbd59888e67 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -437,7 +437,7 @@ static void exit_mm(void)
struct mm_struct *mm = current->mm;
struct core_state *core_state;
- mm_release(current, mm);
+ exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
@@ -746,32 +746,12 @@ void __noreturn do_exit(long code)
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
- /*
- * We can do this unlocked here. The futex code uses
- * this flag just to verify whether the pi state
- * cleanup has been done or not. In the worst case it
- * loops once more. We pretend that the cleanup was
- * done as there is no way to return. Either the
- * OWNER_DIED bit is set by now or we push the blocked
- * task into the wait for ever nirwana as well.
- */
- tsk->flags |= PF_EXITPIDONE;
+ futex_exit_recursive(tsk);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
exit_signals(tsk); /* sets PF_EXITING */
- /*
- * Ensure that all new tsk->pi_lock acquisitions must observe
- * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
- */
- smp_mb();
- /*
- * Ensure that we must observe the pi_state in exit_mm() ->
- * mm_release() -> exit_pi_state_list().
- */
- raw_spin_lock_irq(&tsk->pi_lock);
- raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -846,12 +826,6 @@ void __noreturn do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
- /*
- * We can do this unlocked here. The futex code uses this flag
- * just to verify whether the pi state cleanup has been done
- * or not. In the worst case it loops once more.
- */
- tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context(tsk);
@@ -1435,7 +1409,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
- TASK_INTERRUPTIBLE, 1, p);
+ TASK_INTERRUPTIBLE, p);
}
static long do_wait(struct wait_opts *wo)
diff --git a/kernel/fork.c b/kernel/fork.c
index 35f91ee91057..21c6c1e29b98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,7 +40,6 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
-#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -94,6 +93,7 @@
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
+#include <linux/kasan.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -224,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
+ /* Clear the KASAN shadow of the stack. */
+ kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
@@ -1283,24 +1286,8 @@ static int wait_for_vfork_done(struct task_struct *child,
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- /* Get rid of any futexes when releasing the mm */
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-#endif
-
uprobe_free_utask(tsk);
/* Get rid of any cached register state */
@@ -1333,6 +1320,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
+void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exit_release(tsk);
+ mm_release(tsk, mm);
+}
+
+void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exec_release(tsk);
+ mm_release(tsk, mm);
+}
+
/**
* dup_mm() - duplicates an existing mm structure
* @tsk: the task_struct with which the new mm will be associated.
@@ -2124,14 +2123,8 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
-#ifdef CONFIG_FUTEX
- p->robust_list = NULL;
-#ifdef CONFIG_COMPAT
- p->compat_robust_list = NULL;
-#endif
- INIT_LIST_HEAD(&p->pi_state_list);
- p->pi_state_cache = NULL;
-#endif
+ futex_init_task(p);
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60e4c6c..03c518e9747e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -325,6 +325,12 @@ static inline bool should_fail_futex(bool fshared)
}
#endif /* CONFIG_FAIL_FUTEX */
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
static inline void futex_get_mm(union futex_key *key)
{
mmgrab(key->private.mm);
@@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
@@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr)
}
raw_spin_unlock_irq(&curr->pi_lock);
}
-
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
/*
@@ -1169,16 +1176,47 @@ out_error:
return ret;
}
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->futex_exec_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
static int handle_exit_race(u32 __user *uaddr, u32 uval,
struct task_struct *tsk)
{
u32 uval2;
/*
- * If PF_EXITPIDONE is not yet set, then try again.
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
*/
- if (tsk && !(tsk->flags & PF_EXITPIDONE))
- return -EAGAIN;
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
/*
* Reread the user space value to handle the following situation:
@@ -1196,8 +1234,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* *uaddr = 0xC0000000; tsk = get_task(PID);
* } if (!tsk->flags & PF_EXITING) {
* ... attach();
- * tsk->flags |= PF_EXITPIDONE; } else {
- * if (!(tsk->flags & PF_EXITPIDONE))
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
* return -EAGAIN;
* return -ESRCH; <--- FAIL
* }
@@ -1228,7 +1267,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
* it after doing proper sanity checks.
*/
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
@@ -1253,22 +1293,33 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
}
/*
- * We need to look at the task state flags to figure out,
- * whether the task is exiting. To protect against the do_exit
- * change of the task flags, we do this protected by
- * p->pi_lock:
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->flags & PF_EXITING)) {
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
/*
- * The task is on the way out. When PF_EXITPIDONE is
- * set, we know that the task has finished the
- * cleanup:
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
*/
int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
return ret;
}
@@ -1307,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
static int lookup_pi_state(u32 __user *uaddr, u32 uval,
struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1322,7 +1374,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uaddr, uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1350,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* lookup
* @task: the task to perform the atomic lock work for. This will
* be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Return:
@@ -1358,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* - <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
*/
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
struct futex_q *top_waiter;
@@ -1432,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uaddr, newval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
/**
@@ -1480,7 +1540,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
/*
* Queue the task for later wakeup for after we've released
- * the hb->lock. wake_q_add() grabs reference to p.
+ * the hb->lock.
*/
wake_q_add_safe(wake_q, p);
}
@@ -1850,6 +1910,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* @key1: the from futex key
* @key2: the to futex key
* @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1857,16 +1919,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
* Return:
* - 0 - failed to acquire the lock atomically;
* - >0 - acquired the lock, return value is vpid of the top_waiter
* - <0 - error
*/
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
- struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2,
- union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
{
struct futex_q *top_waiter = NULL;
u32 curval;
@@ -1903,7 +1969,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
*/
vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- set_waiters);
+ exiting, set_waiters);
if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
@@ -2032,6 +2098,8 @@ retry_private:
}
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ struct task_struct *exiting = NULL;
+
/*
* Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -2039,7 +2107,8 @@ retry_private:
* faults rather in the requeue loop below.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state, nr_requeue);
+ &key2, &pi_state,
+ &exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or is
@@ -2066,7 +2135,8 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+ &pi_state, &exiting);
}
switch (ret) {
@@ -2084,17 +2154,24 @@ retry_private:
if (!ret)
goto retry;
goto out;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Owner is exiting and we just wait for the
+ * - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -2801,6 +2878,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
+ struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
@@ -2822,7 +2900,8 @@ retry:
retry_private:
hb = queue_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
if (unlikely(ret)) {
/*
* Atomic work succeeded and we got the lock,
@@ -2835,15 +2914,22 @@ retry_private:
goto out_unlock_put_key;
case -EFAULT:
goto uaddr_faulted;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Task is exiting and we just wait for the
+ * - EBUSY: Task is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -3452,11 +3538,16 @@ err_unlock:
return ret;
}
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
/*
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
{
u32 uval, uninitialized_var(nval), mval;
int err;
@@ -3469,6 +3560,42 @@ retry:
if (get_user(uval, uaddr))
return -1;
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) User space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. The user space futex value is
+ * uncontended and the rest of the user space mutex state is
+ * consistent, so a woken waiter will just take over the
+ * uncontended futex. Setting the OWNER_DIED bit would create
+ * inconsistent state and malfunction of the user space owner died
+ * handling.
+ */
+ if (pending_op && !pi && !uval) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
return 0;
@@ -3547,7 +3674,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
*
* We silently return on any sign of list-walking problem.
*/
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3588,10 +3715,11 @@ void exit_robust_list(struct task_struct *curr)
* A pending lock might already be on the list, so
* don't process it twice:
*/
- if (entry != pending)
+ if (entry != pending) {
if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi))
+ curr, pi, HANDLE_DEATH_LIST))
return;
+ }
if (rc)
return;
entry = next_entry;
@@ -3605,9 +3733,118 @@ void exit_robust_list(struct task_struct *curr)
cond_resched();
}
- if (pending)
+ if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
- curr, pip);
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * to exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3737,7 +3974,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
*
* We silently return on any sign of list-walking problem.
*/
-void compat_exit_robust_list(struct task_struct *curr)
+static void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3784,7 +4021,8 @@ void compat_exit_robust_list(struct task_struct *curr)
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
- if (handle_futex_death(uaddr, curr, pi))
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
return;
}
if (rc)
@@ -3803,7 +4041,7 @@ void compat_exit_robust_list(struct task_struct *curr)
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
- handle_futex_death(uaddr, curr, pip);
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 79f252af7dee..a2df93948665 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1304,7 +1304,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
if (kernel_map) {
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
- phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_vaddr = (unsigned long) _text;
phdr->p_filesz = phdr->p_memsz = _end - _text;
phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
ehdr->e_phnum++;
@@ -1321,7 +1321,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_offset = mstart;
phdr->p_paddr = mstart;
- phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_vaddr = (unsigned long) __va(mstart);
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index bd43537702bd..b552cf2d85f8 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func)
ops->fops.func = klp_ftrace_handler;
ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
FTRACE_OPS_FL_DYNAMIC |
- FTRACE_OPS_FL_IPMODIFY;
+ FTRACE_OPS_FL_IPMODIFY |
+ FTRACE_OPS_FL_PERMANENT;
list_add(&ops->node, &klp_ops);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 233459c03b5a..32282e7112d3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4208,11 +4208,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
}
/*
- * Remove the lock to the list of currently held locks - this gets
+ * Remove the lock from the list of currently held locks - this gets
* called on mutex_unlock()/spin_unlock*() (or on a failed
* mutex_lock_interruptible()).
- *
- * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
__lock_release(struct lockdep_map *lock, unsigned long ip)
@@ -4491,8 +4489,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
EXPORT_SYMBOL_GPL(lock_acquire);
-void lock_release(struct lockdep_map *lock, int nested,
- unsigned long ip)
+void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index c513031cd7e3..99475a66c94f 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
@@ -889,16 +888,16 @@ static int __init lock_torture_init(void)
cxt.nrealwriters_stress = 2 * num_online_cpus();
#ifdef CONFIG_DEBUG_MUTEXES
- if (strncmp(torture_type, "mutex", 5) == 0)
+ if (str_has_prefix(torture_type, "mutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
- if (strncmp(torture_type, "rtmutex", 7) == 0)
+ if (str_has_prefix(torture_type, "rtmutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
- if ((strncmp(torture_type, "spin", 4) == 0) ||
- (strncmp(torture_type, "rw_lock", 7) == 0))
+ if ((str_has_prefix(torture_type, "spin")) ||
+ (str_has_prefix(torture_type, "rw_lock")))
cxt.debug_lock = true;
#endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 468a9b8422e3..54cc5f9286e9 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -733,6 +733,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
*/
void __sched mutex_unlock(struct mutex *lock)
{
+#ifdef CONFIG_DEBUG_MUTEXES
+ WARN_ON(in_interrupt());
+#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
if (__mutex_unlock_fast(lock))
return;
@@ -1091,7 +1094,7 @@ err:
err_early_kill:
spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
preempt_enable();
return ret;
}
@@ -1225,7 +1228,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
DEFINE_WAKE_Q(wake_q);
unsigned long owner;
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
/*
* Release the lock before (potentially) taking the spinlock such that
@@ -1413,6 +1416,7 @@ int __sched mutex_trylock(struct mutex *lock)
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ WARN_ON(in_interrupt());
#endif
locked = __mutex_trylock(lock);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2874bf556162..851bbb10819d 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1517,7 +1517,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
return ret;
}
@@ -1561,7 +1561,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
RT_MUTEX_MIN_CHAINWALK,
rt_mutex_slowlock);
if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
return ret;
}
@@ -1600,7 +1600,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
*/
void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
- mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index eef04551eae7..44e68761f432 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1504,7 +1504,7 @@ int __sched down_read_killable(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
@@ -1546,7 +1546,7 @@ int __sched down_write_killable(struct rw_semaphore *sem)
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
__down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
@@ -1573,7 +1573,7 @@ EXPORT_SYMBOL(down_write_trylock);
*/
void up_read(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_read(sem);
}
EXPORT_SYMBOL(up_read);
@@ -1583,7 +1583,7 @@ EXPORT_SYMBOL(up_read);
*/
void up_write(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_write(sem);
}
EXPORT_SYMBOL(up_write);
@@ -1639,7 +1639,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
__down_write_killable)) {
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_release(&sem->dep_map, _RET_IP_);
return -EINTR;
}
diff --git a/kernel/module.c b/kernel/module.c
index acf7962936c4..052a40212b8e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_ro(mod, false);
module_enable_nx(mod);
- module_enable_x(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
@@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod)
if (err)
return err;
+ /* Make module executable after ftrace is enabled */
+ mutex_lock(&module_mutex);
+ module_enable_x(mod);
+ mutex_unlock(&module_mutex);
+
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index f470a038b05b..b69ee9e76cb2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -671,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
-#ifdef CONFIG_ARCH_HAS_REFCOUNT
-void refcount_error_report(struct pt_regs *regs, const char *err)
-{
- WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
- err, (void *)instruction_pointer(regs),
- current->comm, task_pid_nr(current),
- from_kuid_munged(&init_user_ns, current_uid()),
- from_kuid_munged(&init_user_ns, current_euid()));
-}
-#endif
-
core_param(panic, panic_timeout, int, 0644);
core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 44bee462ff57..7cdc64dc2373 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t);
extern int swsusp_unmark(void);
#endif
-struct timeval;
+struct __kernel_old_timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 83105874f255..26b9168321e7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -734,8 +734,15 @@ zone_found:
* We have found the zone. Now walk the radix tree to find the leaf node
* for our PFN.
*/
+
+ /*
+ * If the zone we wish to scan is the current zone and the
+ * pfn falls into the current node then we do not need to walk
+ * the tree.
+ */
node = bm->cur.node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ if (zone == bm->cur.zone &&
+ ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327a6de8..c8be5a0f5259 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -248,7 +248,7 @@ static void __up_console_sem(unsigned long ip)
{
unsigned long flags;
- mutex_release(&console_lock_dep_map, 1, ip);
+ mutex_release(&console_lock_dep_map, ip);
printk_safe_enter_irqsave(flags);
up(&console_sem);
@@ -1679,20 +1679,20 @@ static int console_lock_spinning_disable_and_check(void)
raw_spin_unlock(&console_owner_lock);
if (!waiter) {
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
return 0;
}
/* The waiter is now free to continue */
WRITE_ONCE(console_waiter, false);
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
/*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
- mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
+ mutex_release(&console_lock_dep_map, _THIS_IP_);
return 1;
}
@@ -1746,7 +1746,7 @@ static int console_trylock_spinning(void)
/* Owner will clear console_waiter on hand off */
while (READ_ONCE(console_waiter))
cpu_relax();
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
+ spin_release(&console_owner_dep_map, _THIS_IP_);
printk_safe_exit_irqrestore(flags);
/*
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 8fd4f82c9b3d..ab504fbc76ca 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -299,6 +299,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
+ for (i = 0; i < RCU_NUM_LVLS; i++)
+ levelspread[i] = INT_MIN;
if (rcu_fanout_exact) {
levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
@@ -455,7 +457,6 @@ enum rcutorture_type {
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
-void rcutorture_record_progress(unsigned long vernum);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
@@ -468,7 +469,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*flags = 0;
*gp_seq = 0;
}
-static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 495c58ce1640..cbc87b804db9 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -88,7 +88,7 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
}
/* Set the length of an rcu_segcblist structure. */
-void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
+static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
atomic_long_set(&rsclp->len, v);
@@ -104,7 +104,7 @@ void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
* This increase is fully ordered with respect to the callers accesses
* both before and after.
*/
-void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
smp_mb__before_atomic(); /* Up to the caller! */
@@ -134,7 +134,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
* with the actual number of callbacks on the structure. This exchange is
* fully ordered with respect to the callers accesses both before and after.
*/
-long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
+static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
return atomic_long_xchg(&rsclp->len, v);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 5a879d073c1c..5f884d560384 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -109,15 +109,6 @@ static unsigned long b_rcu_perf_writer_started;
static unsigned long b_rcu_perf_writer_finished;
static DEFINE_PER_CPU(atomic_t, n_async_inflight);
-static int rcu_perf_writer_state;
-#define RTWS_INIT 0
-#define RTWS_ASYNC 1
-#define RTWS_BARRIER 2
-#define RTWS_EXP_SYNC 3
-#define RTWS_SYNC 4
-#define RTWS_IDLE 5
-#define RTWS_STOPPING 6
-
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -404,25 +395,20 @@ retry:
if (!rhp)
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
- rcu_perf_writer_state = RTWS_ASYNC;
atomic_inc(this_cpu_ptr(&n_async_inflight));
cur_ops->async(rhp, rcu_perf_async_cb);
rhp = NULL;
} else if (!kthread_should_stop()) {
- rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
goto retry;
} else {
kfree(rhp); /* Because we are stopping. */
}
} else if (gp_exp) {
- rcu_perf_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
} else {
- rcu_perf_writer_state = RTWS_SYNC;
cur_ops->sync();
}
- rcu_perf_writer_state = RTWS_IDLE;
t = ktime_get_mono_fast_ns();
*wdp = t - *wdp;
i_max = i;
@@ -463,10 +449,8 @@ retry:
rcu_perf_wait_shutdown();
} while (!torture_must_stop());
if (gp_async) {
- rcu_perf_writer_state = RTWS_BARRIER;
cur_ops->gp_barrier();
}
- rcu_perf_writer_state = RTWS_STOPPING;
writer_n_durations[me] = i_max;
torture_kthread_stopping("rcu_perf_writer");
return 0;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 3c9feca1eab1..dee043feb71f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -44,6 +44,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/oom.h>
+#include <linux/tick.h>
#include "rcu.h"
@@ -1363,15 +1364,15 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
-
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand))
+ if (!rcu_torture_one_read(&rand) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
- if (time_after(jiffies, lastsleep)) {
+ if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
schedule_timeout_interruptible(1);
lastsleep = jiffies + 10;
}
@@ -1383,6 +1384,7 @@ rcu_torture_reader(void *arg)
del_timer_sync(&t);
destroy_timer_on_stack(&t);
}
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@@ -1442,15 +1444,18 @@ rcu_torture_stats_print(void)
n_rcu_torture_barrier_error);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- if (atomic_read(&n_rcu_torture_mberror) != 0 ||
- n_rcu_torture_barrier_error != 0 ||
- n_rcu_torture_boost_ktrerror != 0 ||
- n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0 ||
+ if (atomic_read(&n_rcu_torture_mberror) ||
+ n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
+ n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
+ WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
+ WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
+ WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed
+ WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1729,10 +1734,10 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
// Real call_rcu() floods hit userspace, so emulate that.
if (need_resched() || (iter & 0xfff))
schedule();
- } else {
- // No userspace emulation: CB invocation throttles call_rcu()
- cond_resched();
+ return;
}
+ // No userspace emulation: CB invocation throttles call_rcu()
+ cond_resched();
}
/*
@@ -1759,6 +1764,11 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
kfree(rfcp);
freed++;
rcu_torture_fwd_prog_cond_resched(freed);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
}
return freed;
}
@@ -1803,7 +1813,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- rcu_torture_fwd_prog_cond_resched(1);
+ cond_resched();
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
@@ -1833,6 +1843,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
static void rcu_torture_fwd_prog_cr(void)
{
unsigned long cver;
+ unsigned long flags;
unsigned long gps;
int i;
long n_launders;
@@ -1865,6 +1876,7 @@ static void rcu_torture_fwd_prog_cr(void)
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
rcu_launder_gp_seq_start = gps;
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -1891,6 +1903,11 @@ static void rcu_torture_fwd_prog_cr(void)
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1911,6 +1928,7 @@ static void rcu_torture_fwd_prog_cr(void)
rcu_torture_fwd_cb_hist();
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 81105141b6a8..1694a6b57ad8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -364,7 +364,7 @@ bool rcu_eqs_special_set(int cpu)
*
* The caller must have disabled interrupts and must not be idle.
*/
-static void __maybe_unused rcu_momentary_dyntick_idle(void)
+void rcu_momentary_dyntick_idle(void)
{
int special;
@@ -375,6 +375,7 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
rcu_preempt_deferred_qs(current);
}
+EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
/**
* rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
@@ -496,7 +497,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
-static int rcu_pending(void);
+static int rcu_pending(int user);
/*
* Return the number of RCU GPs completed thus far for debug & stats.
@@ -824,6 +825,11 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
rcu_cleanup_after_idle();
incby = 1;
+ } else if (tick_nohz_full_cpu(rdp->cpu) &&
+ rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
+ READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ rdp->rcu_forced_tick = true;
+ tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
@@ -885,6 +891,21 @@ void rcu_irq_enter_irqson(void)
local_irq_restore(flags);
}
+/*
+ * If any sort of urgency was applied to the current CPU (for example,
+ * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
+ * to get to a quiescent state, disable it.
+ */
+static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
+{
+ WRITE_ONCE(rdp->rcu_urgent_qs, false);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
+ if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
+ tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ rdp->rcu_forced_tick = false;
+ }
+}
+
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
@@ -1073,6 +1094,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+ WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
@@ -1968,7 +1990,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
- rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
@@ -1979,6 +2000,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
if (!offloaded)
needwake = rcu_accelerate_cbs(rnp, rdp);
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
if (needwake)
@@ -2101,6 +2123,9 @@ int rcutree_dead_cpu(unsigned int cpu)
rcu_boost_kthread_setaffinity(rnp, -1);
/* Do any needed no-CB deferred wakeups from this CPU. */
do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
+
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@@ -2151,6 +2176,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
debug_rcu_head_unqueue(rhp);
@@ -2217,6 +2243,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Re-invoke RCU core processing if there are callbacks remaining. */
if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
/*
@@ -2241,7 +2268,7 @@ void rcu_sched_clock_irq(int user)
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
rcu_flavor_sched_clock_irq(user);
- if (rcu_pending())
+ if (rcu_pending(user))
invoke_rcu_core();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -2259,6 +2286,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
int cpu;
unsigned long flags;
unsigned long mask;
+ struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
@@ -2283,8 +2311,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
- if (f(per_cpu_ptr(&rcu_data, cpu)))
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (f(rdp)) {
mask |= bit;
+ rcu_disable_urgency_upon_qs(rdp);
+ }
}
}
if (mask != 0) {
@@ -2312,7 +2343,7 @@ void rcu_force_quiescent_state(void)
rnp = __this_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
- !raw_spin_trylock(&rnp->fqslock);
+ !raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret)
@@ -2786,8 +2817,9 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
* CPU-local state are performed first. However, we must check for CPU
* stalls first, else we might not get a chance.
*/
-static int rcu_pending(void)
+static int rcu_pending(int user)
{
+ bool gp_in_progress;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
@@ -2798,12 +2830,13 @@ static int rcu_pending(void)
if (rcu_nocb_need_deferred_wakeup(rdp))
return 1;
- /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
- if (rcu_nohz_full_cpu())
+ /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
+ if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
+ gp_in_progress = rcu_gp_in_progress();
+ if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
/* Does this CPU have callbacks ready to invoke? */
@@ -2811,8 +2844,7 @@ static int rcu_pending(void)
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
- if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) &&
+ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
!rcu_segcblist_is_offloaded(&rdp->cblist)) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
@@ -2845,7 +2877,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
{
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
rcu_barrier_trace(TPS("LastCB"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
complete(&rcu_state.barrier_completion);
} else {
rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
@@ -2869,7 +2901,7 @@ static void rcu_barrier_func(void *unused)
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
}
@@ -2896,7 +2928,7 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
@@ -2928,11 +2960,11 @@ void rcu_barrier(void)
continue;
if (rcu_segcblist_n_cbs(&rdp->cblist)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
} else {
rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ rcu_state.barrier_sequence);
}
}
put_online_cpus();
@@ -3083,6 +3115,9 @@ int rcutree_online_cpu(unsigned int cpu)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
rcutree_affinity_setting(cpu, -1);
+
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
}
@@ -3103,6 +3138,9 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
return 0;
}
@@ -3148,6 +3186,7 @@ void rcu_cpu_starting(unsigned int cpu)
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
+ rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index c612f306fe89..055c31781d3a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -181,6 +181,7 @@ struct rcu_data {
atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
+ bool rcu_forced_tick; /* Forced tick to provide QS. */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 2defc7fe74c3..fa08d55f7040 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1946,7 +1946,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
int __maybe_unused cpu = my_rdp->cpu;
unsigned long cur_gp_seq;
unsigned long flags;
- bool gotcbs;
+ bool gotcbs = false;
unsigned long j = jiffies;
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 80b60ca7767f..90e4b00ace89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -811,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -854,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
-enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@@ -919,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -3106,7 +3106,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = next;
@@ -3918,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, rf);
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, rf);
+ if (!p) {
+ put_prev_task(rq, prev);
+ p = pick_next_task_idle(rq);
+ }
return p;
}
@@ -3948,7 +3950,7 @@ restart:
put_prev_task(rq, prev);
for_each_class(class) {
- p = class->pick_next_task(rq, NULL, NULL);
+ p = class->pick_next_task(rq);
if (p)
return p;
}
@@ -6217,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
struct task_struct *next;
for_each_class(class) {
- next = class->pick_next_task(rq, NULL, NULL);
+ next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e2..d43318a489f2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
- vtime_account_system(prev);
+ vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
- vtime_account_system(tsk);
+ vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
u64 cputime, steal;
struct rq *rq = this_rq();
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
@@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk,
- struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
- /* We might have scheduled out from guest path */
- if (tsk->flags & PF_VCPU)
- vtime_account_guest(tsk, vtime);
- else
- __vtime_account_system(tsk, vtime);
+ __vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+}
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed through vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (vtime->state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But proper fix would involve interrupting target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (vtime->state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ struct rq *rq;
+ u64 val;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return cpustat[usage];
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (vtime->state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (vtime->state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a8a08030a8f7..43323f875cb9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static void set_next_task_dl(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+ if (!first)
+ return;
+
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p);
+ set_next_task_dl(rq, p, true);
return p;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69a81a5709ff..08a233e97a01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
}
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
@@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long cpu_runnable_load(struct rq *rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3764,10 +3769,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ if (sched_feat(UTIL_EST_FASTUP)) {
+ if (ue.ewma < ue.enqueued) {
+ ue.ewma = ue.enqueued;
+ goto done;
+ }
+ }
+
+ /*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
@@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
WRITE_ONCE(p->se.avg.util_est, ue);
}
@@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu)
rq->nr_running);
}
-static unsigned long cpu_runnable_load(struct rq *rq)
+static unsigned long cpu_load(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&rq->cfs);
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @rq: the runqueue of the CPU whose load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cpu_rq(cpu)->cpu_capacity;
-}
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = cpu_runnable_load(rq);
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
- if (nr_running)
- return load_avg / nr_running;
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
- return 0;
+ /* Discount task's load from CPU's load */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
@@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
-{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX;
- unsigned long this_runnable_load = ULONG_MAX;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
- unsigned long most_spare = 0, this_spare = 0;
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
- do {
- unsigned long load, avg_load, runnable_load;
- unsigned long spare_cap, max_spare_cap;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_span(group),
- p->cpus_ptr))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
- /*
- * Tally up the load of all CPUs in the group and find
- * the group containing the CPU with most spare capacity.
- */
- avg_load = 0;
- runnable_load = 0;
- max_spare_cap = 0;
-
- for_each_cpu(i, sched_group_span(group)) {
- load = cpu_runnable_load(cpu_rq(i));
- runnable_load += load;
-
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
- spare_cap = capacity_spare_without(i, p);
-
- if (spare_cap > max_spare_cap)
- max_spare_cap = spare_cap;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
-
- if (local_group) {
- this_runnable_load = runnable_load;
- this_avg_load = avg_load;
- this_spare = max_spare_cap;
- } else {
- if (min_runnable_load > (runnable_load + imbalance)) {
- /*
- * The runnable load is significantly smaller
- * so we can pick this new CPU:
- */
- min_runnable_load = runnable_load;
- min_avg_load = avg_load;
- idlest = group;
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
- (100*min_avg_load > imbalance_scale*avg_load)) {
- /*
- * The runnable loads are close so take the
- * blocked load into account through avg_load:
- */
- min_avg_load = avg_load;
- idlest = group;
- }
-
- if (most_spare < max_spare_cap) {
- most_spare = max_spare_cap;
- most_spare_sg = group;
- }
- }
- } while (group = group->next, group != sd->groups);
-
- /*
- * The cross-over point between using spare capacity or least load
- * is too conservative for high utilization tasks on partially
- * utilized systems if we require spare_capacity > task_util(p),
- * so we allow for some task stuffing by using
- * spare_capacity > task_util(p)/2.
- *
- * Spare capacity can't be used for fork because the utilization has
- * not been set yet, we must first select a rq to compute the initial
- * utilization.
- */
- if (sd_flag & SD_BALANCE_FORK)
- goto skip_spare;
-
- if (this_spare > task_util(p) / 2 &&
- imbalance_scale*this_spare > 100*most_spare)
- return NULL;
-
- if (most_spare > task_util(p) / 2)
- return most_spare_sg;
-
-skip_spare:
- if (!idlest)
- return NULL;
-
- /*
- * When comparing groups across NUMA domains, it's possible for the
- * local domain to be very lightly loaded relative to the remote
- * domains but "imbalance" skews the comparison making remote CPUs
- * look much more favourable. When considering cross-domain, add
- * imbalance to the runnable load on the remote node and consider
- * staying local.
- */
- if ((sd->flags & SD_NUMA) &&
- min_runnable_load + imbalance >= this_runnable_load)
- return NULL;
-
- if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
-
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
- (100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
-
- return idlest;
-}
+ int this_cpu, int sd_flag);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
continue;
}
- load = cpu_runnable_load(cpu_rq(i));
+ load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_without, sync it up to
+ * We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
@@ -6746,7 +6642,7 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *
+struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6890,6 +6786,11 @@ idle:
return NULL;
}
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+ return pick_next_task_fair(rq, NULL, NULL);
+}
+
/*
* Account for a descheduled task:
*/
@@ -7079,11 +6980,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
enum group_type {
- group_other = 0,
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+ * and must be migrated to a more powerful CPU.
+ */
group_misfit_task,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
group_imbalanced,
- group_overloaded,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
};
#define LBF_ALL_PINNED 0x01
@@ -7116,7 +7055,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
- enum group_type src_grp_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -7339,7 +7278,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7347,8 +7286,8 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
@@ -7381,19 +7320,46 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ switch (env->migration_type) {
+ case migrate_load:
+ load = task_h_load(p);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ if (load/2 > env->imbalance)
+ goto next;
+
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (util > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
#ifdef CONFIG_PREEMPTION
/*
@@ -7407,7 +7373,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * runnable load.
+ * load/util/tasks.
*/
if (env->imbalance <= 0)
break;
@@ -7517,6 +7483,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7536,29 +7524,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
-
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7567,9 +7537,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7584,11 +7558,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- update_blocked_load_status(rq, !done);
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7638,29 +7611,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
-
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
+ bool decayed;
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7669,6 +7629,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -7677,14 +7655,14 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
- unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_no_capacity;
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -7699,10 +7677,10 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
- unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
@@ -7713,19 +7691,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
- * We must however clear busiest_stat::avg_load because
- * update_sd_pick_busiest() reads this before assignment.
+ * We must however set busiest_stat::group_type and
+ * busiest_stat::idle_cpus to the worst busiest group because
+ * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
- .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
- .avg_load = 0UL,
- .sum_nr_running = 0,
- .group_type = group_other,
+ .idle_cpus = UINT_MAX,
+ .group_type = group_has_spare,
},
};
}
@@ -7913,13 +7890,13 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
if ((sgs->group_capacity * 100) >
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7934,13 +7911,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
* false.
*/
static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7967,19 +7944,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
}
static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(unsigned int imbalance_pct,
+ struct sched_group *group,
struct sg_lb_stats *sgs)
{
- if (sgs->group_no_capacity)
+ if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_asym_packing)
+ return group_asym_packing;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
- return group_other;
+ if (!group_has_capacity(imbalance_pct, sgs))
+ return group_fully_busy;
+
+ return group_has_spare;
}
static bool update_nohz_stats(struct rq *rq, bool force)
@@ -8016,21 +8000,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running;
+ int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- sgs->group_load += cpu_runnable_load(rq);
+ sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
+ sgs->sum_nr_running += nr_running;
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -8044,9 +8032,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/*
* No need to call idle_cpu() if nr_running is not 0
*/
- if (!nr_running && idle_cpu(i))
+ if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
+ /* Idle cpu can't have misfit task */
+ continue;
+ }
+ if (local_group)
+ continue;
+
+ /* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8054,17 +8049,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE &&
+ sgs->sum_h_nr_running &&
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+ sgs->group_asym_packing = 1;
+ }
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
+ sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+ /* Computing avg_load makes sense only when group is overloaded */
+ if (sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
}
/**
@@ -8087,6 +8089,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /* Make sure that there is at least one task to pull */
+ if (!sgs->sum_h_nr_running)
+ return false;
+
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
@@ -8095,7 +8101,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if (sgs->group_type == group_misfit_task &&
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
- !group_has_capacity(env, &sds->local_stat)))
+ sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
@@ -8104,62 +8110,88 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
-
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
-
/*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
+ * The candidate and the current busiest group are the same type of
+ * group. Let's check which one is the busiest according to the type.
*/
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_min_cpu_capacity(sds->local, sg))
- return false;
- /*
- * If we have more than one misfit sg go with the biggest misfit.
- */
- if (sgs->group_type == group_misfit_task &&
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ switch (sgs->group_type) {
+ case group_overloaded:
+ /* Select the overloaded group with highest avg_load. */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ /*
+ * Select the 1st imbalanced group as we don't have any way to
+ * choose one more than another.
+ */
return false;
-asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
+ case group_asym_packing:
+ /* Prefer to move from lowest priority CPU's work */
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+ return false;
+ break;
- /* No ASYM_PACKING if target CPU is already busy */
- if (env->idle == CPU_NOT_IDLE)
- return true;
- /*
- * ASYM_PACKING needs to move all the work to the highest
- * prority CPUs in the group, therefore mark all groups
- * of lower priority than ourself as busy.
- */
- if (sgs->sum_nr_running &&
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
- if (!sds->busiest)
- return true;
+ case group_misfit_task:
+ /*
+ * If we have more than one misfit sg go with the biggest
+ * misfit.
+ */
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ return false;
+ break;
- /* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
- sg->asym_prefer_cpu))
- return true;
+ case group_fully_busy:
+ /*
+ * Select the fully busy group with highest avg_load. In
+ * theory, there is no need to pull task from such kind of
+ * group because tasks have all compute capacity that they need
+ * but we can still improve the overall throughput by reducing
+ * contention when accessing shared HW resources.
+ *
+ * XXX for now avg_load is not computed and always 0 so we
+ * select the 1st one.
+ */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_has_spare:
+ /*
+ * Select not overloaded group with lowest number of
+ * idle cpus. We could also compare the spare capacity
+ * which is more stable but it can end up that the
+ * group has less spare capacity but finally more idle
+ * CPUs which means less opportunity to pull tasks.
+ */
+ if (sgs->idle_cpus >= busiest->idle_cpus)
+ return false;
+ break;
}
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and has higher
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+ * throughput. Maximize throughput, power/energy consequences are not
+ * considered.
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type <= group_fully_busy) &&
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
+ return false;
+
+ return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
@@ -8184,18 +8216,310 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+	/* @p is attached to another CPU: nothing to account for on @cpu. */
+	if (task_cpu(p) != cpu)
+		return 0;
+
+	/* No load-tracking update recorded yet: @p is new, no contribution. */
+	if (!READ_ONCE(p->se.avg.last_update_time))
+		return 0;
+
+	/* Count @p only while it is queued on a runqueue. */
+	return task_on_rq_queued(p) ? 1 : 0;
+}
+
+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	/* The CPU currently runs something other than the idle task or @p. */
+	if (!(rq->curr == rq->idle || rq->curr == p))
+		return 0;
+
+	/*
+	 * rq->nr_running can't be used here: a version of nr_running that
+	 * excludes the impact of @p on @cpu is needed instead. The caller
+	 * must compute and test that updated value before calling
+	 * idle_cpu_without().
+	 */
+
+#ifdef CONFIG_SMP
+	/* Entries are still pending on this runqueue's wake_list. */
+	if (!llist_empty(&rq->wake_list))
+		return 0;
+#endif
+
+	return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ *
+ * @sgs is fully reset and then filled with the load, utilization, running
+ * counts and idle CPU count of @group computed without the contribution of
+ * @p, followed by the group capacity, weight, type and (for fully busy or
+ * overloaded groups only) avg_load.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+					  struct sched_group *group,
+					  struct sg_lb_stats *sgs,
+					  struct task_struct *p)
+{
+	int i, nr_running;
+
+	memset(sgs, 0, sizeof(*sgs));
+
+	for_each_cpu(i, sched_group_span(group)) {
+		struct rq *rq = cpu_rq(i);
+		unsigned int local;
+
+		sgs->group_load += cpu_load_without(rq, p);
+		sgs->group_util += cpu_util_without(i, p);
+
+		/* 1 when @p is accounted on this CPU, so it can be discounted */
+		local = task_running_on_cpu(i, p);
+		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+
+		nr_running = rq->nr_running - local;
+		sgs->sum_nr_running += nr_running;
+
+		/*
+		 * No need to call idle_cpu_without() if nr_running is not 0
+		 */
+		if (!nr_running && idle_cpu_without(i, p))
+			sgs->idle_cpus++;
+	}
+
+	/* Check if task fits in the group */
+	if (sd->flags & SD_ASYM_CPUCAPACITY &&
+	    !task_fits_capacity(p, group->sgc->max_capacity)) {
+		sgs->group_misfit_task_load = 1;
+	}
+
+	sgs->group_capacity = group->sgc->capacity;
+
+	/*
+	 * Set the number of CPUs of the group before classifying it, exactly
+	 * as update_sg_lb_stats() does; otherwise it is left at 0 by the
+	 * memset() above.
+	 */
+	sgs->group_weight = group->group_weight;
+
+	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+	/*
+	 * Computing avg_load makes sense only when group is fully busy or
+	 * overloaded: these are the types for which update_pick_idlest() and
+	 * find_idlest_group() compare avg_load.
+	 */
+	if (sgs->group_type == group_fully_busy ||
+	    sgs->group_type == group_overloaded)
+		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+				sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+			       struct sg_lb_stats *idlest_sgs,
+			       struct sched_group *group,
+			       struct sg_lb_stats *sgs)
+{
+	/*
+	 * Different group types settle the question on their own: the
+	 * candidate replaces the current idlest only if its type is lower.
+	 */
+	if (sgs->group_type != idlest_sgs->group_type)
+		return sgs->group_type < idlest_sgs->group_type;
+
+	/*
+	 * Both groups have the same type: fall back to the criterion that is
+	 * meaningful for that type.
+	 */
+	switch (sgs->group_type) {
+	case group_overloaded:
+	case group_fully_busy:
+		/* The candidate wins only with a strictly lower avg_load. */
+		return sgs->avg_load < idlest_sgs->avg_load;
+
+	case group_imbalanced:
+	case group_asym_packing:
+		/* Those types are not used in the slow wakeup path */
+		return false;
+
+	case group_misfit_task:
+		/* The candidate wins only with a strictly higher max capacity. */
+		return group->sgc->max_capacity > idlest->sgc->max_capacity;
+
+	case group_has_spare:
+		/* The candidate wins only with strictly more idle CPUs. */
+		return sgs->idle_cpus > idlest_sgs->idle_cpus;
+	}
+
+	return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ *
+ * Returns the selected non-local group when the task should be pushed to it,
+ * or NULL when there is no such group or when the group containing @this_cpu
+ * is the better choice.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+		  int this_cpu, int sd_flag)
+{
+	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+	struct sg_lb_stats local_sgs, tmp_sgs;
+	struct sg_lb_stats *sgs;
+	unsigned long imbalance;
+	struct sg_lb_stats idlest_sgs = {
+			.avg_load = UINT_MAX,
+			.group_type = group_overloaded,
+	};
+
+	imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
+
+	do {
+		int local_group;
+
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpumask_intersects(sched_group_span(group),
+					p->cpus_ptr))
+			continue;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_span(group));
+
+		if (local_group) {
+			sgs = &local_sgs;
+			local = group;
+		} else {
+			sgs = &tmp_sgs;
+		}
+
+		update_sg_wakeup_stats(sd, group, sgs, p);
+
+		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+			idlest = group;
+			idlest_sgs = *sgs;
+		}
+
+	} while (group = group->next, group != sd->groups);
+
+
+	/* There is no idlest group to push tasks to */
+	if (!idlest)
+		return NULL;
+
+	/*
+	 * The local group has been skipped because of CPU affinity: local is
+	 * still NULL and local_sgs was never filled, so none of the
+	 * comparisons below would be meaningful. Go with the idlest group.
+	 */
+	if (!local)
+		return idlest;
+
+	/*
+	 * If the local group is idler than the selected idlest group
+	 * don't try and push the task.
+	 */
+	if (local_sgs.group_type < idlest_sgs.group_type)
+		return NULL;
+
+	/*
+	 * If the local group is busier than the selected idlest group
+	 * try and push the task.
+	 */
+	if (local_sgs.group_type > idlest_sgs.group_type)
+		return idlest;
+
+	switch (local_sgs.group_type) {
+	case group_overloaded:
+	case group_fully_busy:
+		/*
+		 * When comparing groups across NUMA domains, it's possible for
+		 * the local domain to be very lightly loaded relative to the
+		 * remote domains but "imbalance" skews the comparison making
+		 * remote CPUs look much more favourable. When considering
+		 * cross-domain, add imbalance to the load on the remote node
+		 * and consider staying local.
+		 */
+
+		if ((sd->flags & SD_NUMA) &&
+		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+			return NULL;
+
+		/*
+		 * If the local group is less loaded than the selected
+		 * idlest group don't try and push any tasks.
+		 */
+		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+			return NULL;
+
+		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+			return NULL;
+		break;
+
+	case group_imbalanced:
+	case group_asym_packing:
+		/* Those types are not used in the slow wakeup path */
+		return NULL;
+
+	case group_misfit_task:
+		/* Select group with the highest max capacity */
+		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+			return NULL;
+		break;
+
+	case group_has_spare:
+		if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+			int idlest_cpu;
+			/*
+			 * If there is spare capacity at NUMA, try to select
+			 * the preferred node
+			 */
+			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+				return NULL;
+
+			idlest_cpu = cpumask_first(sched_group_span(idlest));
+			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+				return idlest;
+#endif
+			/*
+			 * Otherwise, keep the task on this node to stay close
+			 * to its wakeup source and improve locality. If there
+			 * is a real need of migration, periodic load balance
+			 * will take care of it.
+			 */
+			if (local_sgs.idle_cpus)
+				return NULL;
+		}
+
+		/*
+		 * Select group with highest number of idle CPUs. We could also
+		 * compare the utilization which is more stable but it can end
+		 * up that the group has less spare capacity but finally more
+		 * idle CPUs which means more opportunity to run task.
+		 */
+		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+			return NULL;
+		break;
+	}
+
+	return idlest;
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
+
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
@@ -8222,22 +8546,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (local_group)
goto next_group;
- /*
- * In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity so that we'll try
- * and move all the excess tasks away. We lower the capacity
- * of a group only if the local group has the capacity to fit
- * these excess tasks. The extra check prevents the case where
- * you always pull from the heaviest group when it is already
- * under-utilized (possible with a large weight task outweighs
- * the tasks on the system).
- */
- if (prefer_sibling && sds->local &&
- group_has_capacity(env, local) &&
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
- sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
- }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -8246,13 +8554,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
- sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
+ /* Tag domain that child domain prefers tasks go to siblings first */
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8283,203 +8593,160 @@ next_group:
}
/**
- * check_asym_packing - Check to see if the group is packed into the
- * sched domain.
- *
- * This is primarily intended to used at the sibling level. Some
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle. When in lower SMT modes, the threads will
- * perform better since they share less core resources. Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads. It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on. Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in env->imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
-{
- int busiest_cpu;
-
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
-
- if (env->idle == CPU_NOT_IDLE)
- return 0;
-
- if (!sds->busiest)
- return 0;
-
- busiest_cpu = sds->busiest->asym_prefer_cpu;
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
- return 0;
-
- env->imbalance = sds->busiest_stat.group_load;
-
- return 1;
-}
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- * amongst the groups of a sched_domain, during
- * load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, capa_now = 0, capa_move = 0;
- unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (!local->sum_nr_running)
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
- else if (busiest->load_per_task > local->load_per_task)
- imbn = 1;
+ if (busiest->group_type == group_misfit_task) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ return;
+ }
- scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- busiest->group_capacity;
+ if (busiest->group_type == group_asym_packing) {
+ /*
+ * In case of asym packing, we will try to migrate all load to
+ * the preferred CPU.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = busiest->sum_h_nr_running;
+ return;
+ }
- if (busiest->avg_load + scaled_busy_load_per_task >=
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
- env->imbalance = busiest->load_per_task;
+ if (busiest->group_type == group_imbalanced) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure CPU-load equilibrium, try to move any task to fix
+ * the imbalance. The next load balance will take care of
+ * balancing back the system.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
return;
}
/*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU capacity used by
- * moving them.
+ * Try to use spare capacity of local group without overloading it or
+ * emptying busiest.
+ * XXX Spreading tasks across NUMA nodes is not always the best policy
+ * and special care should be taken for SD_NUMA domain level before
+ * spreading the tasks. For now, load_balance() fully relies on
+ * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
+ if (local->group_type == group_has_spare) {
+ if (busiest->group_type > group_fully_busy) {
+ /*
+ * If busiest is overloaded, try to fill spare
+ * capacity. This might end up creating spare capacity
+ * in busiest or busiest still being overloaded but
+ * there is no simple way to directly compute the
+ * amount of load to migrate in order to balance the
+ * system.
+ */
+ env->migration_type = migrate_util;
+ env->imbalance = max(local->group_capacity, local->group_util) -
+ local->group_util;
- capa_now += busiest->group_capacity *
- min(busiest->load_per_task, busiest->avg_load);
- capa_now += local->group_capacity *
- min(local->load_per_task, local->avg_load);
- capa_now /= SCHED_CAPACITY_SCALE;
-
- /* Amount of load we'd subtract */
- if (busiest->avg_load > scaled_busy_load_per_task) {
- capa_move += busiest->group_capacity *
- min(busiest->load_per_task,
- busiest->avg_load - scaled_busy_load_per_task);
- }
-
- /* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_capacity <
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
- tmp = (busiest->avg_load * busiest->group_capacity) /
- local->group_capacity;
- } else {
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- local->group_capacity;
- }
- capa_move += local->group_capacity *
- min(local->load_per_task, local->avg_load + tmp);
- capa_move /= SCHED_CAPACITY_SCALE;
-
- /* Move if we gain throughput */
- if (capa_move > capa_now)
- env->imbalance = busiest->load_per_task;
-}
+ /*
+ * In some cases, the group's utilization is max or even
+ * higher than capacity because of migrations but the
+ * local CPU is (newly) idle. There is at least one
+ * waiting task in this overloaded busiest group. Let's
+ * try to pull it.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ }
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- * groups of a given sched_domain during load balance.
- * @env: load balance environment
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
- unsigned long max_pull, load_above_capacity = ~0UL;
- struct sg_lb_stats *local, *busiest;
+ return;
+ }
- local = &sds->local_stat;
- busiest = &sds->busiest_stat;
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
+ unsigned int nr_diff = busiest->sum_nr_running;
+ /*
+ * When prefer sibling, evenly spread running tasks on
+ * groups.
+ */
+ env->migration_type = migrate_task;
+ lsub_positive(&nr_diff, local->sum_nr_running);
+ env->imbalance = nr_diff >> 1;
+ return;
+ }
- if (busiest->group_type == group_imbalanced) {
/*
- * In the group_imb case we cannot rely on group-wide averages
- * to ensure CPU-load equilibrium, look at wider averages. XXX
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
*/
- busiest->load_per_task =
- min(busiest->load_per_task, sds->avg_load);
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
+ busiest->idle_cpus) >> 1);
+ return;
}
/*
- * Avg load of busiest sg can be less and avg load of local sg can
- * be greater than avg load across all sgs of sd because avg load
- * factors in sg capacity and sgs with smaller group_type are
- * skipped when updating the busiest sg:
+ * Local is fully busy but has to take more load to relieve the
+ * busiest group
*/
- if (busiest->group_type != group_misfit_task &&
- (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load)) {
- env->imbalance = 0;
- return fix_small_imbalance(env, sds);
- }
+ if (local->group_type < group_overloaded) {
+ /*
+ * Local will become overloaded so the avg_load metrics are
+ * finally needed.
+ */
- /*
- * If there aren't any idle CPUs, avoid creating some.
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
- if (load_above_capacity > busiest->group_capacity) {
- load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
- load_above_capacity /= busiest->group_capacity;
- } else
- load_above_capacity = ~0UL;
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
- * We're trying to get all the CPUs to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded CPU below the average load. At the same time,
- * we also don't want to reduce the group load below the group
- * capacity. Thus we look for the minimum possible imbalance.
+ * Both groups are or will become overloaded and we're trying to get all
+ * the CPUs to the average_load, so we don't want to push ourselves
+ * above the average load, nor do we wish to reduce the max loaded CPU
+ * below the average load. At the same time, we also don't want to
+ * reduce the group load below the group capacity. Thus we look for
+ * the minimum possible imbalance.
*/
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
- /* How much load to actually move to equalise the imbalance */
+ env->migration_type = migrate_load;
env->imbalance = min(
- max_pull * busiest->group_capacity,
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
-
- /* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = max_t(long, env->imbalance,
- busiest->group_misfit_task_load);
- }
-
- /*
- * if *imbalance is less than the average load per runnable task
- * there is no guarantee that any tasks will be moved so we'll have
- * a think about bumping its value to force at least one task to be
- * moved
- */
- if (env->imbalance < busiest->load_per_task)
- return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
+/*
+ * Decision matrix according to the local and busiest group type:
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare nr_idle balanced N/A N/A balanced balanced
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
+ * misfit_task force N/A N/A N/A force force
+ * asym_packing force force N/A N/A force force
+ * imbalanced force force N/A N/A force force
+ * overloaded force force N/A N/A force avg_load
+ *
+ * N/A : Not Applicable because already filtered while updating
+ * statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force : Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
+ * different in groups.
+ */
+
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
@@ -8499,7 +8766,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
init_sd_lb_stats(&sds);
/*
- * Compute the various statistics relavent for load balancing at
+ * Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
@@ -8514,17 +8781,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- if (check_asym_packing(env, &sds))
- return sds.busiest;
-
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || busiest->sum_nr_running == 0)
+ if (!sds.busiest)
goto out_balanced;
- /* XXX broken for overlapping NUMA groups */
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
- / sds.total_capacity;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
@@ -8535,55 +8802,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/*
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- * capacities from resulting in underutilization due to avg_load.
- */
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
-
- /* Misfit tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task)
- goto force_balance;
-
- /*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
- if (local->avg_load >= busiest->avg_load)
+ if (local->group_type > busiest->group_type)
goto out_balanced;
/*
- * Don't pull any tasks if this group is already above the domain
- * average load.
+ * When groups are overloaded, use the avg_load to ensure fairness
+ * between tasks.
*/
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
+ if (local->group_type == group_overloaded) {
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+ sds.total_capacity;
- if (env->idle == CPU_IDLE) {
/*
- * This CPU is idle. If the busiest group is not overloaded
- * and there is no imbalance between this and busiest group
- * wrt idle CPUs, it is balanced. The imbalance becomes
- * significant if the diff is greater than 1 otherwise we
- * might end up to just move the imbalance on another group
+ * Don't pull any tasks if this group is already above the
+ * domain average load.
*/
- if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- } else {
+
/*
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
- * imbalance_pct to be conservative.
+ * If the busiest group is more loaded, use imbalance_pct to be
+ * conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
+ /* Try to move all excess tasks to child's sibling domain */
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ busiest->sum_nr_running > local->sum_nr_running + 1)
+ goto force_balance;
+
+ if (busiest->group_type != group_overloaded) {
+ if (env->idle == CPU_NOT_IDLE)
+ /*
+ * If the busiest group is not overloaded (and as a
+ * result the local one too) but this CPU is already
+ * busy, let another idle CPU try to pull task.
+ */
+ goto out_balanced;
+
+ if (busiest->group_weight > 1 &&
+ local->idle_cpus <= (busiest->idle_cpus + 1))
+ /*
+ * If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest
+ * group wrt idle CPUs, it is balanced. The imbalance
+ * becomes significant if the diff is greater than 1
+ * otherwise we might end up to just move the imbalance
+ * on another group. Of course this applies only if
+ * there is more than 1 CPU per group.
+ */
+ goto out_balanced;
+
+ if (busiest->sum_h_nr_running == 1)
+ /*
+ * busiest doesn't have any tasks waiting to run
+ */
+ goto out_balanced;
+ }
+
force_balance:
/* Looks like there is an imbalance. Compute it */
- env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8599,11 +8891,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_capacity = 1;
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int busiest_nr = 0;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, load;
+ unsigned long capacity, load, util;
+ unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8631,20 +8925,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
- * seek the "biggest" misfit task.
- */
- if (env->src_grp_type == group_misfit_task) {
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- continue;
- }
-
capacity = capacity_of(i);
+ nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -8654,35 +8936,69 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
capacity_of(env->dst_cpu) < capacity &&
- rq->nr_running == 1)
+ nr_running == 1)
continue;
- load = cpu_runnable_load(rq);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * When comparing with load imbalance, use cpu_load()
+ * which is not scaled with the CPU capacity.
+ */
+ load = cpu_load(rq);
- /*
- * When comparing with imbalance, use cpu_runnable_load()
- * which is not scaled with the CPU capacity.
- */
+ if (nr_running == 1 && load > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ break;
- if (rq->nr_running == 1 && load > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
- continue;
+ /*
+ * For the load comparisons with the other CPUs,
+ * consider the cpu_load() scaled with the CPU
+ * capacity, so that the load can be moved away
+ * from the CPU that is potentially running at a
+ * lower capacity.
+ *
+ * Thus we're looking for max(load_i / capacity_i),
+ * crosswise multiplication to rid ourselves of the
+ * division works out to:
+ * load_i * capacity_j > load_j * capacity_i;
+ * where j is our previous maximum.
+ */
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_util:
+ util = cpu_util(cpu_of(rq));
+
+ if (busiest_util < util) {
+ busiest_util = util;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_task:
+ if (busiest_nr < nr_running) {
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_misfit:
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
+ * simply seek the "biggest" misfit task.
+ */
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ break;
- /*
- * For the load comparisons with the other CPU's, consider
- * the cpu_runnable_load() scaled with the CPU capacity, so
- * that the load can be moved away from the CPU that is
- * potentially running at a lower capacity.
- *
- * Thus we're looking for max(load_i / capacity_i), crosswise
- * multiplication to rid ourselves of the division works out
- * to: load_i * capacity_j > load_j * capacity_i; where j is
- * our previous maximum.
- */
- if (load * busiest_capacity > busiest_load * capacity) {
- busiest_load = load;
- busiest_capacity = capacity;
- busiest = rq;
}
}
@@ -8728,7 +9044,7 @@ voluntary_active_balance(struct lb_env *env)
return 1;
}
- if (env->src_grp_type == group_misfit_task)
+ if (env->migration_type == migrate_misfit)
return 1;
return 0;
@@ -9757,6 +10073,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ * < 0 - we released the lock and there are !fair tasks present
+ * 0 - failed, no new tasks
+ * > 0 - success, new (fair) tasks present
*/
int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
@@ -10151,7 +10472,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_next_task_fair(struct rq *rq, struct task_struct *p)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
@@ -10433,7 +10754,7 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
- .pick_next_task = pick_next_task_fair,
+ .pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 2410db5e9a35..7481cd96f391 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
+SCHED_FEAT(UTIL_EST_FASTUP, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f65ef1e2f204..ffa959e91227 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* update no idle residency and return.
*/
if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
+ dev->last_residency_ns = 0;
local_irq_enable();
return -EBUSY;
}
@@ -165,7 +165,9 @@ static void cpuidle_idle_call(void)
* until a proper wakeup interrupt happens.
*/
- if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
+ u64 max_latency_ns;
+
if (idle_should_enter_s2idle()) {
rcu_idle_enter();
@@ -176,12 +178,16 @@ static void cpuidle_idle_call(void)
}
rcu_idle_exit();
+
+ max_latency_ns = U64_MAX;
+ } else {
+ max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
rcu_idle_enter();
- next_state = cpuidle_find_deepest_state(drv, dev);
+ next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
} else {
bool stop_tick = true;
@@ -311,7 +317,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void play_idle(unsigned long duration_us)
+void play_idle_precise(u64 duration_ns, u64 latency_ns)
{
struct idle_timer it;
@@ -323,29 +329,29 @@ void play_idle(unsigned long duration_us)
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
- WARN_ON_ONCE(!duration_us);
+ WARN_ON_ONCE(!duration_ns);
rcu_sleep_check();
preempt_disable();
current->flags |= PF_IDLE;
- cpuidle_use_deepest_state(true);
+ cpuidle_use_deepest_state(latency_ns);
it.done = 0;
hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC),
+ hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
HRTIMER_MODE_REL_PINNED);
while (!READ_ONCE(it.done))
do_idle();
- cpuidle_use_deepest_state(false);
+ cpuidle_use_deepest_state(0);
current->flags &= ~PF_IDLE;
preempt_fold_need_resched();
preempt_enable();
}
-EXPORT_SYMBOL_GPL(play_idle);
+EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
@@ -385,21 +391,17 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
-static void set_next_task_idle(struct rq *rq, struct task_struct *next)
+static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
- if (prev)
- put_prev_task(rq, prev);
-
- set_next_task_idle(rq, next);
+ set_next_task_idle(rq, next, true);
return next;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9b8adc01be3d..e591d40fd645 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1515,13 +1515,16 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+ if (!first)
+ return;
+
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
@@ -1564,18 +1567,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_rt_runnable(rq))
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p);
+ set_next_task_rt(rq, p, true);
return p;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8870c5bd7df..280a3c735935 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1713,22 +1713,10 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
- /*
- * Both @prev and @rf are optional and may be NULL, in which case the
- * caller must already have invoked put_prev_task(rq, prev, rf).
- *
- * Otherwise it is the responsibility of the pick_next_task() to call
- * put_prev_task() on the @prev task or something equivalent, IFF it
- * returns a next task.
- *
- * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
- * higher prio class has runnable tasks.
- */
- struct task_struct * (*pick_next_task)(struct rq *rq,
- struct task_struct *prev,
- struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
- void (*set_next_task)(struct rq *rq, struct task_struct *p);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -1780,7 +1768,7 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != next);
- next->sched_class->set_next_task(rq, next);
+ next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
@@ -1821,6 +1809,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+extern struct task_struct *pick_next_task_idle(struct rq *rq);
+
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
@@ -2309,7 +2300,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c0640739e05e..4c9e9975684f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -29,20 +29,17 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_stop(struct rq *rq)
{
- WARN_ON_ONCE(prev || rf);
-
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop);
+ set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 49b835f1305f..6ec1e595b1d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd,
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
- else
- request = default_relax_domain_level;
+ request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
- if (request < sd->level) {
+
+ if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* Turn on idle balance on this domain: */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..ba059fbfc53a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
* @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
@@ -183,26 +182,44 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
* accessing the task state.
*/
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+ void *key)
{
- int wake_flags = 1; /* XXX WF_SYNC */
-
if (unlikely(!wq_head))
return;
- if (unlikely(nr_exclusive != 1))
- wake_flags = 0;
-
- __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+ __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+/**
+ * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key)
+{
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
+
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
{
- __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+ __wake_up_sync_key(wq_head, mode, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..12d2227e5786 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -75,6 +75,7 @@ struct seccomp_knotif {
/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
int error;
long val;
+ u32 flags;
/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
struct completion ready;
@@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
-static void seccomp_do_user_notification(int this_syscall,
- struct seccomp_filter *match,
- const struct seccomp_data *sd)
+static int seccomp_do_user_notification(int this_syscall,
+ struct seccomp_filter *match,
+ const struct seccomp_data *sd)
{
int err;
+ u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
@@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall,
if (err == 0) {
ret = n.val;
err = n.error;
+ flags = n.flags;
}
/*
@@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall,
list_del(&n.list);
out:
mutex_unlock(&match->notify_lock);
+
+ /* Userspace requests to continue the syscall. */
+ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return 0;
+
syscall_set_return_value(current, task_pt_regs(current),
err, ret);
+ return -1;
}
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
@@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_USER_NOTIF:
- seccomp_do_user_notification(this_syscall, match, sd);
- goto skip;
+ if (seccomp_do_user_notification(this_syscall, match, sd))
+ goto skip;
+
+ return 0;
case SECCOMP_RET_LOG:
seccomp_log(this_syscall, 0, action, true);
@@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (copy_from_user(&resp, buf, sizeof(resp)))
return -EFAULT;
- if (resp.flags)
+ if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return -EINVAL;
+
+ if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+ (resp.error || resp.val))
return -EINVAL;
ret = mutex_lock_interruptible(&filter->notify_lock);
@@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->state = SECCOMP_NOTIFY_REPLIED;
knotif->error = resp.error;
knotif->val = resp.val;
+ knotif->flags = resp.flags;
complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index c9ea7eb2cb1a..2af66e449aa6 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -142,7 +142,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
.store = store,
.size = size,
/* skip this function if they are tracing us */
- .skip = skipnr + !!(current == tsk),
+ .skip = skipnr + (current == tsk),
};
if (!try_get_task_stack(tsk))
@@ -300,7 +300,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
.entries = store,
.max_entries = size,
/* skip this function if they are tracing us */
- .skip = skipnr + !!(current == task),
+ .skip = skipnr + (current == task),
};
save_stack_trace_tsk(task, &trace);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 998d50ee2d9b..1fe34a9fabc2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -235,6 +235,7 @@ static int multi_cpu_stop(void *data)
*/
touch_nmi_watchdog();
}
+ rcu_momentary_dyntick_idle();
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
diff --git a/kernel/sys.c b/kernel/sys.c
index a611d1d58c7d..d3aef31e24dc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1763,8 +1763,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
unlock_task_sighand(p, &flags);
out:
- r->ru_utime = ns_to_timeval(utime);
- r->ru_stime = ns_to_timeval(stime);
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 34b76895b81e..3b69a560a7ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -410,6 +410,29 @@ COND_SYSCALL(send);
COND_SYSCALL(bdflush);
COND_SYSCALL(uselib);
+/* optional: time32 */
+COND_SYSCALL(time32);
+COND_SYSCALL(stime32);
+COND_SYSCALL(utime32);
+COND_SYSCALL(adjtimex_time32);
+COND_SYSCALL(sched_rr_get_interval_time32);
+COND_SYSCALL(nanosleep_time32);
+COND_SYSCALL(rt_sigtimedwait_time32);
+COND_SYSCALL_COMPAT(rt_sigtimedwait_time32);
+COND_SYSCALL(timer_settime32);
+COND_SYSCALL(timer_gettime32);
+COND_SYSCALL(clock_settime32);
+COND_SYSCALL(clock_gettime32);
+COND_SYSCALL(clock_getres_time32);
+COND_SYSCALL(clock_nanosleep_time32);
+COND_SYSCALL(utimes_time32);
+COND_SYSCALL(futimesat_time32);
+COND_SYSCALL(pselect6_time32);
+COND_SYSCALL_COMPAT(pselect6_time32);
+COND_SYSCALL(ppoll_time32);
+COND_SYSCALL_COMPAT(ppoll_time32);
+COND_SYSCALL(utimensat_time32);
+COND_SYSCALL(clock_adjtime32);
/*
* The syscalls below are not found in include/uapi/asm-generic/unistd.h
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6f2f35d0bcf..70665934d53e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = {
.procname = "drop_caches",
.data = &sysctl_drop_caches,
.maxlen = sizeof(int),
- .mode = 0644,
+ .mode = 0200,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = SYSCTL_ONE,
.extra2 = &four,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 73c132095a7b..7d550cc76a3b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -18,1317 +18,12 @@
#include <linux/slab.h>
#include <linux/compat.h>
-#ifdef CONFIG_SYSCTL_SYSCALL
-
-struct bin_table;
-typedef ssize_t bin_convert_t(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
-
-static bin_convert_t bin_dir;
-static bin_convert_t bin_string;
-static bin_convert_t bin_intvec;
-static bin_convert_t bin_ulongvec;
-static bin_convert_t bin_uuid;
-static bin_convert_t bin_dn_node_address;
-
-#define CTL_DIR bin_dir
-#define CTL_STR bin_string
-#define CTL_INT bin_intvec
-#define CTL_ULONG bin_ulongvec
-#define CTL_UUID bin_uuid
-#define CTL_DNADR bin_dn_node_address
-
-#define BUFSZ 256
-
-struct bin_table {
- bin_convert_t *convert;
- int ctl_name;
- const char *procname;
- const struct bin_table *child;
-};
-
-static const struct bin_table bin_random_table[] = {
- { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
- { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
- { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
- { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
- { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
- { CTL_UUID, RANDOM_UUID, "uuid" },
- {}
-};
-
-static const struct bin_table bin_pty_table[] = {
- { CTL_INT, PTY_MAX, "max" },
- { CTL_INT, PTY_NR, "nr" },
- {}
-};
-
-static const struct bin_table bin_kern_table[] = {
- { CTL_STR, KERN_OSTYPE, "ostype" },
- { CTL_STR, KERN_OSRELEASE, "osrelease" },
- /* KERN_OSREV not used */
- { CTL_STR, KERN_VERSION, "version" },
- /* KERN_SECUREMASK not used */
- /* KERN_PROF not used */
- { CTL_STR, KERN_NODENAME, "hostname" },
- { CTL_STR, KERN_DOMAINNAME, "domainname" },
-
- { CTL_INT, KERN_PANIC, "panic" },
- { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
-
- { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
- { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
- { CTL_INT, KERN_PRINTK, "printk" },
-
- /* KERN_NAMETRANS not used */
- /* KERN_PPC_HTABRECLAIM not used */
- /* KERN_PPC_ZEROPAGED not used */
- { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
-
- { CTL_STR, KERN_MODPROBE, "modprobe" },
- { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
- { CTL_INT, KERN_ACCT, "acct" },
- /* KERN_PPC_L2CR "l2cr" no longer used */
-
- /* KERN_RTSIGNR not used */
- /* KERN_RTSIGMAX not used */
-
- { CTL_ULONG, KERN_SHMMAX, "shmmax" },
- { CTL_INT, KERN_MSGMAX, "msgmax" },
- { CTL_INT, KERN_MSGMNB, "msgmnb" },
- /* KERN_MSGPOOL not used*/
- { CTL_INT, KERN_SYSRQ, "sysrq" },
- { CTL_INT, KERN_MAX_THREADS, "threads-max" },
- { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
- { CTL_ULONG, KERN_SHMALL, "shmall" },
- { CTL_INT, KERN_MSGMNI, "msgmni" },
- { CTL_INT, KERN_SEM, "sem" },
- { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
- { CTL_INT, KERN_SHMMNI, "shmmni" },
-
- { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
- { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
-
- { CTL_STR, KERN_HOTPLUG, "hotplug", },
- { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
-
- { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
- { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
- /* KERN_TAINTED "tainted" no longer used */
- { CTL_INT, KERN_CADPID, "cad_pid" },
- { CTL_INT, KERN_PIDMAX, "pid_max" },
- { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
- { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
- { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
- { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
-
- { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
- { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
-
- { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
- { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
- { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
- /* KERN_HZ_TIMER "hz_timer" no longer used */
- { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
- { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
- { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
-
- { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
- /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
- { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
- { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
- { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
- { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
- { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
- { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" },
- {}
-};
-
-static const struct bin_table bin_vm_table[] = {
- { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
- { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
- { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
- { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
- /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
- /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
- /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
- { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
- /* VM_PAGEBUF unused */
- /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
- { CTL_INT, VM_SWAPPINESS, "swappiness" },
- { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
- { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
- { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
- { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
- { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
- { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
- { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
- { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
- /* VM_SWAP_TOKEN_TIMEOUT unused */
- { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
- { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
- { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
- { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
- { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
- { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
- { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
-
- {}
-};
-
-static const struct bin_table bin_net_core_table[] = {
- { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
- { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
- { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
- { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
- /* NET_CORE_DESTROY_DELAY unused */
- { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
- /* NET_CORE_FASTROUTE unused */
- { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
- { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
- { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
- /* NET_CORE_HOT_LIST_LENGTH unused */
- /* NET_CORE_DIVERT_VERSION unused */
- /* NET_CORE_NO_CONG_THRESH unused */
- /* NET_CORE_NO_CONG unused */
- /* NET_CORE_LO_CONG unused */
- /* NET_CORE_MOD_CONG unused */
- { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
- { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
- { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
- { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
- { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
- { CTL_INT, NET_CORE_WARNINGS, "warnings" },
- {},
-};
-
-static const struct bin_table bin_net_unix_table[] = {
- /* NET_UNIX_DESTROY_DELAY unused */
- /* NET_UNIX_DELETE_DELAY unused */
- { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_route_table[] = {
- { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
- /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
- /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
- { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
- { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
- { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
- { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
- { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
- /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
- { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
- { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
- { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
- { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
- { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
- { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
- { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
- { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
- { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
-
- { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
- { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
- { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
- { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
- { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
- { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
- { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
- { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
- { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
- { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
- { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
- { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
- { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
- { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
- { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
- { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
-
- { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
- { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
- { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
- { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_conf_table[] = {
- { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
- { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
- {}
-};
-
-static const struct bin_table bin_net_neigh_vars_table[] = {
- { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
- { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
- { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
- /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
- { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
- { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
- { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
- { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
- { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
- /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
- /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
- /* NET_NEIGH_LOCKTIME "locktime" no longer used */
- { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
- { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
- { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
- { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
- { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
- { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
- {}
-};
-
-static const struct bin_table bin_net_neigh_table[] = {
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
- { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_netfilter_table[] = {
- { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
-
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
-
- /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
- /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
- /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
- /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
-
- { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
- /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
-
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
- /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
-
- { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
- { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
- {}
-};
-
-static const struct bin_table bin_net_ipv4_table[] = {
- {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
-
- { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
- { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
- { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
- /* NET_IPV4_FIB_HASH unused */
- { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
-
- { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
- { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
- { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
- { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
- { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
- /* NET_IPV4_AUTOCONFIG unused */
- { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
- { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
- { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
- { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
- { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
- { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
- { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
- { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
- { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
- { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
- { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
- { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
- { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
- { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
- { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
- { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
- { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
- { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
- { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
- { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
- { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
- { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
- { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
- { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
- { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
- { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
- { CTL_INT, NET_TCP_FACK, "tcp_fack" },
- { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
- { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
- { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
- { CTL_INT, NET_TCP_MEM, "tcp_mem" },
- { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
- { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
- { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
- { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
- { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
- { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
- { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
- { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
- { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
- { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
- { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
- { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
- { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
- { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
- { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
- { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
- { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
- { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
- { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
- { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
- /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
- { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
- { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
-
- { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
- { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
- { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
- { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
- { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
- { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
-
- { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
- { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
- { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
-
- { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
- /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
-
- { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
-
- /* NET_TCP_DEFAULT_WIN_SCALE unused */
- /* NET_TCP_BIC_BETA unused */
- /* NET_IPV4_TCP_MAX_KA_PROBES unused */
- /* NET_IPV4_IP_MASQ_DEBUG unused */
- /* NET_TCP_SYN_TAILDROP unused */
- /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
- /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
- /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
- /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
- /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
- /* NET_IPV4_ALWAYS_DEFRAG unused */
- {}
-};
-
-static const struct bin_table bin_net_ipx_table[] = {
- { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
- /* NET_IPX_FORWARDING unused */
- {}
-};
-
-static const struct bin_table bin_net_atalk_table[] = {
- { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
- { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
- { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
- { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
- {},
-};
-
-static const struct bin_table bin_net_netrom_table[] = {
- { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
- { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
- { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
- { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
- { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
- { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
- { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
- { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
- { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
- { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
- { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
- { CTL_INT, NET_NETROM_RESET, "reset" },
- {}
-};
-
-static const struct bin_table bin_net_ax25_param_table[] = {
- { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
- { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
- { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
- { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
- { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
- { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
- { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
- { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
- { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
- { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
- { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
- { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
- { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
- { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_ax25_table[] = {
- { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
- {}
-};
-
-static const struct bin_table bin_net_rose_table[] = {
- { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
- { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
- { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
- { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
- { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
- { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
- { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
- { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
- { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
- { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_conf_var_table[] = {
- { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
- { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
- { CTL_INT, NET_IPV6_MTU, "mtu" },
- { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
- { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
- { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
- { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
- { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
- { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
- { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
- { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
- { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
- { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
- { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
- { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
- { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
- { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
- { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
- { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
- { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
- { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_conf_table[] = {
- { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
- { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
- { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_route_table[] = {
- /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
- { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
- { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
- { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
- { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
- { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
- { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
- { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
- { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
- { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_icmp_table[] = {
- { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
- {}
-};
-
-static const struct bin_table bin_net_ipv6_table[] = {
- { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
- { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
- { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
- { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
- { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
- { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
- { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
- { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
- { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
- { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
- { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
- {}
-};
-
-static const struct bin_table bin_net_x25_table[] = {
- { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
- { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
- { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
- { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
- { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
- { CTL_INT, NET_X25_FORWARD, "x25_forward" },
- {}
-};
-
-static const struct bin_table bin_net_tr_table[] = {
- { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
- {}
-};
-
-
-static const struct bin_table bin_net_decnet_conf_vars[] = {
- { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
- { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
- { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
- { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
- {}
-};
-
-static const struct bin_table bin_net_decnet_conf[] = {
- { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
- { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
- { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
- {}
-};
-
-static const struct bin_table bin_net_decnet_table[] = {
- { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
- { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
- { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
- { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
- { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
- { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
- { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
- { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
- { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
- { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
- { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
- { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
- { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
- { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
- {}
-};
-
-static const struct bin_table bin_net_sctp_table[] = {
- { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
- { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
- { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
- { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
- { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
- { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
- { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
- { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
- { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
- { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
- { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
- { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
- { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
- { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
- { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
- { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
- { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
- {}
-};
-
-static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
- { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
- { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
- { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
- { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
- {}
-};
-
-static const struct bin_table bin_net_llc_station_table[] = {
- { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
- {}
-};
-
-static const struct bin_table bin_net_llc_llc2_table[] = {
- { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
- {}
-};
-
-static const struct bin_table bin_net_llc_table[] = {
- { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
- { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
- {}
-};
-
-static const struct bin_table bin_net_netfilter_table[] = {
- { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
- /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
- /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
- /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
- /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
- { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
- /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
- { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
- { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
- /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
- /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
- /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
- { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
- { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
- { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
-
- {}
-};
-
-static const struct bin_table bin_net_table[] = {
- { CTL_DIR, NET_CORE, "core", bin_net_core_table },
- /* NET_ETHER not used */
- /* NET_802 not used */
- { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
- { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
- { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
- { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
- { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
- { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
- /* NET_BRIDGE "bridge" no longer used */
- { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
- { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
- { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
- { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
- { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
- /* NET_ECONET not used */
- { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
- { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
- { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
- /* NET_DCCP "dccp" no longer used */
- /* NET_IRDA "irda" no longer used */
- { CTL_INT, 2089, "nf_conntrack_max" },
- {}
-};
-
-static const struct bin_table bin_fs_quota_table[] = {
- { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
- { CTL_INT, FS_DQ_DROPS, "drops" },
- { CTL_INT, FS_DQ_READS, "reads" },
- { CTL_INT, FS_DQ_WRITES, "writes" },
- { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
- { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
- { CTL_INT, FS_DQ_FREE, "free_dquots" },
- { CTL_INT, FS_DQ_SYNCS, "syncs" },
- { CTL_INT, FS_DQ_WARNINGS, "warnings" },
- {}
-};
-
-static const struct bin_table bin_fs_xfs_table[] = {
- { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
- { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
- { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
-
- { CTL_INT, XFS_ERRLEVEL, "error_level" },
- { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
- { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
- { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
- { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
- { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
- { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
- { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
- { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
- { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
- { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
- { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
- {}
-};
-
-static const struct bin_table bin_fs_ocfs2_nm_table[] = {
- { CTL_STR, 1, "hb_ctl_path" },
- {}
-};
-
-static const struct bin_table bin_fs_ocfs2_table[] = {
- { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
- {}
-};
-
-static const struct bin_table bin_inotify_table[] = {
- { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
- { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
- { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
- {}
-};
-
-static const struct bin_table bin_fs_table[] = {
- { CTL_INT, FS_NRINODE, "inode-nr" },
- { CTL_INT, FS_STATINODE, "inode-state" },
- /* FS_MAXINODE unused */
- /* FS_NRDQUOT unused */
- /* FS_MAXDQUOT unused */
- /* FS_NRFILE "file-nr" no longer used */
- { CTL_INT, FS_MAXFILE, "file-max" },
- { CTL_INT, FS_DENTRY, "dentry-state" },
- /* FS_NRSUPER unused */
- /* FS_MAXUPSER unused */
- { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
- { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
- { CTL_INT, FS_LEASES, "leases-enable" },
- { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
- { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
- { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
- { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
- { CTL_ULONG, FS_AIO_NR, "aio-nr" },
- { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
- { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
- { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
- { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
- {}
-};
-
-static const struct bin_table bin_ipmi_table[] = {
- { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
- {}
-};
-
-static const struct bin_table bin_mac_hid_files[] = {
- /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
- /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
- { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
- /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
- {}
-};
-
-static const struct bin_table bin_raid_table[] = {
- { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
- { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
- {}
-};
-
-static const struct bin_table bin_scsi_table[] = {
- { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
- {}
-};
-
-static const struct bin_table bin_dev_table[] = {
- /* DEV_CDROM "cdrom" no longer used */
- /* DEV_HWMON unused */
- /* DEV_PARPORT "parport" no longer used */
- { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
- { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
- { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
- { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
- {}
-};
-
-static const struct bin_table bin_bus_isa_table[] = {
- { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
- { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
- { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
- {}
-};
-
-static const struct bin_table bin_bus_table[] = {
- { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
- {}
-};
-
-
-static const struct bin_table bin_s390dbf_table[] = {
- { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
- { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
- {}
-};
-
-static const struct bin_table bin_sunrpc_table[] = {
- /* CTL_RPCDEBUG "rpc_debug" no longer used */
- /* CTL_NFSDEBUG "nfs_debug" no longer used */
- /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
- /* CTL_NLMDEBUG "nlm_debug" no longer used */
-
- { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
- { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
- { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
- { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
- {}
-};
-
-static const struct bin_table bin_pm_table[] = {
- /* frv specific */
- /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
- { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
- { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
- { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
- {}
-};
-
-static const struct bin_table bin_root_table[] = {
- { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
- { CTL_DIR, CTL_VM, "vm", bin_vm_table },
- { CTL_DIR, CTL_NET, "net", bin_net_table },
- /* CTL_PROC not used */
- { CTL_DIR, CTL_FS, "fs", bin_fs_table },
- /* CTL_DEBUG "debug" no longer used */
- { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
- { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
- { CTL_DIR, CTL_ABI, "abi" },
- /* CTL_CPU not used */
- /* CTL_ARLAN "arlan" no longer used */
- { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
- { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
- { CTL_DIR, CTL_PM, "pm", bin_pm_table },
- {}
-};
-
-static ssize_t bin_dir(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOTDIR;
-}
-
-
-static ssize_t bin_string(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- if (oldval && oldlen) {
- char __user *lastp;
- loff_t pos = 0;
- int ch;
-
- result = vfs_read(file, oldval, oldlen, &pos);
- if (result < 0)
- goto out;
-
- copied = result;
- lastp = oldval + copied - 1;
-
- result = -EFAULT;
- if (get_user(ch, lastp))
- goto out;
-
- /* Trim off the trailing newline */
- if (ch == '\n') {
- result = -EFAULT;
- if (put_user('\0', lastp))
- goto out;
- copied -= 1;
- }
- }
-
- if (newval && newlen) {
- loff_t pos = 0;
-
- result = vfs_write(file, newval, newlen, &pos);
- if (result < 0)
- goto out;
- }
-
- result = copied;
-out:
- return result;
-}
-
-static ssize_t bin_intvec(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t copied = 0;
- char *buffer;
- ssize_t result;
-
- result = -ENOMEM;
- buffer = kmalloc(BUFSZ, GFP_KERNEL);
- if (!buffer)
- goto out;
-
- if (oldval && oldlen) {
- unsigned __user *vec = oldval;
- size_t length = oldlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- result = kernel_read(file, buffer, BUFSZ - 1, &pos);
- if (result < 0)
- goto out_kfree;
-
- str = buffer;
- end = str + result;
- *end++ = '\0';
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- value = simple_strtoul(str, &str, 10);
- while (isspace(*str))
- str++;
-
- result = -EFAULT;
- if (put_user(value, vec + i))
- goto out_kfree;
-
- copied += sizeof(*vec);
- if (!isdigit(*str))
- break;
- }
- }
-
- if (newval && newlen) {
- unsigned __user *vec = newval;
- size_t length = newlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- str = buffer;
- end = str + BUFSZ;
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- result = -EFAULT;
- if (get_user(value, vec + i))
- goto out_kfree;
-
- str += scnprintf(str, end - str, "%lu\t", value);
- }
-
- result = kernel_write(file, buffer, str - buffer, &pos);
- if (result < 0)
- goto out_kfree;
- }
- result = copied;
-out_kfree:
- kfree(buffer);
-out:
- return result;
-}
-
-static ssize_t bin_ulongvec(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t copied = 0;
- char *buffer;
- ssize_t result;
-
- result = -ENOMEM;
- buffer = kmalloc(BUFSZ, GFP_KERNEL);
- if (!buffer)
- goto out;
-
- if (oldval && oldlen) {
- unsigned long __user *vec = oldval;
- size_t length = oldlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- result = kernel_read(file, buffer, BUFSZ - 1, &pos);
- if (result < 0)
- goto out_kfree;
-
- str = buffer;
- end = str + result;
- *end++ = '\0';
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- value = simple_strtoul(str, &str, 10);
- while (isspace(*str))
- str++;
-
- result = -EFAULT;
- if (put_user(value, vec + i))
- goto out_kfree;
-
- copied += sizeof(*vec);
- if (!isdigit(*str))
- break;
- }
- }
-
- if (newval && newlen) {
- unsigned long __user *vec = newval;
- size_t length = newlen / sizeof(*vec);
- char *str, *end;
- int i;
- loff_t pos = 0;
-
- str = buffer;
- end = str + BUFSZ;
- for (i = 0; i < length; i++) {
- unsigned long value;
-
- result = -EFAULT;
- if (get_user(value, vec + i))
- goto out_kfree;
-
- str += scnprintf(str, end - str, "%lu\t", value);
- }
-
- result = kernel_write(file, buffer, str - buffer, &pos);
- if (result < 0)
- goto out_kfree;
- }
- result = copied;
-out_kfree:
- kfree(buffer);
-out:
- return result;
-}
-
-static ssize_t bin_uuid(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- /* Only supports reads */
- if (oldval && oldlen) {
- char buf[UUID_STRING_LEN + 1];
- uuid_t uuid;
- loff_t pos = 0;
-
- result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
- if (result < 0)
- goto out;
-
- buf[result] = '\0';
-
- result = -EIO;
- if (uuid_parse(buf, &uuid))
- goto out;
-
- if (oldlen > 16)
- oldlen = 16;
-
- result = -EFAULT;
- if (copy_to_user(oldval, &uuid, oldlen))
- goto out;
-
- copied = oldlen;
- }
- result = copied;
-out:
- return result;
-}
-
-static ssize_t bin_dn_node_address(struct file *file,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- ssize_t result, copied = 0;
-
- if (oldval && oldlen) {
- char buf[15], *nodep;
- unsigned long area, node;
- __le16 dnaddr;
- loff_t pos = 0;
-
- result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
- if (result < 0)
- goto out;
-
- buf[result] = '\0';
-
- /* Convert the decnet address to binary */
- result = -EIO;
- nodep = strchr(buf, '.');
- if (!nodep)
- goto out;
- ++nodep;
-
- area = simple_strtoul(buf, NULL, 10);
- node = simple_strtoul(nodep, NULL, 10);
-
- result = -EIO;
- if ((area > 63)||(node > 1023))
- goto out;
-
- dnaddr = cpu_to_le16((area << 10) | node);
-
- result = -EFAULT;
- if (put_user(dnaddr, (__le16 __user *)oldval))
- goto out;
-
- copied = sizeof(dnaddr);
- }
-
- if (newval && newlen) {
- __le16 dnaddr;
- char buf[15];
- int len;
- loff_t pos = 0;
-
- result = -EINVAL;
- if (newlen != sizeof(dnaddr))
- goto out;
-
- result = -EFAULT;
- if (get_user(dnaddr, (__le16 __user *)newval))
- goto out;
-
- len = scnprintf(buf, sizeof(buf), "%hu.%hu",
- le16_to_cpu(dnaddr) >> 10,
- le16_to_cpu(dnaddr) & 0x3ff);
-
- result = kernel_write(file, buf, len, &pos);
- if (result < 0)
- goto out;
- }
-
- result = copied;
-out:
- return result;
-}
-
-static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
-{
- const struct bin_table *table = &bin_root_table[0];
- int ctl_name;
-
- /* The binary sysctl tables have a small maximum depth so
- * there is no danger of overflowing our path as it PATH_MAX
- * bytes long.
- */
- memcpy(path, "sys/", 4);
- path += 4;
-
-repeat:
- if (!nlen)
- return ERR_PTR(-ENOTDIR);
- ctl_name = *name;
- name++;
- nlen--;
- for ( ; table->convert; table++) {
- int len = 0;
-
- /*
- * For a wild card entry map from ifindex to network
- * device name.
- */
- if (!table->ctl_name) {
-#ifdef CONFIG_NET
- struct net *net = current->nsproxy->net_ns;
- struct net_device *dev;
- dev = dev_get_by_index(net, ctl_name);
- if (dev) {
- len = strlen(dev->name);
- memcpy(path, dev->name, len);
- dev_put(dev);
- }
-#endif
- /* Use the well known sysctl number to proc name mapping */
- } else if (ctl_name == table->ctl_name) {
- len = strlen(table->procname);
- memcpy(path, table->procname, len);
- }
- if (len) {
- path += len;
- if (table->child) {
- *path++ = '/';
- table = table->child;
- goto repeat;
- }
- *path = '\0';
- return table;
- }
- }
- return ERR_PTR(-ENOTDIR);
-}
-
-static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
-{
- char *tmp, *result;
-
- result = ERR_PTR(-ENOMEM);
- tmp = __getname();
- if (tmp) {
- const struct bin_table *table = get_sysctl(name, nlen, tmp);
- result = tmp;
- *tablep = table;
- if (IS_ERR(table)) {
- __putname(tmp);
- result = ERR_CAST(table);
- }
- }
- return result;
-}
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- const struct bin_table *table = NULL;
- struct vfsmount *mnt;
- struct file *file;
- ssize_t result;
- char *pathname;
- int flags;
-
- pathname = sysctl_getname(name, nlen, &table);
- result = PTR_ERR(pathname);
- if (IS_ERR(pathname))
- goto out;
-
- /* How should the sysctl be accessed? */
- if (oldval && oldlen && newval && newlen) {
- flags = O_RDWR;
- } else if (newval && newlen) {
- flags = O_WRONLY;
- } else if (oldval && oldlen) {
- flags = O_RDONLY;
- } else {
- result = 0;
- goto out_putname;
- }
-
- mnt = task_active_pid_ns(current)->proc_mnt;
- file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
- result = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_putname;
-
- result = table->convert(file, oldval, oldlen, newval, newlen);
-
- fput(file);
-out_putname:
- __putname(pathname);
-out:
- return result;
-}
-
-
-#else /* CONFIG_SYSCTL_SYSCALL */
-
static ssize_t binary_sysctl(const int *name, int nlen,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
return -ENOSYS;
}
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
-
static void deprecated_sysctl_warning(const int *name, int nlen)
{
int i;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 65605530ee34..9e20873148c6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1940,7 +1940,7 @@ out:
return ret;
}
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
struct __kernel_timespec __user *, rmtp)
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 77f1e5635cc1..9e59c9ea92aa 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -26,7 +26,7 @@
* Returns the delta between the expiry time and now, which can be
* less than zero or 1usec for an pending expired timer
*/
-static struct timeval itimer_get_remtime(struct hrtimer *timer)
+static struct timespec64 itimer_get_remtime(struct hrtimer *timer)
{
ktime_t rem = __hrtimer_get_remaining(timer, true);
@@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
} else
rem = 0;
- return ktime_to_timeval(rem);
+ return ktime_to_timespec64(rem);
}
static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
- struct itimerval *const value)
+ struct itimerspec64 *const value)
{
u64 val, interval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
@@ -69,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
spin_unlock_irq(&tsk->sighand->siglock);
- value->it_value = ns_to_timeval(val);
- value->it_interval = ns_to_timeval(interval);
+ value->it_value = ns_to_timespec64(val);
+ value->it_interval = ns_to_timespec64(interval);
}
-int do_getitimer(int which, struct itimerval *value)
+static int do_getitimer(int which, struct itimerspec64 *value)
{
struct task_struct *tsk = current;
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
spin_lock_irq(&tsk->sighand->siglock);
value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
value->it_interval =
- ktime_to_timeval(tsk->signal->it_real_incr);
+ ktime_to_timespec64(tsk->signal->it_real_incr);
spin_unlock_irq(&tsk->sighand->siglock);
break;
case ITIMER_VIRTUAL:
@@ -97,34 +97,59 @@ int do_getitimer(int which, struct itimerval *value)
return 0;
}
+/*
+ * Convert an itimerspec64 to the user-visible struct itimerval and copy
+ * it out to @o. Nanoseconds are truncated to whole microseconds.
+ *
+ * Returns 0 on success or -EFAULT if the copy to user space fails.
+ */
+static int put_itimerval(struct itimerval __user *o,
+			 const struct itimerspec64 *i)
+{
+	struct itimerval uv;
+
+	uv.it_value.tv_sec = i->it_value.tv_sec;
+	uv.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC;
+	uv.it_interval.tv_sec = i->it_interval.tv_sec;
+	uv.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC;
+
+	if (copy_to_user(o, &uv, sizeof(uv)))
+		return -EFAULT;
+	return 0;
+}
+
+
SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
{
- int error = -EFAULT;
- struct itimerval get_buffer;
+ struct itimerspec64 get_buffer;
+ int error = do_getitimer(which, &get_buffer);
- if (value) {
- error = do_getitimer(which, &get_buffer);
- if (!error &&
- copy_to_user(value, &get_buffer, sizeof(get_buffer)))
- error = -EFAULT;
- }
+ if (!error && put_itimerval(value, &get_buffer))
+ error = -EFAULT;
return error;
}
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
+/*
+ * Interval timer value made of two old_timeval32 members. This is the
+ * layout the compat getitimer/setitimer entry points exchange with
+ * user space.
+ */
+struct old_itimerval32 {
+ struct old_timeval32 it_interval;
+ struct old_timeval32 it_value;
+};
+
+/*
+ * Convert an itimerspec64 to struct old_itimerval32 and copy it out to
+ * @o. Nanoseconds are truncated to whole microseconds.
+ *
+ * Returns 0 on success or -EFAULT if the copy to user space fails.
+ */
+static int put_old_itimerval32(struct old_itimerval32 __user *o,
+			       const struct itimerspec64 *i)
+{
+	struct old_itimerval32 out;
+
+	out.it_value.tv_sec = i->it_value.tv_sec;
+	out.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC;
+	out.it_interval.tv_sec = i->it_interval.tv_sec;
+	out.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC;
+
+	if (copy_to_user(o, &out, sizeof(out)))
+		return -EFAULT;
+	return 0;
+}
+
COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
- struct compat_itimerval __user *, it)
+ struct old_itimerval32 __user *, value)
{
- struct itimerval kit;
- int error = do_getitimer(which, &kit);
+ struct itimerspec64 get_buffer;
+ int error = do_getitimer(which, &get_buffer);
- if (!error && put_compat_itimerval(it, &kit))
+ if (!error && put_old_itimerval32(value, &get_buffer))
error = -EFAULT;
return error;
}
#endif
-
/*
* The timer is automagically restarted, when interval != 0
*/
@@ -141,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
}
static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
- const struct itimerval *const value,
- struct itimerval *const ovalue)
+ const struct itimerspec64 *const value,
+ struct itimerspec64 *const ovalue)
{
u64 oval, nval, ointerval, ninterval;
struct cpu_itimer *it = &tsk->signal->it[clock_id];
@@ -151,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
* Use the to_ktime conversion because that clamps the maximum
* value to KTIME_MAX and avoid multiplication overflows.
*/
- nval = ktime_to_ns(timeval_to_ktime(value->it_value));
- ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval));
+ nval = timespec64_to_ns(&value->it_value);
+ ninterval = timespec64_to_ns(&value->it_interval);
spin_lock_irq(&tsk->sighand->siglock);
@@ -171,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
- ovalue->it_value = ns_to_timeval(oval);
- ovalue->it_interval = ns_to_timeval(ointerval);
+ ovalue->it_value = ns_to_timespec64(oval);
+ ovalue->it_interval = ns_to_timespec64(ointerval);
}
}
@@ -182,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
-int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+static int do_setitimer(int which, struct itimerspec64 *value,
+ struct itimerspec64 *ovalue)
{
struct task_struct *tsk = current;
struct hrtimer *timer;
ktime_t expires;
- /*
- * Validate the timevals in value.
- */
- if (!timeval_valid(&value->it_value) ||
- !timeval_valid(&value->it_interval))
- return -EINVAL;
-
switch (which) {
case ITIMER_REAL:
again:
@@ -203,7 +222,7 @@ again:
if (ovalue) {
ovalue->it_value = itimer_get_remtime(timer);
ovalue->it_interval
- = ktime_to_timeval(tsk->signal->it_real_incr);
+ = ktime_to_timespec64(tsk->signal->it_real_incr);
}
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
@@ -211,10 +230,10 @@ again:
hrtimer_cancel_wait_running(timer);
goto again;
}
- expires = timeval_to_ktime(value->it_value);
+ expires = timespec64_to_ktime(value->it_value);
if (expires != 0) {
tsk->signal->it_real_incr =
- timeval_to_ktime(value->it_interval);
+ timespec64_to_ktime(value->it_interval);
hrtimer_start(timer, expires, HRTIMER_MODE_REL);
} else
tsk->signal->it_real_incr = 0;
@@ -234,6 +253,17 @@ again:
return 0;
}
+#ifdef CONFIG_SECURITY_SELINUX
+/*
+ * Program every interval timer of the current task with an all-zero
+ * value; the previous settings are not reported back.
+ */
+void clear_itimer(void)
+{
+	struct itimerspec64 zero = {};
+	int which;
+
+	/*
+	 * Timer ids 0..2 - presumably ITIMER_REAL, ITIMER_VIRTUAL and
+	 * ITIMER_PROF; confirm against the uapi header.
+	 */
+	for (which = 0; which < 3; which++)
+		do_setitimer(which, &zero, NULL);
+}
+#endif
+
#ifdef __ARCH_WANT_SYS_ALARM
/**
@@ -250,15 +280,15 @@ again:
*/
static unsigned int alarm_setitimer(unsigned int seconds)
{
- struct itimerval it_new, it_old;
+ struct itimerspec64 it_new, it_old;
#if BITS_PER_LONG < 64
if (seconds > INT_MAX)
seconds = INT_MAX;
#endif
it_new.it_value.tv_sec = seconds;
- it_new.it_value.tv_usec = 0;
- it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+ it_new.it_value.tv_nsec = 0;
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0;
do_setitimer(ITIMER_REAL, &it_new, &it_old);
@@ -266,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds)
* We can't return 0 if we have an alarm pending ... And we'd
* better return too much than too little anyway
*/
- if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
- it_old.it_value.tv_usec >= 500000)
+ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) ||
+ it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2))
it_old.it_value.tv_sec++;
return it_old.it_value.tv_sec;
@@ -284,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
+/*
+ * Fetch a struct itimerval from user space and store it in @o as an
+ * itimerspec64, scaling microseconds up to nanoseconds.
+ *
+ * Returns 0 on success, -EFAULT if the user copy fails, or -EINVAL if
+ * either timeval fails timeval_valid().
+ */
+static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i)
+{
+	struct itimerval uv;
+
+	if (copy_from_user(&uv, i, sizeof(uv)))
+		return -EFAULT;
+
+	/* Both timevals must be well formed before they are converted. */
+	if (!(timeval_valid(&uv.it_value) && timeval_valid(&uv.it_interval)))
+		return -EINVAL;
+
+	o->it_value.tv_sec = uv.it_value.tv_sec;
+	o->it_value.tv_nsec = uv.it_value.tv_usec * NSEC_PER_USEC;
+	o->it_interval.tv_sec = uv.it_interval.tv_sec;
+	o->it_interval.tv_nsec = uv.it_interval.tv_usec * NSEC_PER_USEC;
+	return 0;
+}
+
SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
struct itimerval __user *, ovalue)
{
- struct itimerval set_buffer, get_buffer;
+ struct itimerspec64 set_buffer, get_buffer;
int error;
if (value) {
- if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
- return -EFAULT;
+ error = get_itimerval(&set_buffer, value);
+ if (error)
+ return error;
} else {
memset(&set_buffer, 0, sizeof(set_buffer));
printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
@@ -304,30 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
if (error || !ovalue)
return error;
- if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ if (put_itimerval(ovalue, &get_buffer))
+ return -EFAULT;
+ return 0;
+}
+
+#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
+/*
+ * Read a struct old_itimerval32 from user space and convert it into the
+ * itimerspec64 at @o, scaling microseconds up to nanoseconds.
+ *
+ * Returns 0 on success, -EFAULT if the user copy fails, or -EINVAL if
+ * either member fails timeval_valid().
+ */
+static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i)
+{
+ struct old_itimerval32 v32;
+
+ if (copy_from_user(&v32, i, sizeof(struct old_itimerval32)))
return -EFAULT;
+
+ /* Validate the timevals in value. */
+ if (!timeval_valid(&v32.it_value) ||
+ !timeval_valid(&v32.it_interval))
+ return -EINVAL;
+
+ o->it_interval.tv_sec = v32.it_interval.tv_sec;
+ o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC;
+ o->it_value.tv_sec = v32.it_value.tv_sec;
+ o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC;
return 0;
}
-#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
- struct compat_itimerval __user *, in,
- struct compat_itimerval __user *, out)
+ struct old_itimerval32 __user *, value,
+ struct old_itimerval32 __user *, ovalue)
{
- struct itimerval kin, kout;
+ struct itimerspec64 set_buffer, get_buffer;
int error;
- if (in) {
- if (get_compat_itimerval(&kin, in))
- return -EFAULT;
+ if (value) {
+ error = get_old_itimerval32(&set_buffer, value);
+ if (error)
+ return error;
} else {
- memset(&kin, 0, sizeof(kin));
+ memset(&set_buffer, 0, sizeof(set_buffer));
+ printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+ " Misfeature support will be removed\n",
+ current->comm);
}
- error = do_setitimer(which, &kin, out ? &kout : NULL);
- if (error || !out)
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+ if (error || !ovalue)
return error;
- if (put_compat_itimerval(out, &kout))
+ if (put_old_itimerval32(ovalue, &get_buffer))
return -EFAULT;
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 955851748dc3..8b192e67aabc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -172,6 +172,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
bool tick_nohz_full_running;
+EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
static bool check_tick_dependency(atomic_t *dep)
@@ -198,6 +199,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU);
+ return true;
+ }
+
return false;
}
@@ -324,6 +330,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -331,6 +338,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
@@ -344,11 +352,13 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
*/
tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
@@ -397,6 +407,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
+EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@@ -1119,7 +1130,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
/*
* We stopped the tick in idle. Update process times would miss the
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 5c54ca632d08..58e312e7380f 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz);
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
-SYSCALL_DEFINE1(time, time_t __user *, tloc)
+SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
- time_t i = (time_t)ktime_get_real_seconds();
+ __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds();
if (tloc) {
if (put_user(i,tloc))
@@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc)
* architectures that need it).
*/
-SYSCALL_DEFINE1(stime, time_t __user *, tptr)
+SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
struct timespec64 tv;
int err;
@@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
-SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
+SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
if (likely(tv != NULL)) {
@@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
return 0;
}
-SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
+SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
- struct timeval user_tv;
struct timezone new_tz;
if (tv) {
- if (copy_from_user(&user_tv, tv, sizeof(*tv)))
+ if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
+ get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (!timeval_valid(&user_tv))
+ /*
+ * tv_nsec still holds microseconds at this point, so the valid
+ * range is 0 .. USEC_PER_SEC - 1, as the removed timeval_valid()
+ * check required.
+ */
+ if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
- new_ts.tv_sec = user_tv.tv_sec;
- new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
@@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
- struct timeval user_tv;
struct timezone new_tz;
if (tv) {
- if (compat_get_timeval(&user_tv, tv))
+ if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
+ get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (!timeval_valid(&user_tv))
+ /*
+ * tv_nsec still holds microseconds at this point, so the valid
+ * range is 0 .. USEC_PER_SEC - 1, as the removed timeval_valid()
+ * check required.
+ */
+ if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
- new_ts.tv_sec = user_tv.tv_sec;
- new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
@@ -267,7 +265,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
}
#endif
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
struct __kernel_timex txc; /* Local copy of parameter */
@@ -881,7 +879,7 @@ int get_timespec64(struct timespec64 *ts,
ts->tv_sec = kts.tv_sec;
/* Zero out the padding for 32 bit systems or in compat mode */
- if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall())
+ if (in_compat_syscall())
kts.tv_nsec &= 0xFFFFFFFFUL;
ts->tv_nsec = kts.tv_nsec;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e08527f50d2a..cdf5afa87f65 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE
config HAVE_DYNAMIC_FTRACE_WITH_REGS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -76,7 +79,7 @@ config FTRACE_NMI_ENTER
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
- select GLOB
+ select GLOB
bool
config CONTEXT_SWITCH_TRACER
@@ -106,7 +109,6 @@ config PREEMPTIRQ_TRACEPOINTS
config TRACING
bool
- select DEBUG_FS
select RING_BUFFER
select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
@@ -308,7 +310,7 @@ config TRACER_SNAPSHOT
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
- bool "Allow snapshot to swap per CPU"
+ bool "Allow snapshot to swap per CPU"
depends on TRACER_SNAPSHOT
select RING_BUFFER_ALLOW_SWAP
help
@@ -557,6 +559,11 @@ config DYNAMIC_FTRACE_WITH_REGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -675,7 +682,7 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
config TRACEPOINT_BENCHMARK
- bool "Add tracepoint that benchmarks tracepoints"
+ bool "Add tracepoint that benchmarks tracepoints"
help
This option creates the tracepoint "benchmark:benchmark_event".
When the tracepoint is enabled, it kicks off a kernel thread that
@@ -724,7 +731,7 @@ config RING_BUFFER_STARTUP_TEST
bool "Ring buffer startup self test"
depends on RING_BUFFER
help
- Run a simple self test on the ring buffer on boot up. Late in the
+ Run a simple self test on the ring buffer on boot up. Late in the
kernel boot sequence, the test will start that kicks off
a thread per cpu. Each thread will write various size events
into the ring buffer. Another thread is created to send IPIs
@@ -752,9 +759,9 @@ config PREEMPTIRQ_DELAY_TEST
configurable delay. The module busy waits for the duration of the
critical section.
- For example, the following invocation forces a one-time irq-disabled
- critical section for 500us:
- modprobe preemptirq_delay_test test_mode=irq delay=500000
+ For example, the following invocation generates a burst of three
+ irq-disabled critical sections for 500us:
+ modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3
If unsure, say N
@@ -763,7 +770,7 @@ config TRACE_EVAL_MAP_FILE
depends on TRACING
help
The "print fmt" of the trace events will show the enum/sizeof names
- instead of their values. This can cause problems for user space tools
+ instead of their values. This can cause problems for user space tools
that use this string to parse the raw data as user space does not know
how to convert the string to its value.
@@ -784,7 +791,7 @@ config TRACE_EVAL_MAP_FILE
they are needed for the "eval_map" file. Enabling this option will
increase the memory footprint of the running kernel.
- If unsure, say N
+ If unsure, say N.
config GCOV_PROFILE_FTRACE
bool "Enable GCOV profiling on ftrace subsystem"
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 7950a0356042..67e0c462b059 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
return 0;
}
+/*
+ * Simply points to ftrace_stub, but with the proper protocol.
+ * Defined by the linker script in linux/vmlinux.lds.h
+ */
+extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+
/* The callbacks that hook a function */
-trace_func_graph_ret_t ftrace_graph_return =
- (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
goto out;
ftrace_graph_active--;
- ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_return = ftrace_stub_graph;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5259d4dea675..74439ab5c2b6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
#endif
+ if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
+ return -EBUSY;
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
@@ -463,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* function graph compares on total time */
-static int function_stat_cmp(void *p1, void *p2)
+static int function_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_profile *a = p1;
- struct ftrace_profile *b = p2;
+ const struct ftrace_profile *a = p1;
+ const struct ftrace_profile *b = p2;
if (a->time < b->time)
return -1;
@@ -477,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2)
}
#else
/* not function graph compares against hits */
-static int function_stat_cmp(void *p1, void *p2)
+static int function_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_profile *a = p1;
- struct ftrace_profile *b = p2;
+ const struct ftrace_profile *a = p1;
+ const struct ftrace_profile *b = p2;
if (a->counter < b->counter)
return -1;
@@ -1018,11 +1020,6 @@ static bool update_all_ops;
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
-struct ftrace_func_entry {
- struct hlist_node hlist;
- unsigned long ip;
-};
-
struct ftrace_func_probe {
struct ftrace_probe_ops *probe_ops;
struct ftrace_ops ops;
@@ -1370,24 +1367,16 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *
-__ftrace_hash_move(struct ftrace_hash *src)
+static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tn;
- struct hlist_head *hhd;
struct ftrace_hash *new_hash;
- int size = src->count;
+ struct hlist_head *hhd;
+ struct hlist_node *tn;
int bits = 0;
int i;
/*
- * If the new source is empty, just return the empty_hash.
- */
- if (ftrace_hash_empty(src))
- return EMPTY_HASH;
-
- /*
* Make the hash size about 1/2 the # found
*/
for (size /= 2; size; size >>= 1)
@@ -1411,10 +1400,23 @@ __ftrace_hash_move(struct ftrace_hash *src)
__add_hash_entry(new_hash, entry);
}
}
-
return new_hash;
}
+/*
+ * Build the hash that will replace @src: the shared EMPTY_HASH when
+ * @src is empty, otherwise a dup_hash() copy sized from @src's count.
+ */
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
+{
+	/* If the new source is empty, just return the empty_hash. */
+	if (ftrace_hash_empty(src))
+		return EMPTY_HASH;
+
+	return dup_hash(src, src->count);
+}
+
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1534,6 +1536,26 @@ static int ftrace_cmp_recs(const void *a, const void *b)
return 0;
}
+/*
+ * Find a dyn_ftrace record that touches the address range given by
+ * @start and @end.
+ *
+ * Walks the ftrace_page list from ftrace_pages_start, skips pages whose
+ * first and last records show they cannot cover the range, and binary
+ * searches the remaining pages with ftrace_cmp_recs().
+ *
+ * Returns the first record found, or NULL if no page holds one.
+ */
+static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec = NULL;
+	struct dyn_ftrace key;
+
+	key.ip = start;
+	key.flags = end;	/* overload flags, as it is unsigned long */
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		if (end < pg->records[0].ip ||
+		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+			continue;
+		rec = bsearch(&key, pg->records, pg->index,
+			      sizeof(struct dyn_ftrace),
+			      ftrace_cmp_recs);
+		/*
+		 * Stop at the first hit: carrying on would let a failed
+		 * search of a later page overwrite this result with NULL.
+		 */
+		if (rec)
+			break;
+	}
+	return rec;
+}
+
/**
* ftrace_location_range - return the first address of a traced location
* if it touches the given ip range
@@ -1548,23 +1570,11 @@ static int ftrace_cmp_recs(const void *a, const void *b)
*/
unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
- struct ftrace_page *pg;
struct dyn_ftrace *rec;
- struct dyn_ftrace key;
-
- key.ip = start;
- key.flags = end; /* overload flags, as it is unsigned long */
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
- start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
- continue;
- rec = bsearch(&key, pg->records, pg->index,
- sizeof(struct dyn_ftrace),
- ftrace_cmp_recs);
- if (rec)
- return rec->ip;
- }
+ rec = lookup_rec(start, end);
+ if (rec)
+ return rec->ip;
return 0;
}
@@ -1715,6 +1725,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return false;
+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags |= FTRACE_FL_DIRECT;
+
/*
* If there's only a single callback registered to a
* function, and the ops has a trampoline registered
@@ -1743,6 +1756,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
rec->flags--;
/*
+ * Only the internal direct_ops should have the
+ * DIRECT flag set. Thus, if it is removing a
+ * function, then that function should no longer
+ * be direct.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags &= ~FTRACE_FL_DIRECT;
+
+ /*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
* still any ops for this record that wants regs.
@@ -2077,15 +2099,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* If enabling and the REGS flag does not match the REGS_EN, or
* the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
* this record. Set flags to fail the compare against ENABLED.
+ * Same for direct calls.
*/
if (flag) {
- if (!(rec->flags & FTRACE_FL_REGS) !=
+ if (!(rec->flags & FTRACE_FL_REGS) !=
!(rec->flags & FTRACE_FL_REGS_EN))
flag |= FTRACE_FL_REGS;
- if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
!(rec->flags & FTRACE_FL_TRAMP_EN))
flag |= FTRACE_FL_TRAMP;
+
+ /*
+ * Direct calls are special, as count matters.
+ * We must test the record for direct, if the
+ * DIRECT and DIRECT_EN do not match, but only
+ * if the count is 1. That's because, if the
+ * count is something other than one, we do not
+ * want the direct enabled (it will be done via the
+ * direct helper). But if DIRECT_EN is set, and
+ * the count is not one, we need to clear it.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_DIRECT) !=
+ !(rec->flags & FTRACE_FL_DIRECT_EN))
+ flag |= FTRACE_FL_DIRECT;
+ } else if (rec->flags & FTRACE_FL_DIRECT_EN) {
+ flag |= FTRACE_FL_DIRECT;
+ }
}
/* If the state of this record hasn't changed, then do nothing */
@@ -2110,6 +2151,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
else
rec->flags &= ~FTRACE_FL_TRAMP_EN;
}
+ if (flag & FTRACE_FL_DIRECT) {
+ /*
+ * If there's only one user (direct_ops helper)
+ * then we can call the direct function
+ * directly (no ftrace trampoline).
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_DIRECT)
+ rec->flags |= FTRACE_FL_DIRECT_EN;
+ else
+ rec->flags &= ~FTRACE_FL_DIRECT_EN;
+ } else {
+ /*
+ * Can only call directly if there's
+ * only one callback to the function.
+ */
+ rec->flags &= ~FTRACE_FL_DIRECT_EN;
+ }
+ }
}
/*
@@ -2139,7 +2199,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -2294,6 +2354,52 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/* Protected by rcu_tasks for reading, and direct_mutex for writing */
+static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static DEFINE_MUTEX(direct_mutex);
+int ftrace_direct_func_count;
+
+/*
+ * Look @ip up in the direct_functions hash. Returns the direct caller
+ * address stored in the matching entry, or 0 when @ip has no entry.
+ */
+static unsigned long find_rec_direct(unsigned long ip)
+{
+	struct ftrace_func_entry *hit = __ftrace_lookup_ip(direct_functions, ip);
+
+	return hit ? hit->direct : 0;
+}
+
+/*
+ * ftrace callback used by direct_ops: if @ip has a direct caller
+ * registered, pass its address to arch_ftrace_set_direct_caller()
+ * together with @regs. Does nothing when no direct caller is found.
+ */
+static void call_direct_funcs(unsigned long ip, unsigned long pip,
+			      struct ftrace_ops *ops, struct pt_regs *regs)
+{
+	unsigned long tramp = find_rec_direct(ip);
+
+	if (tramp)
+		arch_ftrace_set_direct_caller(regs, tramp);
+}
+
+/*
+ * The ftrace_ops behind direct calls. Its callback, call_direct_funcs(),
+ * looks up the direct caller registered for the traced ip. It carries
+ * FTRACE_OPS_FL_DIRECT, the flag __ftrace_hash_rec_update() uses to mark
+ * records with FTRACE_FL_DIRECT.
+ */
+struct ftrace_ops direct_ops = {
+ .func = call_direct_funcs,
+ .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
+ | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
+ | FTRACE_OPS_FL_PERMANENT,
+};
+#else
+/*
+ * Stub used when CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not set:
+ * no ip ever has a direct caller, so always report 0.
+ */
+static inline unsigned long find_rec_direct(unsigned long ip)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -2307,6 +2413,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
+ unsigned long addr;
+
+ if ((rec->flags & FTRACE_FL_DIRECT) &&
+ (ftrace_rec_count(rec) == 1)) {
+ addr = find_rec_direct(rec->ip);
+ if (addr)
+ return addr;
+ WARN_ON_ONCE(1);
+ }
/* Trampolines take precedence over regs */
if (rec->flags & FTRACE_FL_TRAMP) {
@@ -2339,6 +2454,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
+ unsigned long addr;
+
+ /* Direct calls take precedence over trampolines */
+ if (rec->flags & FTRACE_FL_DIRECT_EN) {
+ addr = find_rec_direct(rec->ip);
+ if (addr)
+ return addr;
+ WARN_ON_ONCE(1);
+ }
/* Trampolines take precedence over regs */
if (rec->flags & FTRACE_FL_TRAMP_EN) {
@@ -2861,6 +2985,8 @@ static void ftrace_shutdown_sysctl(void)
static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
+unsigned long ftrace_number_of_pages;
+unsigned long ftrace_number_of_groups;
static inline int ops_traces_mod(struct ftrace_ops *ops)
{
@@ -2985,6 +3111,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
goto again;
}
+ ftrace_number_of_pages += 1 << order;
+ ftrace_number_of_groups++;
+
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
pg->size = cnt;
@@ -3040,6 +3169,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
start_pg = pg->next;
kfree(pg);
pg = start_pg;
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
@@ -3450,10 +3581,11 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
- seq_printf(m, " (%ld)%s%s",
+ seq_printf(m, " (%ld)%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
- rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
+ rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -3469,6 +3601,13 @@ static int t_show(struct seq_file *m, void *v)
} else {
add_trampoline_func(m, NULL, rec);
}
+ if (rec->flags & FTRACE_FL_DIRECT) {
+ unsigned long direct;
+
+ direct = find_rec_direct(rec->ip);
+ if (direct)
+ seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
+ }
}
seq_putc(m, '\n');
@@ -4800,6 +4939,366 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
+/*
+ * One node per distinct direct-call target address.
+ * register_ftrace_direct() allocates a node the first time an address is
+ * used and increments @count on each successful registration;
+ * unregister_ftrace_direct() decrements it and frees the node at zero.
+ */
+struct ftrace_direct_func {
+ struct list_head next; /* link in ftrace_direct_funcs */
+ unsigned long addr; /* address of the direct trampoline */
+ int count; /* successful registrations using @addr */
+};
+
+/* Every direct-call target currently known; walked with list_for_each_entry_rcu(). */
+static LIST_HEAD(ftrace_direct_funcs);
+
+/**
+ * ftrace_find_direct_func - look up the descriptor of a registered direct caller
+ * @addr: The address of a possibly registered direct caller
+ *
+ * Walks the list of registered direct callers and returns the descriptor
+ * whose address equals @addr, or NULL when there is none.
+ *
+ * Architecture code can use this to learn whether an address is a direct
+ * caller (trampoline) attached to a fentry/mcount location. This is
+ * useful for the function_graph tracer, which may need to make
+ * adjustments when it traces a location that also has a direct
+ * trampoline attached to it.
+ */
+struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
+{
+	struct ftrace_direct_func *func;
+
+	/* May be called by fgraph trampoline (protected by rcu tasks) */
+	list_for_each_entry_rcu(func, &ftrace_direct_funcs, next) {
+		if (func->addr == addr)
+			return func;
+	}
+
+	return NULL;
+}
+
+/**
+ * register_ftrace_direct - Call a custom trampoline directly
+ * @ip: The address of the nop at the beginning of a function
+ * @addr: The address of the trampoline to call at @ip
+ *
+ * This is used to connect a direct call from the nop location (@ip)
+ * at the start of ftrace traced functions. The location that it calls
+ * (@addr) must be able to handle a direct call, and save the parameters
+ * of the function being traced, and restore them (or inject new ones
+ * if needed), before returning.
+ *
+ * Returns:
+ *  0 on success
+ *  -EBUSY - Another direct function is already attached (there can be only one)
+ *  -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ *  -ENOMEM - There was an allocation failure.
+ *  Any error from ftrace_set_filter_ip() or register_ftrace_function()
+ *  is passed through as well.
+ */
+int register_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+	struct ftrace_direct_func *direct;
+	struct ftrace_func_entry *entry;
+	struct ftrace_hash *free_hash = NULL;
+	struct dyn_ftrace *rec;
+	int ret = -EBUSY;
+
+	mutex_lock(&direct_mutex);
+
+	/* See if there's a direct function at @ip already */
+	if (find_rec_direct(ip))
+		goto out_unlock;
+
+	ret = -ENODEV;
+	rec = lookup_rec(ip, ip);
+	if (!rec)
+		goto out_unlock;
+
+	/*
+	 * Check if the rec says it has a direct call but we didn't
+	 * find one earlier?
+	 */
+	if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
+		goto out_unlock;
+
+	/* Make sure the ip points to the exact record */
+	if (ip != rec->ip) {
+		ip = rec->ip;
+		/* Need to check this ip for a direct. */
+		if (find_rec_direct(ip))
+			goto out_unlock;
+	}
+
+	ret = -ENOMEM;
+	/* Create the hash on first use, or grow it once it gets crowded. */
+	if (ftrace_hash_empty(direct_functions) ||
+	    direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+		struct ftrace_hash *new_hash;
+		int size = ftrace_hash_empty(direct_functions) ? 0 :
+			direct_functions->count + 1;
+
+		if (size < 32)
+			size = 32;
+
+		new_hash = dup_hash(direct_functions, size);
+		if (!new_hash)
+			goto out_unlock;
+
+		/* The old hash is released after the mutex is dropped. */
+		free_hash = direct_functions;
+		direct_functions = new_hash;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto out_unlock;
+
+	/* Reuse the descriptor for @addr if it is already a direct target. */
+	direct = ftrace_find_direct_func(addr);
+	if (!direct) {
+		direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+		if (!direct) {
+			kfree(entry);
+			goto out_unlock;
+		}
+		direct->addr = addr;
+		direct->count = 0;
+		list_add_rcu(&direct->next, &ftrace_direct_funcs);
+		ftrace_direct_func_count++;
+	}
+
+	entry->ip = ip;
+	entry->direct = addr;
+	__add_hash_entry(direct_functions, entry);
+
+	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
+
+	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
+		ret = register_ftrace_function(&direct_ops);
+		if (ret)
+			ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+	}
+
+	if (ret) {
+		/*
+		 * Unhash the entry on every failure path before freeing it.
+		 * Previously it was only unhashed when setting the filter
+		 * failed, so a register_ftrace_function() failure left a
+		 * freed entry linked in direct_functions.
+		 */
+		remove_hash_entry(direct_functions, entry);
+		kfree(entry);
+		if (!direct->count) {
+			list_del_rcu(&direct->next);
+			synchronize_rcu_tasks();
+			kfree(direct);
+			if (free_hash)
+				free_ftrace_hash(free_hash);
+			free_hash = NULL;
+			ftrace_direct_func_count--;
+		}
+	} else {
+		direct->count++;
+	}
+ out_unlock:
+	mutex_unlock(&direct_mutex);
+
+	if (free_hash) {
+		synchronize_rcu_tasks();
+		free_ftrace_hash(free_hash);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
+
+/*
+ * Look up the direct-call hash entry for the call site that contains *@ip.
+ * On success *@ip is rewritten to the record's exact address and, when
+ * @recp is non-NULL, *@recp receives the record. Returns NULL when there
+ * is no record for *@ip or no direct entry for that record; the WARN_ON()s
+ * flag a mismatch between the hash and the record's FTRACE_FL_DIRECT bit.
+ */
+static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
+						   struct dyn_ftrace **recp)
+{
+	struct dyn_ftrace *rec = lookup_rec(*ip, *ip);
+	struct ftrace_func_entry *entry;
+
+	if (!rec)
+		return NULL;
+
+	entry = __ftrace_lookup_ip(direct_functions, rec->ip);
+	if (entry) {
+		WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
+
+		/* Passed in ip just needs to be on the call site */
+		*ip = rec->ip;
+
+		if (recp)
+			*recp = rec;
+
+		return entry;
+	}
+
+	WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+	return NULL;
+}
+
+/**
+ * unregister_ftrace_direct - Remove a direct call from a function site
+ * @ip: An address on the call site the direct call was registered at
+ * @addr: The address of the direct trampoline that was registered
+ *
+ * Detaches the direct caller from @ip, drops the hash entry that maps the
+ * site to its trampoline and releases one reference on the descriptor of
+ * @addr, freeing the descriptor once no site uses it any more.
+ *
+ * Returns:
+ *  -ENODEV - no direct caller is attached at @ip
+ *  otherwise the result of removing @ip from the direct ops filter
+ *  (0 on success).
+ */
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+	struct ftrace_direct_func *direct;
+	struct ftrace_func_entry *entry;
+	int ret = -ENODEV;
+
+	mutex_lock(&direct_mutex);
+
+	entry = find_direct_entry(&ip, NULL);
+	if (!entry)
+		goto out_unlock;
+
+	/* Last direct call going away: take the ops down first. */
+	if (direct_functions->count == 1)
+		unregister_ftrace_function(&direct_ops);
+
+	ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+
+	WARN_ON(ret);
+
+	remove_hash_entry(direct_functions, entry);
+	/* remove_hash_entry() only unlinks; the entry itself was leaked. */
+	kfree(entry);
+
+	direct = ftrace_find_direct_func(addr);
+	if (!WARN_ON(!direct)) {
+		/* This is the good path (see the ! before WARN) */
+		direct->count--;
+		WARN_ON(direct->count < 0);
+		if (!direct->count) {
+			list_del_rcu(&direct->next);
+			synchronize_rcu_tasks();
+			kfree(direct);
+			ftrace_direct_func_count--;
+		}
+	}
+ out_unlock:
+	mutex_unlock(&direct_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
+
+/*
+ * Placeholder ops whose callback is ftrace_stub.
+ * ftrace_modify_direct_caller() attaches it to a call site temporarily
+ * while the site's direct target is being switched.
+ */
+static struct ftrace_ops stub_ops = {
+ .func = ftrace_stub,
+};
+
+/**
+ * ftrace_modify_direct_caller - modify ftrace nop directly
+ * @entry: The ftrace hash entry of the direct helper for @rec
+ * @rec: The record representing the function site to patch
+ * @old_addr: The location that the site at @rec->ip currently calls
+ * @new_addr: The location that the site at @rec->ip should call
+ *
+ * An architecture may overwrite this function to optimize the
+ * changing of the direct callback on an ftrace nop location.
+ * This is called with the ftrace_lock mutex held, and no other
+ * ftrace callbacks are on the associated record (@rec). Thus,
+ * it is safe to modify the ftrace record, where it should be
+ * currently calling @old_addr directly, to call @new_addr.
+ *
+ * Safety checks should be made to make sure that the code at
+ * @rec->ip is currently calling @old_addr. And this must
+ * also update entry->direct to @new_addr.
+ *
+ * Returns 0 on success. If the stub could not be attached, the error
+ * from ftrace_set_filter_ip() or register_ftrace_function() is returned
+ * and entry->direct is left untouched. ftrace_lock is held again on
+ * return in either case.
+ */
+int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
+ struct dyn_ftrace *rec,
+ unsigned long old_addr,
+ unsigned long new_addr)
+{
+ unsigned long ip = rec->ip;
+ int ret;
+
+ /*
+ * The ftrace_lock was used to determine if the record
+ * had more than one registered user to it. If it did,
+ * we needed to prevent that from changing to do the quick
+ * switch. But if it did not (only a direct caller was attached)
+ * then this function is called. But this function can deal
+ * with attached callers to the rec that we care about, and
+ * since this function uses standard ftrace calls that take
+ * the ftrace_lock mutex, we need to release it.
+ */
+ mutex_unlock(&ftrace_lock);
+
+ /*
+ * By setting a stub function at the same address, we force
+ * the code to call the iterator and the direct_ops helper.
+ * This means that @ip does not call the direct call, and
+ * we can simply modify it.
+ */
+ ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
+ if (ret)
+ goto out_lock;
+
+ ret = register_ftrace_function(&stub_ops);
+ if (ret) {
+ /* Undo the filter added above before bailing out. */
+ ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
+ goto out_lock;
+ }
+
+ entry->direct = new_addr;
+
+ /*
+ * By removing the stub, we put back the direct call, calling
+ * the @new_addr.
+ */
+ unregister_ftrace_function(&stub_ops);
+ ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
+
+ out_lock:
+ /* The caller unlocks ftrace_lock itself, so take it back. */
+ mutex_lock(&ftrace_lock);
+
+ return ret;
+}
+
+/**
+ * modify_ftrace_direct - Modify an existing direct call to call something else
+ * @ip: The instruction pointer to modify
+ * @old_addr: The address that the current @ip calls directly
+ * @new_addr: The address that the @ip should call
+ *
+ * Switches the direct caller attached at @ip from @old_addr to @new_addr
+ * without having to disable it first, so nothing is missed during the
+ * switch.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ *  -ENODEV : the @ip given has no direct caller attached
+ *  -EINVAL : the @old_addr does not match the current direct caller
+ */
+int modify_ftrace_direct(unsigned long ip,
+			 unsigned long old_addr, unsigned long new_addr)
+{
+	struct ftrace_func_entry *entry;
+	struct dyn_ftrace *rec;
+	int ret;
+
+	mutex_lock(&direct_mutex);
+	mutex_lock(&ftrace_lock);
+
+	entry = find_direct_entry(&ip, &rec);
+	if (!entry) {
+		ret = -ENODEV;
+	} else if (entry->direct != old_addr) {
+		ret = -EINVAL;
+	} else if (ftrace_rec_count(rec) == 1) {
+		/*
+		 * No other ftrace callback sits on rec->ip, so the call
+		 * site itself can be changed by the architecture.
+		 */
+		ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
+	} else {
+		/*
+		 * Another caller is present: only the direct caller helper
+		 * needs to be pointed at @new_addr.
+		 */
+		entry->direct = new_addr;
+		ret = 0;
+	}
+
+	mutex_unlock(&ftrace_lock);
+	mutex_unlock(&direct_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
/**
* ftrace_set_filter_ip - set a function to filter on in ftrace by address
* @ops - the ops to set the filter with
@@ -5818,6 +6317,8 @@ void ftrace_release_mod(struct module *mod)
free_pages((unsigned long)pg->records, order);
tmp_page = pg->next;
kfree(pg);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
}
@@ -6159,6 +6660,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
@@ -6214,6 +6717,9 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
+ pr_info("ftrace: allocated %ld pages with %ld groups\n",
+ ftrace_number_of_pages, ftrace_number_of_groups);
+
set_ftrace_early_filters();
return;
@@ -6754,6 +7260,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+/*
+ * Report whether any ops on ftrace_ops_list has FTRACE_OPS_FL_PERMANENT
+ * set. ftrace_enable_sysctl() uses this to refuse disabling ftrace with
+ * -EBUSY while such an ops is registered.
+ */
+static bool is_permanent_ops_registered(void)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->flags & FTRACE_OPS_FL_PERMANENT)
+ return true;
+ } while_for_each_ftrace_op(op);
+
+ return false;
+}
+
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -6771,8 +7289,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
- last_ftrace_enabled = !!ftrace_enabled;
-
if (ftrace_enabled) {
/* we are starting ftrace again */
@@ -6783,12 +7299,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
} else {
+ if (is_permanent_ops_registered()) {
+ ftrace_enabled = true;
+ ret = -EBUSY;
+ goto out;
+ }
+
/* stopping ftrace calls (just send to ftrace_stub) */
ftrace_trace_function = ftrace_stub;
ftrace_shutdown_sysctl();
}
+ last_ftrace_enabled = !!ftrace_enabled;
out:
mutex_unlock(&ftrace_lock);
return ret;
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index d8765c952fab..31c0fad4cb9e 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -10,18 +10,25 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel.h>
+#include <linux/kobject.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/string.h>
+#include <linux/sysfs.h>
static ulong delay = 100;
-static char test_mode[10] = "irq";
+static char test_mode[12] = "irq";
+static uint burst_size = 1;
-module_param_named(delay, delay, ulong, S_IRUGO);
-module_param_string(test_mode, test_mode, 10, S_IRUGO);
-MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)");
-MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)");
+module_param_named(delay, delay, ulong, 0444);
+module_param_string(test_mode, test_mode, 12, 0444);
+module_param_named(burst_size, burst_size, uint, 0444);
+MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)");
+MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)");
+MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)");
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
static void busy_wait(ulong time)
{
@@ -34,37 +41,136 @@ static void busy_wait(ulong time)
} while ((end - start) < (time * 1000));
}
-static int preemptirq_delay_run(void *data)
+static __always_inline void irqoff_test(void)
{
unsigned long flags;
+ local_irq_save(flags);
+ busy_wait(delay);
+ local_irq_restore(flags);
+}
- if (!strcmp(test_mode, "irq")) {
- local_irq_save(flags);
- busy_wait(delay);
- local_irq_restore(flags);
- } else if (!strcmp(test_mode, "preempt")) {
- preempt_disable();
- busy_wait(delay);
- preempt_enable();
+/* Busy-wait for the "delay" module parameter's period with preemption disabled. */
+static __always_inline void preemptoff_test(void)
+{
+ preempt_disable();
+ busy_wait(delay);
+ preempt_enable();
+}
+
+static void execute_preemptirqtest(int idx)
+{
+ if (!strcmp(test_mode, "irq"))
+ irqoff_test();
+ else if (!strcmp(test_mode, "preempt"))
+ preemptoff_test();
+ else if (!strcmp(test_mode, "alternate")) {
+ if (idx % 2 == 0)
+ irqoff_test();
+ else
+ preemptoff_test();
}
+}
+
+/*
+ * Generate preemptirqtest_<POSTFIX>(), a distinct wrapper around
+ * execute_preemptirqtest(). The last line carries no continuation so
+ * the macro ends at the closing brace instead of absorbing the next line.
+ */
+#define DECLARE_TESTFN(POSTFIX)				\
+	static void preemptirqtest_##POSTFIX(int idx)	\
+	{						\
+		execute_preemptirqtest(idx);		\
+	}
+/*
+ * We create 10 different functions, so that we can get 10 different
+ * backtraces.
+ */
+DECLARE_TESTFN(0)
+DECLARE_TESTFN(1)
+DECLARE_TESTFN(2)
+DECLARE_TESTFN(3)
+DECLARE_TESTFN(4)
+DECLARE_TESTFN(5)
+DECLARE_TESTFN(6)
+DECLARE_TESTFN(7)
+DECLARE_TESTFN(8)
+DECLARE_TESTFN(9)
+
+/*
+ * The generated wrappers, in order. preemptirq_delay_run() calls the
+ * first MIN(burst_size, NR_TEST_FUNCS) of them, passing each its index.
+ */
+static void (*testfuncs[])(int) = {
+ preemptirqtest_0,
+ preemptirqtest_1,
+ preemptirqtest_2,
+ preemptirqtest_3,
+ preemptirqtest_4,
+ preemptirqtest_5,
+ preemptirqtest_6,
+ preemptirqtest_7,
+ preemptirqtest_8,
+ preemptirqtest_9,
+};
+
+/* Number of entries in testfuncs[]; upper bound for one burst. */
+#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs)
+
+static int preemptirq_delay_run(void *data)
+{
+ int i;
+ int s = MIN(burst_size, NR_TEST_FUNCS);
+
+ for (i = 0; i < s; i++)
+ (testfuncs[i])(i);
return 0;
}
-static int __init preemptirq_delay_init(void)
+static struct task_struct *preemptirq_start_test(void)
{
char task_name[50];
- struct task_struct *test_task;
snprintf(task_name, sizeof(task_name), "%s_test", test_mode);
+ return kthread_run(preemptirq_delay_run, NULL, task_name);
+}
+
+
+/*
+ * Store handler of the "trigger" sysfs attribute: any write starts
+ * another test thread; the written data itself is ignored.
+ *
+ * Returns @count when the thread was started, or the error encoded in
+ * the pointer returned by preemptirq_start_test() so that a failed
+ * start is reported to the writer instead of being dropped.
+ */
+static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct task_struct *test_task = preemptirq_start_test();
+
+	if (IS_ERR(test_task))
+		return PTR_ERR(test_task);
+
+	return count;
+}
+
+/* Write-only (0200) attribute "trigger"; writes are handled by trigger_store(). */
+static struct kobj_attribute trigger_attribute =
+ __ATTR(trigger, 0200, NULL, trigger_store);
+
+/* NULL-terminated attribute list referenced by attr_group. */
+static struct attribute *attrs[] = {
+ &trigger_attribute.attr,
+ NULL,
+};
+
+/* Group passed to sysfs_create_group() in preemptirq_delay_init(). */
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/* Created in preemptirq_delay_init(), dropped with kobject_put() on exit. */
+static struct kobject *preemptirq_delay_kobj;
+
+static int __init preemptirq_delay_init(void)
+{
+ struct task_struct *test_task;
+ int retval;
+
+ test_task = preemptirq_start_test();
+ retval = PTR_ERR_OR_ZERO(test_task);
+ if (retval != 0)
+ return retval;
+
+ preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test",
+ kernel_kobj);
+ if (!preemptirq_delay_kobj)
+ return -ENOMEM;
+
+ retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group);
+ if (retval)
+ kobject_put(preemptirq_delay_kobj);
- test_task = kthread_run(preemptirq_delay_run, NULL, task_name);
- return PTR_ERR_OR_ZERO(test_task);
+ return retval;
}
static void __exit preemptirq_delay_exit(void)
{
- return;
+ kobject_put(preemptirq_delay_kobj);
}
module_init(preemptirq_delay_init)
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 09b0b49f346e..32149e46551c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -269,10 +269,10 @@ static void ring_buffer_producer(void)
#ifndef CONFIG_PREEMPTION
/*
- * If we are a non preempt kernel, the 10 second run will
+ * If we are a non preempt kernel, the 10 seconds run will
* stop everything while it runs. Instead, we will call
* cond_resched and also add any time that was lost by a
- * rescedule.
+ * reschedule.
*
* Do a cond resched at the same frequency we would wake up
* the reader.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6a0ee9178365..02a23a6e5e00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -45,6 +45,9 @@
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
+#include <linux/fsnotify.h>
+#include <linux/irq_work.h>
+#include <linux/workqueue.h>
#include "trace.h"
#include "trace_output.h"
@@ -298,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr)
this_tr->ref--;
}
+/**
+ * trace_array_put - Decrement the reference counter for this trace array.
+ *
+ * NOTE: Use this when we no longer need the trace array returned by
+ * trace_array_get_by_name(). This ensures the trace array can be later
+ * destroyed.
+ *
+ */
void trace_array_put(struct trace_array *this_tr)
{
+ if (!this_tr)
+ return;
+
mutex_lock(&trace_types_lock);
__trace_array_put(this_tr);
mutex_unlock(&trace_types_lock);
}
+EXPORT_SYMBOL_GPL(trace_array_put);
int tracing_check_open_get_tr(struct trace_array *tr)
{
@@ -1497,6 +1512,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
}
unsigned long __read_mostly tracing_thresh;
+static const struct file_operations tracing_max_lat_fops;
+
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+static struct workqueue_struct *fsnotify_wq;
+
+/*
+ * Workqueue handler: send an FS_MODIFY fsnotify event for the inode
+ * behind the trace array's tracing_max_latency file.
+ */
+static void latency_fsnotify_workfn(struct work_struct *work)
+{
+	struct trace_array *tr;
+	struct inode *inode;
+
+	tr = container_of(work, struct trace_array, fsnotify_work);
+	inode = tr->d_max_latency->d_inode;
+
+	fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
+/*
+ * irq_work handler: hand the notification over to fsnotify_wq, where
+ * latency_fsnotify_workfn() performs the actual fsnotify() call.
+ */
+static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
+{
+	struct trace_array *owner;
+
+	owner = container_of(iwork, struct trace_array, fsnotify_irqwork);
+	queue_work(fsnotify_wq, &owner->fsnotify_work);
+}
+
+/*
+ * Set up the fsnotify work items of @tr, then create its
+ * "tracing_max_latency" file under @d_tracer and remember the dentry
+ * so the notification handler can reach the inode.
+ */
+static void trace_create_maxlat_file(struct trace_array *tr,
+				     struct dentry *d_tracer)
+{
+	struct dentry *latency_file;
+
+	init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
+	INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+
+	latency_file = trace_create_file("tracing_max_latency", 0644,
+					 d_tracer, &tr->max_latency,
+					 &tracing_max_lat_fops);
+	tr->d_max_latency = latency_file;
+}
+
+/*
+ * Late initcall: allocate the unbound, high-priority workqueue used to
+ * deliver max-latency fsnotify events. Returns 0 on success or -ENOMEM
+ * when the workqueue cannot be allocated.
+ */
+__init static int latency_fsnotify_init(void)
+{
+	fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
+				      WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (fsnotify_wq)
+		return 0;
+
+	pr_err("Unable to allocate tr_max_lat_wq\n");
+	return -ENOMEM;
+}
+
+late_initcall_sync(latency_fsnotify_init);
+
+/*
+ * Request an fsnotify event for @tr's max-latency file. Does nothing
+ * until fsnotify_wq has been allocated.
+ */
+void latency_fsnotify(struct trace_array *tr)
+{
+	/*
+	 * We cannot call queue_work(&tr->fsnotify_work) from here because it's
+	 * possible that we are called from __schedule() or do_idle(), which
+	 * could cause a deadlock. Go through irq_work instead.
+	 */
+	if (fsnotify_wq)
+		irq_work_queue(&tr->fsnotify_irqwork);
+}
+
+/*
+ * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ * defined(CONFIG_FSNOTIFY)
+ */
+#else
+
+/*
+ * Fallback without fsnotify support: just create the file. The macro
+ * arguments are parenthesized so any pointer expression can be passed.
+ */
+#define trace_create_maxlat_file(tr, d_tracer)				\
+	trace_create_file("tracing_max_latency", 0644, (d_tracer),	\
+			  &(tr)->max_latency, &tracing_max_lat_fops)
+
+#endif
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -1536,6 +1619,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
/* record this tasks comm */
tracing_record_cmdline(tsk);
+ latency_fsnotify(tr);
}
/**
@@ -3225,6 +3309,9 @@ int trace_array_printk(struct trace_array *tr,
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
+ if (!tr)
+ return -ENOENT;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -3654,6 +3741,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
"desktop",
#elif defined(CONFIG_PREEMPT)
"preempt",
+#elif defined(CONFIG_PREEMPT_RT)
+ "preempt_rt",
#else
"unknown",
#endif
@@ -4609,7 +4698,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map)
- tgid_map = kcalloc(PID_MAX_DEFAULT + 1,
+ tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
sizeof(*tgid_map),
GFP_KERNEL);
if (!tgid_map) {
@@ -7583,14 +7672,23 @@ static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- unsigned long *p = filp->private_data;
- char buf[64]; /* Not too big for a shallow stack */
+ ssize_t ret;
+ char *buf;
int r;
- r = scnprintf(buf, 63, "%ld", *p);
- buf[r++] = '\n';
+ /* 256 should be plenty to hold the amount needed */
+ buf = kmalloc(256, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ ftrace_update_tot_cnt,
+ ftrace_number_of_pages,
+ ftrace_number_of_groups);
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ kfree(buf);
+ return ret;
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -8351,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr)
mutex_unlock(&trace_types_lock);
}
-struct trace_array *trace_array_create(const char *name)
+static struct trace_array *trace_array_create(const char *name)
{
struct trace_array *tr;
int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
-
- ret = -EEXIST;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
- }
-
ret = -ENOMEM;
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
- goto out_unlock;
+ return ERR_PTR(ret);
tr->name = kstrdup(name, GFP_KERNEL);
if (!tr->name)
@@ -8413,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name)
list_add(&tr->list, &ftrace_trace_arrays);
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ tr->ref++;
+
return tr;
@@ -8424,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name)
kfree(tr->name);
kfree(tr);
- out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
return ERR_PTR(ret);
}
-EXPORT_SYMBOL_GPL(trace_array_create);
static int instance_mkdir(const char *name)
{
- return PTR_ERR_OR_ZERO(trace_array_create(name));
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ ret = -EEXIST;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ tr = trace_array_create(name);
+
+ ret = PTR_ERR_OR_ZERO(tr);
+
+out_unlock:
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
+ return ret;
+}
+
+/**
+ * trace_array_get_by_name - Create/Lookup a trace array, given its name.
+ * @name: The name of the trace array to be looked up/created.
+ *
+ * Returns pointer to trace array with given name.
+ * NULL, if it cannot be created.
+ *
+ * NOTE: This function increments the reference counter associated with the
+ * trace array returned. This makes sure it cannot be freed while in use.
+ * Use trace_array_put() once the trace array is no longer needed.
+ *
+ */
+struct trace_array *trace_array_get_by_name(const char *name)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ /* Reuse an existing instance with this name, if there is one. */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ tr = trace_array_create(name);
+
+ /* Callers get NULL rather than an ERR_PTR on creation failure. */
+ if (IS_ERR(tr))
+ tr = NULL;
+out_unlock:
+ /* Caller's reference, taken for a found and a new array alike. */
+ if (tr)
+ tr->ref++;
+
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
+ return tr;
}
+EXPORT_SYMBOL_GPL(trace_array_get_by_name);
static int __remove_instance(struct trace_array *tr)
{
int i;
- if (tr->ref || (tr->current_trace && tr->current_trace->ref))
+ /* Reference counter for a newly created trace array = 1. */
+ if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
return -EBUSY;
list_del(&tr->list);
@@ -8473,17 +8615,26 @@ static int __remove_instance(struct trace_array *tr)
return 0;
}
-int trace_array_destroy(struct trace_array *tr)
+int trace_array_destroy(struct trace_array *this_tr)
{
+ struct trace_array *tr;
int ret;
- if (!tr)
+ if (!this_tr)
return -EINVAL;
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
- ret = __remove_instance(tr);
+ ret = -ENODEV;
+
+ /* Making sure trace array exists before destroying it. */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == this_tr) {
+ ret = __remove_instance(tr);
+ break;
+ }
+ }
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8585,8 +8736,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tr->max_latency, &tracing_max_lat_fops);
+ trace_create_maxlat_file(tr, d_tracer);
#endif
if (ftrace_create_function_files(tr, d_tracer))
@@ -8782,7 +8932,7 @@ static __init int tracer_init_tracefs(void)
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+ NULL, &tracing_dyn_info_fops);
#endif
create_trace_instances(d_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d685c61085c0..ca7fccafbcbb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,11 +11,14 @@
#include <linux/mmiotrace.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
+#include <linux/trace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/glob.h>
+#include <linux/irq_work.h>
+#include <linux/workqueue.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -264,6 +267,11 @@ struct trace_array {
#endif
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
unsigned long max_latency;
+#ifdef CONFIG_FSNOTIFY
+ struct dentry *d_max_latency;
+ struct work_struct fsnotify_work;
+ struct irq_work fsnotify_irqwork;
+#endif
#endif
struct trace_pid_list __rcu *filtered_pids;
/*
@@ -337,7 +345,6 @@ extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
-extern void trace_array_put(struct trace_array *tr);
extern int tracing_check_open_get_tr(struct trace_array *tr);
extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
@@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
+/*
+ * latency_fsnotify() has a real implementation only when CONFIG_FSNOTIFY
+ * and one of CONFIG_TRACER_MAX_TRACE / CONFIG_HWLAT_TRACER are enabled;
+ * otherwise callers get the empty inline stub below.
+ */
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
+ defined(CONFIG_FSNOTIFY)
+
+void latency_fsnotify(struct trace_array *tr);
+
+#else
+
+static inline void latency_fsnotify(struct trace_array *tr) { }
+
+#endif
+
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
@@ -804,6 +822,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+extern unsigned long ftrace_number_of_pages;
+extern unsigned long ftrace_number_of_groups;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -853,8 +873,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...);
int trace_array_printk_buf(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
@@ -1870,7 +1888,6 @@ extern const char *__start___tracepoint_str[];
extern const char *__stop___tracepoint_str[];
void trace_printk_control(bool enabled);
-void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3ea65cdff30d..88e158d27965 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m)
return 0;
}
-static inline long get_incorrect_percent(struct ftrace_branch_data *p)
+static inline long get_incorrect_percent(const struct ftrace_branch_data *p)
{
long percent;
@@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx)
return p;
}
-static int annotated_branch_stat_cmp(void *p1, void *p2)
+static int annotated_branch_stat_cmp(const void *p1, const void *p2)
{
- struct ftrace_branch_data *a = p1;
- struct ftrace_branch_data *b = p2;
+ const struct ftrace_branch_data *a = p1;
+ const struct ftrace_branch_data *b = p2;
long percent_a, percent_b;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a9dfa04ffa44..643e0b19920d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"
@@ -26,8 +27,10 @@ static int total_ref_count;
static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
+ int ret;
+
if (tp_event->perf_perm) {
- int ret = tp_event->perf_perm(tp_event, p_event);
+ ret = tp_event->perf_perm(tp_event, p_event);
if (ret)
return ret;
}
@@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
if (!is_sampling_event(p_event))
return 0;
@@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
return 0;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fba87d10f0c1..6b3a69e9aa6a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
char *event = NULL, *sub = NULL, *match;
int ret;
+ if (!tr)
+ return -ENOENT;
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -825,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_set_clr_event);
/**
* trace_set_clr_event - enable or disable an event
@@ -850,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set)
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
+/**
+ * trace_array_set_clr_event - enable or disable an event for a trace array.
+ * @tr: concerned trace array.
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @enable: true to enable, false to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -ENOENT if @tr is NULL, -EINVAL if the
+ * parameters do not match any registered events.
+ */
+int trace_array_set_clr_event(struct trace_array *tr, const char *system,
+		const char *event, bool enable)
+{
+	if (!tr)
+		return -ENOENT;
+
+	/* A bool converts to exactly 0 or 1, so it can be passed as "set". */
+	return __ftrace_set_clr_event(tr, NULL, system, event, enable);
+}
+EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
+
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 7482a1466ebf..f49d1a36d3ae 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -23,7 +23,7 @@
#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 16
+#define SYNTH_FIELDS_MAX 32
#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 45630a76ed3a..2e6d2e9741cc 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
-struct trace_event_class __refdata event_class_ftrace_##call = { \
+static struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
@@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \
.print_fmt = print, \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
-struct trace_event_call __used \
+static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#undef FTRACE_ENTRY
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 862f4b0139fc..6638d63f0921 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * trace_hwlatdetect.c - A simple Hardware Latency detector.
+ * trace_hwlat.c - A simple Hardware Latency detector.
*
* Use this tracer to detect large system latencies induced by the behavior of
* certain underlying system hardware or firmware, independent of Linux itself.
@@ -237,6 +237,7 @@ static int get_sample(void)
/* If we exceed the threshold value, we have found a hardware latency */
if (sample > thresh || outer_sample > thresh) {
struct hwlat_sample s;
+ u64 latency;
ret = 1;
@@ -253,11 +254,13 @@ static int get_sample(void)
s.nmi_count = nmi_count;
trace_hwlat_sample(&s);
+ latency = max(sample, outer_sample);
+
/* Keep a running maximum ever recorded hardware latency */
- if (sample > tr->max_latency)
- tr->max_latency = sample;
- if (outer_sample > tr->max_latency)
- tr->max_latency = outer_sample;
+ if (latency > tr->max_latency) {
+ tr->max_latency = latency;
+ latency_fsnotify(tr);
+ }
}
out:
@@ -276,7 +279,7 @@ static void move_to_next_cpu(void)
return;
/*
* If for some reason the user modifies the CPU affinity
- * of this thread, than stop migrating for the duration
+ * of this thread, then stop migrating for the duration
* of the current test.
*/
if (!cpumask_equal(current_mask, current->cpus_ptr))
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1552a95c743b..7f890262c8a3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call,
#if defined(CONFIG_KPROBES_ON_FTRACE) && \
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
-static bool within_notrace_func(struct trace_kprobe *tk)
+static bool __within_notrace_func(unsigned long addr)
{
- unsigned long offset, size, addr;
+ unsigned long offset, size;
- addr = trace_kprobe_address(tk);
if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset))
return false;
@@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk)
*/
return !ftrace_location_range(addr, addr + size - 1);
}
+
+/*
+ * Decide whether the probe address of @tk must be treated as lying in a
+ * function that cannot be traced.
+ *
+ * The address itself is checked first with __within_notrace_func(); when
+ * that check says "no", the probe is accepted straight away.  Otherwise,
+ * if the symbol covering the address carries a '.' suffix, the suffix is
+ * cut off and the check is repeated on the address of the unsuffixed
+ * symbol, whose verdict is then returned.
+ * NOTE(review): the suffixed names are presumably compiler-generated
+ * clones of the base function - confirm against the commit message.
+ *
+ * Returns true when the probe should be rejected, false otherwise.
+ */
+static bool within_notrace_func(struct trace_kprobe *tk)
+{
+	unsigned long addr = trace_kprobe_address(tk);
+	char symname[KSYM_NAME_LEN], *p;
+
+	if (!__within_notrace_func(addr))
+		return false;
+
+	/* Check if the address is on a suffixed-symbol */
+	if (!lookup_symbol_name(addr, symname)) {
+		p = strchr(symname, '.');
+		if (!p)
+			return true;
+		/* Truncate at the first '.' to get the base symbol name. */
+		*p = '\0';
+		addr = (unsigned long)kprobe_lookup_name(symname, 0);
+		if (addr)
+			return __within_notrace_func(addr);
+	}
+
+	/* Name lookup failed or base symbol not found: keep the first verdict. */
+	return true;
+}
#else
#define within_notrace_func(tk) (false)
#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d54ce252b05a..d9b4b7c22db4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
}
EXPORT_SYMBOL(trace_print_array_seq);
+/*
+ * Append a newline, a hex dump of @buf and a terminating NUL byte to @p.
+ *
+ * The prefix, row/group size, length and ascii arguments are handed
+ * unchanged to trace_seq_hex_dump().
+ *
+ * Returns the pointer obtained from trace_seq_buffer_ptr() before
+ * anything was appended.
+ */
+const char *
+trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str,
+			 int prefix_type, int rowsize, int groupsize,
+			 const void *buf, size_t len, bool ascii)
+{
+	const char *start;
+
+	/* Must be sampled first: the calls below add to the buffer. */
+	start = trace_seq_buffer_ptr(p);
+
+	trace_seq_putc(p, '\n');
+	trace_seq_hex_dump(p, prefix_str, prefix_type, rowsize, groupsize,
+			   buf, len, ascii);
+	trace_seq_putc(p, '\0');
+
+	return start;
+}
+EXPORT_SYMBOL(trace_print_hex_dump_seq);
+
int trace_raw_output_prep(struct trace_iterator *iter,
struct trace_event *trace_event)
{
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 6b1c562ffdaf..344e4c1aa09c 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
return seq_buf_to_user(&s->seq, ubuf, cnt);
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
+
+/*
+ * trace_seq_hex_dump - write a hex dump of a buffer into a trace_seq
+ * @s: trace sequence descriptor
+ * @prefix_str, @prefix_type, @rowsize, @groupsize, @buf, @len, @ascii:
+ *	passed through unchanged to seq_buf_hex_dump()
+ *
+ * If the dump overflows the sequence buffer, the length recorded on
+ * entry is restored and @s is marked full.
+ *
+ * Returns 1 if the dump was written, 0 if @s was already full, had no
+ * room left, or overflowed.
+ */
+int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
+		       int prefix_type, int rowsize, int groupsize,
+		       const void *buf, size_t len, bool ascii)
+{
+	unsigned int len_before = s->seq.len;
+
+	if (s->full)
+		return 0;
+
+	__trace_seq_init(s);
+
+	if (TRACE_SEQ_BUF_LEFT(s) < 1)
+		goto mark_full;
+
+	seq_buf_hex_dump(&s->seq, prefix_str, prefix_type, rowsize,
+			 groupsize, buf, len, ascii);
+
+	if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+		/* Drop the partial output. */
+		s->seq.len = len_before;
+		goto mark_full;
+	}
+
+	return 1;
+
+mark_full:
+	s->full = 1;
+	return 0;
+}
+EXPORT_SYMBOL(trace_seq_hex_dump);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 9ab0a1a7ad5e..874f1274cf99 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -72,9 +72,7 @@ static void destroy_session(struct stat_session *session)
kfree(session);
}
-typedef int (*cmp_stat_t)(void *, void *);
-
-static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp)
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
struct stat_node *data;
@@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
* This one will force an insertion as right-most node
* in the rbtree.
*/
-static int dummy_cmp(void *p1, void *p2)
+static int dummy_cmp(const void *p1, const void *p2)
{
return -1;
}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 8786d17caf49..31d7dc5bf1db 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -16,7 +16,7 @@ struct tracer_stat {
void *(*stat_start)(struct tracer_stat *trace);
void *(*stat_next)(void *prev, int idx);
/* Compare two entries for stats sorting */
- int (*stat_cmp)(void *p1, void *p2);
+ cmp_func_t stat_cmp;
/* Print a stat entry */
int (*stat_show)(struct seq_file *s, void *p);
/* Release an entry */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index fa8fbff736d6..16fa218556fa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
+#include <linux/xarray.h>
#include <asm/syscall.h>
#include "trace_output.h"
@@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call)
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];
+static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
@@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall)
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
+ if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
+ return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
+
if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
return NULL;
@@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void)
struct syscall_metadata *meta;
unsigned long addr;
int i;
-
- syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
- GFP_KERNEL);
- if (!syscalls_metadata) {
- WARN_ON(1);
- return;
+ void *ret;
+
+ if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+ syscalls_metadata = kcalloc(NR_syscalls,
+ sizeof(*syscalls_metadata),
+ GFP_KERNEL);
+ if (!syscalls_metadata) {
+ WARN_ON(1);
+ return;
+ }
}
for (i = 0; i < NR_syscalls; i++) {
@@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void)
continue;
meta->syscall_nr = i;
- syscalls_metadata[i] = meta;
+
+ if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
+ syscalls_metadata[i] = meta;
+ } else {
+ ret = xa_store(&syscalls_metadata_sparse, i, meta,
+ GFP_KERNEL);
+ WARN(xa_is_err(ret),
+ "Syscall memory allocation failed\n");
+ }
+
}
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 914b845ad4ff..bc88fd939f4e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -365,11 +365,6 @@ static void show_pwq(struct pool_workqueue *pwq);
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
-#define assert_rcu_or_wq_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
- !lockdep_is_held(&wq->mutex), \
- "RCU or wq->mutex should be held")
-
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex) && \
@@ -427,9 +422,7 @@ static void show_pwq(struct pool_workqueue *pwq);
*/
#define for_each_pwq(pwq, wq) \
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
- lockdep_is_held(&wq->mutex)) \
- if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
- else
+ lockdep_is_held(&(wq)->mutex))
#ifdef CONFIG_DEBUG_OBJECTS_WORK