Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.locks 4
-rw-r--r-- kernel/Makefile 40
-rw-r--r-- kernel/acct.c 502
-rw-r--r-- kernel/async.c 8
-rw-r--r-- kernel/audit.c 15
-rw-r--r-- kernel/audit.h 17
-rw-r--r-- kernel/audit_tree.c 16
-rw-r--r-- kernel/auditfilter.c 4
-rw-r--r-- kernel/auditsc.c 183
-rw-r--r-- kernel/bounds.c 2
-rw-r--r-- kernel/bpf/Makefile 2
-rw-r--r-- kernel/bpf/arraymap.c 156
-rw-r--r-- kernel/bpf/core.c 674
-rw-r--r-- kernel/bpf/hashtab.c 367
-rw-r--r-- kernel/bpf/helpers.c 113
-rw-r--r-- kernel/bpf/syscall.c 621
-rw-r--r-- kernel/bpf/verifier.c 2146
-rw-r--r-- kernel/capability.c 39
-rw-r--r-- kernel/cgroup.c 823
-rw-r--r-- kernel/cgroup_freezer.c 2
-rw-r--r-- kernel/compat.c 29
-rw-r--r-- kernel/configs/tiny.config 4
-rw-r--r-- kernel/context_tracking.c 95
-rw-r--r-- kernel/cpu.c 129
-rw-r--r-- kernel/cpuset.c 757
-rw-r--r-- kernel/crash_dump.c 1
-rw-r--r-- kernel/cred.c 3
-rw-r--r-- kernel/debug/debug_core.c 69
-rw-r--r-- kernel/debug/kdb/kdb_bp.c 43
-rw-r--r-- kernel/debug/kdb/kdb_debugger.c 4
-rw-r--r-- kernel/debug/kdb/kdb_io.c 46
-rw-r--r-- kernel/debug/kdb/kdb_main.c 285
-rw-r--r-- kernel/debug/kdb/kdb_private.h 7
-rw-r--r-- kernel/delayacct.c 62
-rw-r--r-- kernel/events/Makefile 2
-rw-r--r-- kernel/events/callchain.c 6
-rw-r--r-- kernel/events/core.c 1651
-rw-r--r-- kernel/events/hw_breakpoint.c 15
-rw-r--r-- kernel/events/internal.h 33
-rw-r--r-- kernel/events/ring_buffer.c 330
-rw-r--r-- kernel/events/uprobes.c 24
-rw-r--r-- kernel/exec_domain.c 137
-rw-r--r-- kernel/exit.c 368
-rw-r--r-- kernel/extable.c 7
-rw-r--r-- kernel/fork.c 324
-rw-r--r-- kernel/freezer.c 9
-rw-r--r-- kernel/futex.c 451
-rw-r--r-- kernel/gcov/Kconfig 5
-rw-r--r-- kernel/gcov/Makefile 36
-rw-r--r-- kernel/gcov/base.c 5
-rw-r--r-- kernel/gcov/fs.c 3
-rw-r--r-- kernel/groups.c 14
-rw-r--r-- kernel/hung_task.c 4
-rw-r--r-- kernel/irq/Kconfig 18
-rw-r--r-- kernel/irq/Makefile 1
-rw-r--r-- kernel/irq/chip.c 234
-rw-r--r-- kernel/irq/devres.c 2
-rw-r--r-- kernel/irq/generic-chip.c 41
-rw-r--r-- kernel/irq/internals.h 20
-rw-r--r-- kernel/irq/irqdesc.c 94
-rw-r--r-- kernel/irq/irqdomain.c 569
-rw-r--r-- kernel/irq/manage.c 169
-rw-r--r-- kernel/irq/msi.c 337
-rw-r--r-- kernel/irq/pm.c 164
-rw-r--r-- kernel/irq/proc.c 33
-rw-r--r-- kernel/irq_work.c 123
-rw-r--r-- kernel/kallsyms.c 13
-rw-r--r-- kernel/kcmp.c 7
-rw-r--r-- kernel/kexec.c 1309
-rw-r--r-- kernel/kmod.c 111
-rw-r--r-- kernel/kprobes.c 57
-rw-r--r-- kernel/kthread.c 4
-rw-r--r-- kernel/livepatch/Kconfig 18
-rw-r--r-- kernel/livepatch/Makefile 3
-rw-r--r-- kernel/livepatch/core.c 1003
-rw-r--r-- kernel/locking/Makefile 11
-rw-r--r-- kernel/locking/lockdep.c 99
-rw-r--r-- kernel/locking/locktorture.c 529
-rw-r--r-- kernel/locking/mcs_spinlock.h 29
-rw-r--r-- kernel/locking/mutex-debug.c 2
-rw-r--r-- kernel/locking/mutex.c 536
-rw-r--r-- kernel/locking/mutex.h 2
-rw-r--r-- kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c) 29
-rw-r--r-- kernel/locking/qrwlock.c 9
-rw-r--r-- kernel/locking/rtmutex-debug.c 5
-rw-r--r-- kernel/locking/rtmutex-debug.h 7
-rw-r--r-- kernel/locking/rtmutex.c 577
-rw-r--r-- kernel/locking/rtmutex.h 7
-rw-r--r-- kernel/locking/rtmutex_common.h 22
-rw-r--r-- kernel/locking/rwsem-spinlock.c 9
-rw-r--r-- kernel/locking/rwsem-xadd.c 132
-rw-r--r-- kernel/locking/rwsem.c 22
-rw-r--r-- kernel/locking/rwsem.h 20
-rw-r--r-- kernel/locking/semaphore.c 12
-rw-r--r-- kernel/locking/spinlock.c 8
-rw-r--r-- kernel/module.c 341
-rw-r--r-- kernel/notifier.c 3
-rw-r--r-- kernel/nsproxy.c 25
-rw-r--r-- kernel/padata.c 11
-rw-r--r-- kernel/panic.c 17
-rw-r--r-- kernel/params.c 129
-rw-r--r-- kernel/pid.c 22
-rw-r--r-- kernel/pid_namespace.c 57
-rw-r--r-- kernel/power/Kconfig 25
-rw-r--r-- kernel/power/hibernate.c 28
-rw-r--r-- kernel/power/main.c 27
-rw-r--r-- kernel/power/power.h 11
-rw-r--r-- kernel/power/process.c 53
-rw-r--r-- kernel/power/qos.c 118
-rw-r--r-- kernel/power/snapshot.c 518
-rw-r--r-- kernel/power/suspend.c 239
-rw-r--r-- kernel/power/suspend_test.c 65
-rw-r--r-- kernel/power/swap.c 43
-rw-r--r-- kernel/printk/console_cmdline.h 2
-rw-r--r-- kernel/printk/printk.c 373
-rw-r--r-- kernel/profile.c 3
-rw-r--r-- kernel/ptrace.c 71
-rw-r--r-- kernel/range.c 10
-rw-r--r-- kernel/rcu/Makefile 5
-rw-r--r-- kernel/rcu/rcu.h 16
-rw-r--r-- kernel/rcu/rcutorture.c 372
-rw-r--r-- kernel/rcu/srcu.c 25
-rw-r--r-- kernel/rcu/tiny.c 143
-rw-r--r-- kernel/rcu/tiny_plugin.h 9
-rw-r--r-- kernel/rcu/tree.c 1042
-rw-r--r-- kernel/rcu/tree.h 112
-rw-r--r-- kernel/rcu/tree_plugin.h 1238
-rw-r--r-- kernel/rcu/tree_trace.c 12
-rw-r--r-- kernel/rcu/update.c 507
-rw-r--r-- kernel/reboot.c 134
-rw-r--r-- kernel/res_counter.c 211
-rw-r--r-- kernel/resource.c 261
-rw-r--r-- kernel/sched/Makefile 2
-rw-r--r-- kernel/sched/auto_group.c 11
-rw-r--r-- kernel/sched/clock.c 15
-rw-r--r-- kernel/sched/completion.c 36
-rw-r--r-- kernel/sched/core.c 1147
-rw-r--r-- kernel/sched/cpuacct.c 2
-rw-r--r-- kernel/sched/cpudeadline.c 29
-rw-r--r-- kernel/sched/cpudeadline.h 5
-rw-r--r-- kernel/sched/cpupri.h 3
-rw-r--r-- kernel/sched/cputime.c 64
-rw-r--r-- kernel/sched/deadline.c 364
-rw-r--r-- kernel/sched/debug.c 37
-rw-r--r-- kernel/sched/fair.c 1530
-rw-r--r-- kernel/sched/features.h 13
-rw-r--r-- kernel/sched/idle.c 85
-rw-r--r-- kernel/sched/idle_task.c 7
-rw-r--r-- kernel/sched/proc.c 7
-rw-r--r-- kernel/sched/rt.c 279
-rw-r--r-- kernel/sched/sched.h 303
-rw-r--r-- kernel/sched/stats.c 11
-rw-r--r-- kernel/sched/stop_task.c 7
-rw-r--r-- kernel/sched/wait.c 132
-rw-r--r-- kernel/seccomp.c 681
-rw-r--r-- kernel/signal.c 116
-rw-r--r-- kernel/smp.c 117
-rw-r--r-- kernel/smpboot.c 173
-rw-r--r-- kernel/softirq.c 15
-rw-r--r-- kernel/stacktrace.c 32
-rw-r--r-- kernel/sys.c 543
-rw-r--r-- kernel/sys_ni.c 30
-rw-r--r-- kernel/sysctl.c 110
-rw-r--r-- kernel/sysctl_binary.c 3
-rw-r--r-- kernel/system_keyring.c 1
-rw-r--r-- kernel/taskstats.c 17
-rw-r--r-- kernel/test_kprobes.c 87
-rw-r--r-- kernel/time/Kconfig 15
-rw-r--r-- kernel/time/Makefile 27
-rw-r--r-- kernel/time/alarmtimer.c 38
-rw-r--r-- kernel/time/clockevents.c 233
-rw-r--r-- kernel/time/clocksource.c 261
-rw-r--r-- kernel/time/hrtimer.c (renamed from kernel/hrtimer.c) 265
-rw-r--r-- kernel/time/itimer.c (renamed from kernel/itimer.c) 0
-rw-r--r-- kernel/time/jiffies.c 7
-rw-r--r-- kernel/time/ntp.c 40
-rw-r--r-- kernel/time/ntp_internal.h 2
-rw-r--r-- kernel/time/posix-cpu-timers.c (renamed from kernel/posix-cpu-timers.c) 19
-rw-r--r-- kernel/time/posix-timers.c (renamed from kernel/posix-timers.c) 3
-rw-r--r-- kernel/time/sched_clock.c 236
-rw-r--r-- kernel/time/test_udelay.c 168
-rw-r--r-- kernel/time/tick-broadcast-hrtimer.c 11
-rw-r--r-- kernel/time/tick-broadcast.c 181
-rw-r--r-- kernel/time/tick-common.c 127
-rw-r--r-- kernel/time/tick-internal.h 216
-rw-r--r-- kernel/time/tick-oneshot.c 8
-rw-r--r-- kernel/time/tick-sched.c 122
-rw-r--r-- kernel/time/tick-sched.h 74
-rw-r--r-- kernel/time/time.c (renamed from kernel/time.c) 143
-rw-r--r-- kernel/time/timeconst.bc (renamed from kernel/timeconst.bc) 0
-rw-r--r-- kernel/time/timecounter.c 112
-rw-r--r-- kernel/time/timekeeping.c 1526
-rw-r--r-- kernel/time/timekeeping.h 29
-rw-r--r-- kernel/time/timekeeping_debug.c 2
-rw-r--r-- kernel/time/timekeeping_internal.h 17
-rw-r--r-- kernel/time/timer.c (renamed from kernel/timer.c) 188
-rw-r--r-- kernel/time/timer_list.c 34
-rw-r--r-- kernel/torture.c 34
-rw-r--r-- kernel/trace/Kconfig 41
-rw-r--r-- kernel/trace/Makefile 8
-rw-r--r-- kernel/trace/blktrace.c 151
-rw-r--r-- kernel/trace/bpf_trace.c 222
-rw-r--r-- kernel/trace/ftrace.c 1206
-rw-r--r-- kernel/trace/power-traces.c 1
-rw-r--r-- kernel/trace/ring_buffer.c 275
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c 21
-rw-r--r-- kernel/trace/trace.c 1053
-rw-r--r-- kernel/trace/trace.h 39
-rw-r--r-- kernel/trace/trace_branch.c 46
-rw-r--r-- kernel/trace/trace_entries.h 6
-rw-r--r-- kernel/trace/trace_event_perf.c 16
-rw-r--r-- kernel/trace/trace_events.c 351
-rw-r--r-- kernel/trace/trace_events_filter.c 102
-rw-r--r-- kernel/trace/trace_events_trigger.c 6
-rw-r--r-- kernel/trace/trace_export.c 4
-rw-r--r-- kernel/trace/trace_functions.c 119
-rw-r--r-- kernel/trace/trace_functions_graph.c 483
-rw-r--r-- kernel/trace/trace_irqsoff.c 2
-rw-r--r-- kernel/trace/trace_kdb.c 25
-rw-r--r-- kernel/trace/trace_kprobe.c 77
-rw-r--r-- kernel/trace/trace_mmiotrace.c 52
-rw-r--r-- kernel/trace/trace_nop.c 2
-rw-r--r-- kernel/trace/trace_output.c 772
-rw-r--r-- kernel/trace/trace_output.h 20
-rw-r--r-- kernel/trace/trace_printk.c 6
-rw-r--r-- kernel/trace/trace_probe.c 29
-rw-r--r-- kernel/trace/trace_probe.h 12
-rw-r--r-- kernel/trace/trace_sched_switch.c 146
-rw-r--r-- kernel/trace/trace_sched_wakeup.c 58
-rw-r--r-- kernel/trace/trace_selftest.c 51
-rw-r--r-- kernel/trace/trace_seq.c 377
-rw-r--r-- kernel/trace/trace_stack.c 12
-rw-r--r-- kernel/trace/trace_stat.c 12
-rw-r--r-- kernel/trace/trace_syscalls.c 70
-rw-r--r-- kernel/trace/trace_uprobe.c 50
-rw-r--r-- kernel/tsacct.c 19
-rw-r--r-- kernel/uid16.c 2
-rw-r--r-- kernel/user-return-notifier.c 4
-rw-r--r-- kernel/user.c 6
-rw-r--r-- kernel/user_namespace.c 159
-rw-r--r-- kernel/utsname.c 37
-rw-r--r-- kernel/watchdog.c 346
-rw-r--r-- kernel/workqueue.c 1168
243 files changed, 29098 insertions, 12085 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
def_bool y
depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+config LOCK_SPIN_ON_OWNER
+ def_bool y
+ depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
config ARCH_USE_QUEUE_RWLOCK
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b6246ce9..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,19 +3,20 @@
#
obj-y = fork.o exec_domain.o panic.o \
- cpu.o exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+ cpu.o exit.o softirq.o resource.o \
+ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
- extable.o params.o posix-timers.o \
- kthread.o sys_ni.o posix-cpu-timers.o \
- hrtimer.o nsproxy.o \
+ extable.o params.o \
+ kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o groups.o smpboot.o
+ async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_irq_work.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
# cond_syscall is currently not LTO compatible
@@ -27,6 +28,7 @@ obj-y += power/
obj-y += printk/
obj-y += irq/
obj-y += rcu/
+obj-y += livepatch/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -58,7 +60,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -87,6 +88,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -105,27 +107,11 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
###############################################################################
#
# Roll all the X.509 certificates that we can find together and pull them into
@@ -159,7 +145,7 @@ endif
kernel/system_certificates.o: $(obj)/x509_certificate_list
quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
targets += $(obj)/x509_certificate_list
$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
diff --git a/kernel/acct.c b/kernel/acct.c
index 808a86ff229d..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
@@ -75,172 +76,186 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *);
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock. This primes
- * the cache line to have the data after getting the lock.
- */
struct bsd_acct_struct {
+ struct fs_pin pin;
+ atomic_long_t count;
+ struct rcu_head rcu;
+ struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct list_head list;
+ struct work_struct work;
+ struct completion done;
};
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
+static void do_acct_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- int res;
- int act;
- u64 resume;
- u64 suspend;
-
- spin_lock(&acct_lock);
- res = acct->active;
- if (!file || time_is_before_jiffies(acct->needcheck))
+
+ if (time_is_before_jiffies(acct->needcheck))
goto out;
- spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(&file->f_path, &sbuf))
- return res;
- suspend = sbuf.f_blocks * SUSPEND;
- resume = sbuf.f_blocks * RESUME;
-
- do_div(suspend, 100);
- do_div(resume, 100);
-
- if (sbuf.f_bavail <= suspend)
- act = -1;
- else if (sbuf.f_bavail >= resume)
- act = 1;
- else
- act = 0;
-
- /*
- * If some joker switched acct->file under us we'ld better be
- * silent and _not_ touch anything.
- */
- spin_lock(&acct_lock);
- if (file != acct->file) {
- if (act)
- res = act > 0;
+ if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
- }
if (acct->active) {
- if (act < 0) {
+ u64 suspend = sbuf.f_blocks * SUSPEND;
+ do_div(suspend, 100);
+ if (sbuf.f_bavail <= suspend) {
acct->active = 0;
- printk(KERN_INFO "Process accounting paused\n");
+ pr_info("Process accounting paused\n");
}
} else {
- if (act > 0) {
+ u64 resume = sbuf.f_blocks * RESUME;
+ do_div(resume, 100);
+ if (sbuf.f_bavail >= resume) {
acct->active = 1;
- printk(KERN_INFO "Process accounting resumed\n");
+ pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- res = acct->active;
out:
- spin_unlock(&acct_lock);
- return res;
+ return acct->active;
}
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
- struct pid_namespace *ns)
+static void acct_put(struct bsd_acct_struct *p)
{
- struct file *old_acct = NULL;
- struct pid_namespace *old_ns = NULL;
-
- if (acct->file) {
- old_acct = acct->file;
- old_ns = acct->ns;
- acct->active = 0;
- acct->file = NULL;
- acct->ns = NULL;
- list_del(&acct->list);
+ if (atomic_long_dec_and_test(&p->count))
+ kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+ return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *res;
+again:
+ smp_rmb();
+ rcu_read_lock();
+ res = to_acct(ACCESS_ONCE(ns->bacct));
+ if (!res) {
+ rcu_read_unlock();
+ return NULL;
}
- if (file) {
- acct->file = file;
- acct->ns = ns;
- acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- acct->active = 1;
- list_add(&acct->list, &acct_list);
+ if (!atomic_long_inc_not_zero(&res->count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto again;
}
- if (old_acct) {
- mnt_unpin(old_acct->f_path.mnt);
- spin_unlock(&acct_lock);
- do_acct_process(acct, old_ns, old_acct);
- filp_close(old_acct, NULL);
- spin_lock(&acct_lock);
+ rcu_read_unlock();
+ mutex_lock(&res->lock);
+ if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ mutex_unlock(&res->lock);
+ acct_put(res);
+ goto again;
}
+ return res;
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct = to_acct(pin);
+ mutex_lock(&acct->lock);
+ do_acct_process(acct);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ cmpxchg(&acct->ns->bacct, pin, NULL);
+ mutex_unlock(&acct->lock);
+ pin_remove(pin);
+ acct_put(acct);
+}
+
+static void close_work(struct work_struct *work)
+{
+ struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+ struct file *file = acct->file;
+ if (file->f_op->flush)
+ file->f_op->flush(file, NULL);
+ __fput_sync(file);
+ complete(&acct->done);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
- struct vfsmount *mnt;
- struct pid_namespace *ns;
- struct bsd_acct_struct *acct = NULL;
+ struct vfsmount *mnt, *internal;
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct bsd_acct_struct *acct;
+ struct fs_pin *old;
+ int err;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ kfree(acct);
return PTR_ERR(file);
+ }
if (!S_ISREG(file_inode(file)->i_mode)) {
+ kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
- if (!file->f_op->write) {
+ if (!(file->f_mode & FMODE_CAN_WRITE)) {
+ kfree(acct);
filp_close(file, NULL);
return -EIO;
}
-
- ns = task_active_pid_ns(current);
- if (ns->bacct == NULL) {
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (acct == NULL) {
- filp_close(file, NULL);
- return -ENOMEM;
- }
+ internal = mnt_clone_internal(&file->f_path);
+ if (IS_ERR(internal)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return PTR_ERR(internal);
}
-
- spin_lock(&acct_lock);
- if (ns->bacct == NULL) {
- ns->bacct = acct;
- acct = NULL;
+ err = mnt_want_write(internal);
+ if (err) {
+ mntput(internal);
+ kfree(acct);
+ filp_close(file, NULL);
+ return err;
}
-
mnt = file->f_path.mnt;
- mnt_pin(mnt);
- acct_file_reopen(ns->bacct, file, ns);
- spin_unlock(&acct_lock);
-
- mntput(mnt); /* it's pinned, now give up active reference */
- kfree(acct);
+ file->f_path.mnt = internal;
+
+ atomic_long_set(&acct->count, 1);
+ init_fs_pin(&acct->pin, acct_pin_kill);
+ acct->file = file;
+ acct->needcheck = jiffies;
+ acct->ns = ns;
+ mutex_init(&acct->lock);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
+ mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
+ pin_insert(&acct->pin, mnt);
+ rcu_read_lock();
+ old = xchg(&ns->bacct, &acct->pin);
+ mutex_unlock(&acct->lock);
+ pin_kill(old);
+ mnt_drop_write(mnt);
+ mntput(mnt);
return 0;
}
+static DEFINE_MUTEX(acct_on_mutex);
+
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +276,25 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (name) {
struct filename *tmp = getname(name);
+
if (IS_ERR(tmp))
return PTR_ERR(tmp);
+ mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
+ mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- struct bsd_acct_struct *acct;
-
- acct = task_active_pid_ns(current)->bacct;
- if (acct == NULL)
- return 0;
-
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
+ rcu_read_lock();
+ pin_kill(task_active_pid_ns(current)->bacct);
}
return error;
}
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off. Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt == m) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct = ns->bacct;
-
- if (acct == NULL)
- return;
-
- spin_lock(&acct_lock);
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
-
- kfree(acct);
+ rcu_read_lock();
+ pin_kill(ns->bacct);
}
/*
@@ -376,7 +336,7 @@ static comp_t encode_comp_t(unsigned long value)
return exp;
}
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode an u64 into a comp2_t (24 bits)
*
@@ -389,7 +349,7 @@ static comp_t encode_comp_t(unsigned long value)
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
-#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
@@ -420,7 +380,7 @@ static comp2_t encode_comp2_t(u64 value)
}
#endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -429,8 +389,9 @@ static u32 encode_float(u64 value)
unsigned exp = 190;
unsigned u;
- if (value==0) return 0;
- while ((s64)value > 0){
+ if (value == 0)
+ return 0;
+ while ((s64)value > 0) {
value <<= 1;
exp--;
}
@@ -448,120 +409,116 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
- acct_t ac;
- mm_segment_t fs;
- unsigned long flim;
- u64 elapsed;
- u64 run_time;
- struct timespec uptime;
+ u64 elapsed, run_time;
struct tty_struct *tty;
- const struct cred *orig_cred;
-
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct, file))
- goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset(&ac, 0, sizeof(acct_t));
+ memset(ac, 0, sizeof(acct_t));
- ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+ ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
- do_posix_clock_monotonic_gettime(&uptime);
- run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
- run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
- + current->group_leader->start_time.tv_nsec;
+ run_time = ktime_get_ns();
+ run_time -= current->group_leader->start_time;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
-#if ACCT_VERSION==3
- ac.ac_etime = encode_float(elapsed);
+#if ACCT_VERSION == 3
+ ac->ac_etime = encode_float(elapsed);
#else
- ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
- (unsigned long) elapsed : (unsigned long) -1l);
+ ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
- ac.ac_etime_hi = etime >> 16;
- ac.ac_etime_lo = (u16) etime;
+
+ ac->ac_etime_hi = etime >> 16;
+ ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
- ac.ac_btime = get_seconds() - elapsed;
- /* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_btime = get_seconds() - elapsed;
#if ACCT_VERSION==2
- ac.ac_ahz = AHZ;
-#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
- /* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
-#endif
-#if ACCT_VERSION==3
- ac.ac_pid = task_tgid_nr_ns(current, ns);
- rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
- rcu_read_unlock();
+ ac->ac_ahz = AHZ;
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
- ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
- ac.ac_flag = pacct->ac_flag;
- ac.ac_mem = encode_comp_t(pacct->ac_mem);
- ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
- ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
- ac.ac_exitcode = pacct->ac_exitcode;
+ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_flag = pacct->ac_flag;
+ ac->ac_mem = encode_comp_t(pacct->ac_mem);
+ ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
- ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
- ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_swaps = encode_comp_t(0);
+}
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ acct_t ac;
+ unsigned long flim;
+ const struct cred *orig_cred;
+ struct file *file = acct->file;
/*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
+ * Accounting records are not subject to resource limits.
*/
- if (!file_start_write_trylock(file))
- goto out;
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
+
/*
- * Kernel segment override to datasegment and write it
- * to the accounting file.
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
*/
- fs = get_fs();
- set_fs(KERNEL_DS);
+ if (!check_free_space(acct))
+ goto out;
+
+ fill_ac(&ac);
+ /* we really need to bite the bullet and change layout */
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
+ /* backward-compatible 16 bit fields */
+ ac.ac_uid16 = ac.ac_uid;
+ ac.ac_gid16 = ac.ac_gid;
+#endif
+#if ACCT_VERSION == 3
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
+#endif
/*
- * Accounting records are not subject to resource limits.
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- file->f_op->write(file, (char *)&ac,
- sizeof(acct_t), &file->f_pos);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- set_fs(fs);
- file_end_write(file);
+ if (file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
out:
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
@@ -578,6 +535,7 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
+
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
@@ -609,34 +567,20 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
{
- struct file *file = NULL;
- struct bsd_acct_struct *acct;
-
- acct = ns->bacct;
- /*
- * accelerate the common fastpath:
- */
- if (!acct || !acct->file)
- return;
-
- spin_lock(&acct_lock);
- file = acct->file;
- if (unlikely(!file)) {
- spin_unlock(&acct_lock);
- return;
+ for ( ; ns; ns = ns->parent) {
+ struct bsd_acct_struct *acct = acct_get(ns);
+ if (acct) {
+ do_acct_process(acct);
+ mutex_unlock(&acct->lock);
+ acct_put(acct);
+ }
}
- get_file(file);
- spin_unlock(&acct_lock);
-
- do_acct_process(acct, ns, file);
- fput(file);
}
/**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
*
* handles process accounting for an exiting task
*/
@@ -649,6 +593,10 @@ void acct_process(void)
* alive and holds its namespace, which in turn holds
* its parent.
*/
- for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
- acct_process_in_ns(ns);
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+ if (ns->bacct)
+ break;
+ }
+ if (unlikely(ns))
+ slow_acct_process(ns);
}
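
The acct_get()/acct_put() pair added above is an instance of the standard RCU-plus-refcount lookup: dereference the published pointer under rcu_read_lock(), keep the object only if its count can be raised from nonzero, and retry when racing with the final put. A minimal sketch of that pattern in isolation (struct obj, get_obj and put_obj are illustrative names, not part of the patch):

    #include <linux/atomic.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct obj {
            atomic_long_t count;
            struct rcu_head rcu;
    };

    static struct obj __rcu *global_obj;

    static struct obj *get_obj(void)
    {
            struct obj *p;
    again:
            rcu_read_lock();
            p = rcu_dereference(global_obj);
            if (!p) {
                    rcu_read_unlock();
                    return NULL;
            }
            /* Lost a race with the final put: object is being freed, retry. */
            if (!atomic_long_inc_not_zero(&p->count)) {
                    rcu_read_unlock();
                    cpu_relax();
                    goto again;
            }
            rcu_read_unlock();
            return p;
    }

    static void put_obj(struct obj *p)
    {
            if (atomic_long_dec_and_test(&p->count))
                    kfree_rcu(p, rcu);      /* freed after a grace period */
    }
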
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
+ pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index d5a1220c8620..ab5745ddf962 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -501,7 +501,6 @@ static int kauditd_thread(void *dummy)
set_freezable();
while (!kthread_should_stop()) {
struct sk_buff *skb;
- DECLARE_WAITQUEUE(wait, current);
flush_hold_queue();
@@ -516,16 +515,8 @@ static int kauditd_thread(void *dummy)
audit_printk_skb(skb);
continue;
}
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
}
return 0;
}
@@ -1110,7 +1101,7 @@ static void audit_receive(struct sk_buff *skb)
}
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(int group)
+static int audit_bind(struct net *net, int group)
{
if (!capable(CAP_AUDIT_READ))
return -EPERM;
@@ -1669,7 +1660,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
audit_log_format(ab, " %s=", prefix);
CAP_FOR_EACH_U32(i) {
audit_log_format(ab, "%08x",
- cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ cap->cap[CAP_LAST_U32 - i]);
}
}
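
The kauditd hunk replaces an open-coded prepare-to-wait loop with wait_event_freezable(), which folds the TASK_INTERRUPTIBLE sleep and try_to_freeze() handling into one call and returns -ERESTARTSYS when interrupted by a signal. A hedged sketch of a freezable kthread loop in the same shape as kauditd_thread() (the demo_* names are made up):

    #include <linux/freezer.h>
    #include <linux/kthread.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
    static int demo_pending;

    static int demo_thread(void *unused)
    {
            set_freezable();
            while (!kthread_should_stop()) {
                    /* sleeps interruptibly and cooperates with the freezer;
                     * the return value (0 or -ERESTARTSYS) is ignored here,
                     * as kauditd does
                     */
                    wait_event_freezable(demo_wait,
                                         demo_pending || kthread_should_stop());
                    demo_pending = 0;
                    /* ... drain queued work ... */
            }
            return 0;
    }
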
diff --git a/kernel/audit.h b/kernel/audit.h
index 28067c50fd04..d641f9bb3ed0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
-/* 0 = no checking
- 1 = put_count checking
- 2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
};
};
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
@@ -86,7 +79,6 @@ struct audit_names {
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
- bool name_put; /* call __putname()? */
unsigned long ino;
dev_t dev;
@@ -208,11 +200,6 @@ struct audit_context {
};
int fds[2];
struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
};
extern u32 audit_ever_enabled;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 415072c8e875..71fd1f289885 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -175,9 +175,9 @@ static void insert_hash(struct audit_chunk *chunk)
struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
- if (!entry->i.inode)
+ if (!entry->inode)
return;
- list = chunk_hash(entry->i.inode);
+ list = chunk_hash(entry->inode);
list_add_rcu(&chunk->hash, list);
}
@@ -189,7 +189,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
/* mark.inode may have gone NULL, but who cares? */
- if (p->mark.i.inode == inode) {
+ if (p->mark.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -232,7 +232,7 @@ static void untag_chunk(struct node *p)
new = alloc_chunk(size);
spin_lock(&entry->lock);
- if (chunk->dead || !entry->i.inode) {
+ if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
if (new)
free_chunk(new);
@@ -259,7 +259,7 @@ static void untag_chunk(struct node *p)
goto Fallback;
fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -387,7 +387,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
spin_lock(&old_entry->lock);
- if (!old_entry->i.inode) {
+ if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
fsnotify_put_mark(old_entry);
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
}
fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -612,7 +612,7 @@ void audit_trim_trees(void)
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.i.inode;
+ struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 81c94d739e3f..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -121,7 +121,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
if (unlikely(!entry))
return NULL;
- fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+ fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
if (unlikely(!fields)) {
kfree(entry);
return NULL;
@@ -175,7 +175,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
{
- __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
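
Both auditfilter.c hunks make the same hardening change: kzalloc(count * size, ...) can wrap on the multiplication and under-allocate, while kcalloc(count, size, ...) performs the identical zeroed allocation but returns NULL on overflow. A one-function illustration (alloc_table is a made-up helper):

    #include <linux/slab.h>

    static void *alloc_table(size_t count, size_t size)
    {
            /* kcalloc() rejects count * size overflow instead of
             * silently handing back a too-small buffer
             */
            return kcalloc(count, size, GFP_KERNEL);
    }
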
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b89f7f95d84..84c74d08c62b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
-#if AUDIT_DEBUG == 2
- if (context->put_count + context->ino_count != context->name_count) {
- int i = 0;
-
- pr_err("%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d ino_count=%d"
- " [NOT freeing]\n", __FILE__, __LINE__,
- context->serial, context->major, context->in_syscall,
- context->name_count, context->put_count,
- context->ino_count);
- list_for_each_entry(n, &context->names_list, list) {
- pr_err("names[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
- dump_stack();
- return;
- }
-#endif
-#if AUDIT_DEBUG
- context->put_count = 0;
- context->ino_count = 0;
-#endif
-
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
- if (n->name && n->name_put)
- final_putname(n->name);
+ if (n->name)
+ putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
return aname;
}
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr)
+ if (n->name->uptr == uptr) {
+ n->name->refcnt++;
return n->name;
+ }
}
return NULL;
}
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
struct audit_context *context = current->audit_context;
struct audit_names *n;
- if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): ignoring getname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- dump_stack();
-#endif
+ if (!context->in_syscall)
return;
- }
-
-#if AUDIT_DEBUG
- /* The filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
- n->name_put = true;
name->aname = n;
+ name->refcnt++;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
- struct audit_context *context = current->audit_context;
-
- BUG_ON(!context);
- if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): final_putname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- if (context->name_count) {
- struct audit_names *n;
- int i = 0;
-
- list_for_each_entry(n, &context->names_list, list)
- pr_err("name[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
-#endif
- final_putname(name);
- }
-#if AUDIT_DEBUG
- else {
- ++context->put_count;
- if (context->put_count > context->name_count) {
- pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
- " name_count=%d put_count=%d\n",
- __FILE__, __LINE__,
- context->serial, context->major,
- context->in_syscall, name->name,
- context->name_count, context->put_count);
- dump_stack();
- }
- }
-#endif
-}
-
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
if (!name)
goto out_alloc;
-#if AUDIT_DEBUG
- /* The struct filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
/*
* If we have a pointer to an audit_names entry already, then we can
* just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (!n->name || strcmp(n->name->name, name->name))
+ if (n->ino) {
+ /* valid inode number, use that for the comparison */
+ if (n->ino != inode->i_ino ||
+ n->dev != inode->i_sb->s_dev)
+ continue;
+ } else if (n->name) {
+ /* inode number has not been set, check the name */
+ if (strcmp(n->name->name, name->name))
+ continue;
+ } else
+ /* no inode and no name (?!) ... this is odd ... */
continue;
/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
- /* unfortunately, while we may have a path name to record with the
- * inode, we can't always rely on the string lasting until the end of
- * the syscall so we need to create our own copy, it may fail due to
- * memory allocation issues, but we do our best */
if (name) {
- /* we can't use getname_kernel() due to size limits */
- size_t len = strlen(name->name) + 1;
- struct filename *new = __getname();
-
- if (unlikely(!new))
- goto out;
-
- if (len <= (PATH_MAX - sizeof(*new))) {
- new->name = (char *)(new) + sizeof(*new);
- new->separate = false;
- } else if (len <= PATH_MAX) {
- /* this looks odd, but is due to final_putname() */
- struct filename *new2;
-
- new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
- if (unlikely(!new2)) {
- __putname(new);
- goto out;
- }
- new2->name = (char *)new;
- new2->separate = true;
- new = new2;
- } else {
- /* we should never get here, but let's be safe */
- __putname(new);
- goto out;
- }
- strlcpy((char *)new->name, name->name, len);
- new->uptr = NULL;
- new->aname = n;
- n->name = new;
- n->name_put = true;
+ n->name = name;
+ name->refcnt++;
}
+
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1934,6 +1829,11 @@ out:
audit_copy_inode(n, dentry, inode);
}
+void __audit_file(const struct file *file)
+{
+ __audit_inode(NULL, file->f_path.dentry, 0);
+}
+
/**
* __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
@@ -1965,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
/* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name || n->type != AUDIT_TYPE_PARENT)
+ if (!n->name ||
+ (n->type != AUDIT_TYPE_PARENT &&
+ n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname,
+ n->name->name, n->name_len)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
break;
}
@@ -1978,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
- if (!n->name || n->type != type)
- continue;
-
- /* if we found a parent, make sure this one is a child of it */
- if (found_parent && (n->name != found_parent->name))
+ if (!n->name ||
+ (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (!strcmp(dname, n->name->name) ||
@@ -1990,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
found_parent ?
found_parent->name_len :
AUDIT_NAME_FULL)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = type;
found_child = n;
break;
}
@@ -2014,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- /* don't call __putname() */
- found_child->name_put = false;
+ found_child->name->refcnt++;
}
}
+
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
@@ -2400,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
struct audit_aux_data_bprm_fcaps *ax;
struct audit_context *context = current->audit_context;
struct cpu_vfs_cap_data vcaps;
- struct dentry *dentry;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
@@ -2410,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_dentry);
- get_vfs_caps_from_disk(dentry, &vcaps);
- dput(dentry);
+ get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
@@ -2434,7 +2335,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
* @new: the new credentials
* @old: the old (current) credentials
*
- * Record the aguments userspace sent to sys_capset for later printing by the
+ * Record the arguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
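
The thread running through these auditsc.c hunks is ownership by reference count: every audit_names record that keeps a struct filename now takes its own reference (name->refcnt++), and audit_free_names() drops them all through putname() at syscall exit, replacing the old name_put flag and the AUDIT_DEBUG bookkeeping. A toy model of that ownership rule (struct name_ref and its helpers are invented for illustration, not the real fs/namei.c API):

    #include <linux/slab.h>

    struct name_ref {
            const char *name;
            int refcnt;             /* one count per stored pointer */
    };

    static struct name_ref *name_get(struct name_ref *n)
    {
            n->refcnt++;            /* an audit record now also owns it */
            return n;
    }

    static void name_put(struct name_ref *n)
    {
            if (--n->refcnt == 0)
                    kfree(n);       /* freed when the last owner lets go */
    }
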
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
@@ -18,7 +17,6 @@ void foo(void)
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
- DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..e6983be12bd3
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1,2 @@
+obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..8a6616583f38
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size, array_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static const struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list array_type __read_mostly = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&array_type);
+ return 0;
+}
+late_initcall(register_array_map);
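
array_map_get_next_key() above defines the iteration contract for array maps: an out-of-range key restarts the walk at index 0, each step yields index + 1, and the last slot returns -ENOENT. A hedged kernel-side sketch of a full walk through the ops table (walk_array_map is illustrative; real callers go through the bpf syscall layer):

    #include <linux/bpf.h>

    static void walk_array_map(struct bpf_map *map)
    {
            u32 key = map->max_entries;     /* out of range: restart at 0 */
            u32 next;

            while (map->ops->map_get_next_key(map, &key, &next) == 0) {
                    /* every slot is pre-allocated, so lookup cannot fail */
                    void *value = map->ops->map_lookup_elem(map, &next);

                    (void)value;    /* 8-byte aligned, zero-initialized slot */
                    key = next;
            }
            /* the loop ends with -ENOENT after index max_entries - 1 */
    }
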
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..4139a0f8b558
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,674 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+
+#include <linux/filter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
+#include <asm/unaligned.h>
+#include <linux/bpf.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
+{
+ u8 *ptr = NULL;
+
+ if (k >= SKF_NET_OFF)
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+ else if (k >= SKF_LL_OFF)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+ return ptr;
+
+ return NULL;
+}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *fp;
+
+ size = round_up(size, PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp == NULL)
+ return NULL;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
+ vfree(fp);
+ return NULL;
+ }
+
+ fp->pages = size / PAGE_SIZE;
+ fp->aux = aux;
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ BUG_ON(fp_old == NULL);
+
+ size = round_up(size, PAGE_SIZE);
+ if (size <= fp_old->pages * PAGE_SIZE)
+ return fp_old;
+
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+ fp->pages = size / PAGE_SIZE;
+
+ /* We keep fp->aux from fp_old around in the new
+ * reallocated structure.
+ */
+ fp_old->aux = NULL;
+ __bpf_prog_free(fp_old);
+ }
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+ kfree(fp->aux);
+ vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+#ifdef CONFIG_BPF_JIT
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *hdr;
+ unsigned int size, hole, start;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ hdr = module_alloc(size);
+ if (hdr == NULL)
+ return NULL;
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->pages = size / PAGE_SIZE;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (prandom_u32() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of filler bytes before the BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+ module_memfree(hdr);
+}
+#endif /* CONFIG_BPF_JIT */
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+/**
+ * __bpf_prog_run - run eBPF program on a given context
+ * @ctx: the data we are operating on
+ * @insn: the array of eBPF instructions
+ *
+ * Decode and execute eBPF instructions.
+ */
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
+ };
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
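+
+ /* Dispatch sketch (illustrative): the jumptable relies on GCC's
+ * computed-goto extension. For an insn with code
+ * BPF_ALU64 | BPF_ADD | BPF_X (0x0f), the goto above lands on the
+ * ALU64_ADD_X label, which executes DST = DST + SRC and falls into
+ * CONT, advancing insn and jumping back to select_insn.
+ */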
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ LD_IMM_DW:
+ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+ insn++;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
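+
+ /* Call sketch (illustrative): fixup_bpf_calls() in this patch's
+ * syscall.c rewrites insn->imm from a BPF_FUNC_* id into the delta
+ * (helper - __bpf_call_base), e.g.:
+ *
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ *
+ * so the addition above reconstructs the helper's real address.
+ */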
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX */
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns appear only in
+ * programs where ctx == skb. All such programs keep 'ctx' in
+ * regs[BPF_REG_CTX] == BPF_R6: bpf_convert_filter() saves it
+ * there, and the internal BPF verifier checks that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
+/**
+ * bpf_prog_select_runtime - select execution runtime for BPF program
+ * @fp: bpf_prog populated with internal BPF program
+ *
+ * Try to JIT the internal BPF program; if no JIT is available, select the
+ * interpreter. The BPF program will be executed via the BPF_PROG_RUN() macro.
+ */
+void bpf_prog_select_runtime(struct bpf_prog *fp)
+{
+ fp->bpf_func = (void *) __bpf_prog_run;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+ /* Lock whole bpf_prog as read-only */
+ bpf_prog_lock_ro(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
+}
+
+/* Free internal BPF program */
+void bpf_prog_free(struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ aux->prog = fp;
+ schedule_work(&aux->work);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* Weak definitions of helper functions in case we don't have bpf syscall. */
+const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
+const struct bpf_func_proto bpf_map_update_elem_proto __weak;
+const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
+const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+ int len)
+{
+ return -EFAULT;
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..83c209d9b17a
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
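+
+/* Layout sketch (illustrative, assuming key_size = 4 and value_size = 8):
+ * the hash_node/rcu/hash header is followed by 4 key bytes rounded up to
+ * 8 for alignment, then the 8 value bytes, matching the elem_size
+ * computation in htab_map_alloc():
+ *
+ * sizeof(struct htab_elem) + round_up(4, 8) + 8
+ */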
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
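+
+/* Since n_buckets is a power of two, the masking above is equivalent to
+ * hash % n_buckets without a division; e.g. (illustrative) with
+ * n_buckets = 8, a hash of 13 (0b1101) selects bucket 5 (0b101).
+ */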
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock() held. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if there is a next elem in this hash list, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* iterated over all buckets and all elements */
+ return -ENOENT;
+}
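+
+/* Iteration sketch (illustrative; lookup_elem/get_next_key stand for the
+ * raw BPF_MAP_LOOKUP_ELEM/BPF_MAP_GET_NEXT_KEY commands handled in
+ * syscall.c):
+ *
+ * err = get_next_key(map_fd, &key, &next); // absent key -> first elem
+ * while (!err) {
+ * lookup_elem(map_fd, &next, &value);
+ * key = next;
+ * err = get_next_key(map_fd, &key, &next);
+ * }
+ * // err == -ENOENT once all buckets have been visited
+ */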
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
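+
+/* Flag semantics implemented above (values from uapi bpf.h):
+ * BPF_ANY (0)     - create the element or replace an existing one
+ * BPF_NOEXIST (1) - create only; -EEXIST if the key is present
+ * BPF_EXIST (2)   - replace only; -ENOENT if the key is absent
+ * Anything greater than BPF_EXIST is rejected with -EINVAL.
+ */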
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (there can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of the kfree_rcu() callbacks for elements of this map may not
+ * have executed yet. That's ok; proceed to free residual elements and
+ * the map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static const struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_type __read_mostly = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&htab_type);
+ return 0;
+}
+late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..bd7f5988ed9c
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,113 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+
+/* If a kernel subsystem allows eBPF programs to call this function, it
+ * should return bpf_map_lookup_elem_proto from its own
+ * verifier_ops->get_func_proto() callback, so that the verifier can
+ * properly check the arguments.
+ *
+ * Different map implementations rely on rcu in the map methods
+ * lookup/update/delete, therefore eBPF programs must run under the rcu
+ * lock if a program is allowed to access maps, so check
+ * rcu_read_lock_held() in all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* the verifier checked that R1 contains a valid pointer to a bpf_map
+ * and that R2 points to the program's stack, where map->key_size
+ * bytes were initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+const struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+ .func = bpf_get_prandom_u32,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+ .func = bpf_get_smp_processor_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..3bae6c591914
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,621 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
+#include <linux/version.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (underlying map implementation's ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields of 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
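+
+/* Example (illustrative): for BPF_MAP_CREATE, whose last field is
+ * max_entries, the macro makes memchr_inv() scan every byte of
+ * 'union bpf_attr' past max_entries and fail the command if any of
+ * them is non-zero, keeping those bytes reserved for future fields.
+ */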
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
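+
+/* User-space sketch (illustrative; assumes the __NR_bpf syscall number
+ * is wired up, field names are from 'union bpf_attr'):
+ *
+ * union bpf_attr attr = {
+ * .map_type    = BPF_MAP_TYPE_HASH,
+ * .key_size    = 4,
+ * .value_size  = 8,
+ * .max_entries = 1024,
+ * };
+ * int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+ */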
+
+/* if an error is returned, the fd is released.
+ * On success the caller should complete fd access with a matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value, *ptr;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, map->value_size);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!ptr)
+ goto free_value;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto free_value;
+
+ err = 0;
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF programs that use maps run under rcu_read_lock(),
+ * and all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->type = type;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when the program has bpf_call instructions
+ * and it passed bpf_check(), which means that
+ * ops->get_func_proto must have been supplied, so check it
+ */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have a prototype and that the verifier
+ * allowed programs to call must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
+/* drop refcnt on maps used by eBPF program and free auxiliary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ free_used_maps(prog->aux);
+ bpf_prog_free(prog);
+ }
+}
+EXPORT_SYMBOL_GPL(bpf_prog_put);
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->aux->refcnt);
+ fdput(f);
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (type == BPF_PROG_TYPE_KPROBE &&
+ attr->kern_version != LINUX_VERSION_CODE)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ err = bpf_check(&prog, attr);
+ if (err < 0)
+ goto free_used_maps;
+
+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted once the security audit is clean. Note that eBPF+tracing must
+ * keep this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, uattr, 1))
+ return -EFAULT;
+
+ if (size > PAGE_SIZE) /* silly large */
+ return -E2BIG;
+
+ /* If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we don't know about yet.
+ */
+ if (size > sizeof(attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+ size = sizeof(attr);
+ }
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..47dcd3aa6e23
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,2146 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is a depth-first search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - a loop is present (detected via a back-edge)
+ * - unreachable insns exist (shouldn't be a forest; program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass descends all possible paths from the 1st insn.
+ * Since it's analyzing all paths through the program, the length of the
+ * analysis is limited to 32k insns, which may be hit even if the total
+ * number of insns is less than 4K but there are too many branches that
+ * change stack/regs.
+ * The number of 'branches to be analyzed' is limited to 1k.
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (e.g. pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'.
+ * The function expects the 1st argument to be a const pointer to 'struct bpf_map'
+ * and the 2nd argument to be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns either a pointer to the map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* register spilled into stack */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifier state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
+ struct verifier_state_list **explored_states; /* search pruning optimization */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ }
+ verbose("\n");
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
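+
+/* Sample output (illustrative, derived from the format strings above):
+ * BPF_MOV64_IMM(BPF_REG_0, 1) has code 0xb7 (BPF_ALU64 | BPF_MOV | BPF_K)
+ * and prints as:
+ *
+ * (b7) r0 = 1
+ */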
+
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
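+
+/* Exploration sketch (illustrative): on a conditional jump the verifier
+ * pushes the state of the branch it is not taking onto this stack and
+ * keeps walking the other; when the current path reaches bpf_exit,
+ * pop_stack() resumes from the most recently saved branch until the
+ * stack is empty.
+ */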
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so the access is aligned and [off, off + size) is within stack limits
+ */
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ /* save register state */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ state->regs[value_regno];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ } else {
+ /* regular write of data into stack */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ u8 *slot_type;
+ int i;
+
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (slot_type[i] != STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
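+/* Example (editorial sketch, not part of this patch): spill and fill
+ * of a pointer. Assuming r1 is PTR_TO_CTX:
+ *
+ *   *(u64 *)(r10 - 8) = r1    // spill: 8 slots marked STACK_SPILL,
+ *                             // spilled_regs[] remembers PTR_TO_CTX
+ *   r1 = *(u64 *)(r10 - 8)    // fill: r1 is PTR_TO_CTX again
+ *
+ * A narrower read such as r1 = *(u32 *)(r10 - 8) would be rejected
+ * with "invalid size of register spill", since size != BPF_REG_SIZE.
+ */
+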
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register whose value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size, err = 0;
+
+ size = bpf_size_to_bytes(bpf_size);
+ if (size < 0)
+ return size;
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ err = check_map_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ err = check_ctx_access(env, off, size, t);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ err = check_stack_write(state, off, size, value_regno);
+ else
+ err = check_stack_read(state, off, size, value_regno);
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return err;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can read the memory */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can write into the same memory */
+ return check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
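+/* Example (editorial sketch, not part of this patch): a helper argument
+ * prepared on the stack. For a map with key_size == 4:
+ *
+ *   *(u32 *)(r10 - 4) = 0     // initialize the key: 4 bytes STACK_MISC
+ *   r2 = r10
+ *   r2 += -4                  // r2 becomes PTR_TO_STACK with imm = -4
+ *
+ * check_stack_boundary(env, BPF_REG_2, 4) then confirms that [-4, 0)
+ * lies within the stack and that every byte was written beforehand.
+ */
+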
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+ int err = 0;
+
+ if (arg_type == ARG_DONTCARE)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else if (arg_type == ARG_PTR_TO_CTX) {
+ expected_type = PTR_TO_CTX;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno - 1, reg->imm);
+ }
+
+ return err;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i, err;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ if (err)
+ return err;
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
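+/* Example (editorial sketch, not part of this patch): verifying a map
+ * lookup. bpf_map_lookup_elem() is declared roughly as
+ * (ARG_CONST_MAP_PTR, ARG_PTR_TO_MAP_KEY) returning
+ * RET_PTR_TO_MAP_VALUE_OR_NULL, so after
+ *
+ *   r1 = map_ptr              // BPF_LD_IMM64, see check_ld_imm() below
+ *   r2 = r10
+ *   r2 += -4                  // pointer to a 4-byte key on the stack
+ *   call bpf_map_lookup_elem
+ *
+ * r0 has type PTR_TO_MAP_VALUE_OR_NULL and r1-r5 are NOT_INIT. The
+ * program must compare r0 against 0 before dereferencing it;
+ * check_cond_jmp_op() below implements that type transition.
+ */
+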
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
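+/* Example (editorial sketch, not part of this patch): the
+ * 'stack_relative' pattern above turns
+ *
+ *   r2 = r10                  // MOV64 copies FRAME_PTR into r2
+ *   r2 += -8                  // ALU64 BPF_ADD BPF_K on a FRAME_PTR
+ *
+ * into r2 = PTR_TO_STACK with imm = -8. Any other ALU result is
+ * conservatively marked UNKNOWN_VALUE via check_reg_arg(DST_OP).
+ */
+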
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch target cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
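+/* Example (editorial sketch, not part of this patch): NULL check after
+ * a map lookup:
+ *
+ *   call bpf_map_lookup_elem  // r0 = PTR_TO_MAP_VALUE_OR_NULL
+ *   if r0 == 0 goto pc+N      // branch state: r0 = CONST_IMM 0
+ *   r1 = *(u32 *)(r0 + 0)     // fall-through: r0 = PTR_TO_MAP_VALUE
+ *
+ * Without the conditional jump, check_mem_access() would reject the
+ * load, since PTR_TO_MAP_VALUE_OR_NULL is not a readable type.
+ */
+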
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
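+/* Example (editorial sketch, not part of this patch): BPF_LD_IMM64
+ * occupies two instructions. Loading 0x1122334455667788 into r1:
+ *
+ *   insn[0]: code = BPF_LD | BPF_DW | BPF_IMM, dst_reg = 1,
+ *            imm = 0x55667788 (low 32 bits)
+ *   insn[1]: code = 0, imm = 0x11223344 (high 32 bits)
+ *
+ * With src_reg == BPF_PSEUDO_MAP_FD the same pair carries a map
+ * pointer instead, patched in by replace_map_fd_with_map_ptr() below.
+ */
+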
+static bool may_access_skb(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (!may_access_skb(env->prog->type)) {
+ verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
+
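+/* Example (editorial sketch, not part of this patch): classic packet
+ * access in a socket filter:
+ *
+ *   r0 = *(u16 *)skb[12]      // BPF_LD | BPF_H | BPF_ABS, imm = 12
+ *
+ * loads the Ethernet protocol field. R6 must still hold PTR_TO_CTX at
+ * this point, and r1-r5 are clobbered exactly as by a helper call.
+ */
+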
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
+static int *insn_stack; /* stack of insns to process */
+static int cur_stack; /* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return 0;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return 0;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose("jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH)
+ /* mark branch target for state pruning */
+ env->explored_states[w] = STATE_LIST_MARK;
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[cur_stack++] = w;
+ return 1;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose("back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verbose("insn state internal bug\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret = 0;
+ int i, t;
+
+ insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_stack) {
+ kfree(insn_state);
+ return -ENOMEM;
+ }
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ cur_stack = 1;
+
+peek_stack:
+ if (cur_stack == 0)
+ goto check_state;
+ t = insn_stack[cur_stack - 1];
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + insns[t].off + 1,
+ FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
+ } else {
+ /* conditional jump with two edges */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+
+mark_explored:
+ insn_state[t] = EXPLORED;
+ if (cur_stack-- <= 0) {
+ verbose("pop stack internal bug\n");
+ ret = -EFAULT;
+ goto err_free;
+ }
+ goto peek_stack;
+
+check_state:
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn_state[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ }
+ ret = 0; /* cfg looks good */
+
+err_free:
+ kfree(insn_state);
+ kfree(insn_stack);
+ return ret;
+}
+
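+/* Example (editorial sketch, not part of this patch): check_cfg()
+ * rejects loops. For the program
+ *
+ *   0: r0 = 0
+ *   1: r0 += 1
+ *   2: if r0 != 10 goto pc-2  // back to insn 1
+ *   3: exit
+ *
+ * push_insn(2, 1, BRANCH, env) finds insn 1 still DISCOVERED on the
+ * DFS stack and fails with "back-edge from insn 2 to 1".
+ */
+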
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words, if the current stack state (the one being explored) has
+ * more valid slots than an old one that already passed validation, the
+ * verifier can stop exploring and conclude that the current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (memcmp(&old->regs[i], &cur->regs[i],
+ sizeof(old->regs[0])) != 0) {
+ if (old->regs[i].type == NOT_INIT ||
+ (old->regs[i].type == UNKNOWN_VALUE &&
+ cur->regs[i].type != NOT_INIT))
+ continue;
+ return false;
+ }
+ }
+
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has STACK_MISC ->
+ * these verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
+ }
+ return true;
+}
+
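+/* Example (editorial sketch, not part of this patch): why "more
+ * conservative" prunes. If an explored state reached bpf_exit with
+ * r2 = UNKNOWN_VALUE and the current path arrives at the same insn
+ * with r2 = CONST_IMM 5, every access that was legal for the unknown
+ * value is also legal for the known one, so states_equal() returns
+ * true and the search stops here.
+ */
+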
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *new_sl;
+ struct verifier_state_list *sl;
+
+ sl = env->explored_states[insn_idx];
+ if (!sl)
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+ return 0;
+
+ while (sl != STATE_LIST_MARK) {
+ if (states_equal(&sl->state, &env->cur_state))
+ /* reached equivalent register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+
+ /* there were no equivalent states, remember current one.
+ * technically the current state is not proven to be safe yet,
+ * but it will either reach bpf_exit (which means it's safe) or
+ * it will be rejected. Since there are no loops, we won't be
+ * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ */
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ if (!new_sl)
+ return -ENOMEM;
+
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->explored_states[insn_idx];
+ env->explored_states[insn_idx] = new_sl;
+ return 0;
+}
+
+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+ int err;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ err = is_state_visited(env, insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (log_level) {
+ if (do_print_state)
+ verbose("\nfrom %d to %d: safe\n",
+ prev_insn_idx, insn_idx);
+ else
+ verbose("%d: safe\n", insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(regs, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ enum bpf_reg_type src_reg_type;
+
+ /* the check for reserved fields was already done */
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ err = check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg);
+ if (err)
+ return err;
+
+ if (BPF_SIZE(insn->code) != BPF_W) {
+ insn_idx++;
+ continue;
+ }
+
+ if (insn->imm == 0) {
+ /* saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * use reserved 'imm' field to mark this insn
+ */
+ insn->imm = src_reg_type;
+
+ } else if (src_reg_type != insn->imm &&
+ (src_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ err = check_xadd(env, insn);
+ if (err)
+ return err;
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1);
+ if (err)
+ return err;
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_call(env, insn->imm);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convention is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+process_bpf_exit:
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ err = check_cond_jmp_op(env, insn, &insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
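+/* Example (editorial sketch, not part of this patch): the smallest
+ * program do_check() accepts is
+ *
+ *   BPF_MOV64_IMM(BPF_REG_0, 0),
+ *   BPF_EXIT_INSN(),
+ *
+ * R0 is written before BPF_EXIT, so the final
+ * check_reg_arg(regs, BPF_REG_0, SRC_OP) succeeds and the walk
+ * terminates with ret == 0.
+ */
+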
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0)) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
+static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ int insn_cnt = prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) != BPF_JMP ||
+ BPF_OP(insn->code) == BPF_CALL ||
+ BPF_OP(insn->code) == BPF_EXIT)
+ continue;
+
+ /* adjust offset of jmps if necessary */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos && i + insn->off + 1 < pos)
+ insn->off -= delta;
+ }
+}
+
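+/* Example (editorial sketch, not part of this patch): inserting one
+ * insn (delta == 1) at pos == 5. A jump at insn 2 with off == +4
+ * (target insn 7) crosses the insertion point (2 < 5 and
+ * 2 + 4 + 1 > 5), so its off becomes +5 and it still lands on the
+ * same, now shifted, instruction. Jumps entirely before or after pos
+ * are left untouched.
+ */
+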
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ u32 cnt;
+ int i;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ continue;
+
+ if (insn->imm != PTR_TO_CTX) {
+ /* clear internal mark */
+ insn->imm = 0;
+ continue;
+ }
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (cnt == 1) {
+ memcpy(insn, insn_buf, sizeof(*insn));
+ continue;
+ }
+
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;
+
+ new_prog->len = insn_cnt;
+
+ memmove(new_prog->insnsi + i + cnt, new_prog->insnsi + i + 1,
+ sizeof(*insn) * (insn_cnt - i - cnt));
+
+ /* copy substitute insns in place of load instruction */
+ memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+ /* adjust branches in the whole program */
+ adjust_branches(new_prog, i, cnt - 1);
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + cnt - 1;
+ i += cnt - 1;
+ }
+
+ return 0;
+}
+
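+/* Example (editorial sketch, not part of this patch): for socket
+ * filters, convert_ctx_access() rewrites a verified load such as
+ *
+ *   r0 = *(u32 *)(r1 + offsetof(struct __sk_buff, len))
+ *
+ * into an equivalent load from the real 'struct sk_buff' layout.
+ * Loads that were verified against other pointer types
+ * (insn->imm != PTR_TO_CTX) merely have the temporary mark cleared.
+ */
+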
+static void free_states(struct verifier_env *env)
+{
+ struct verifier_state_list *sl, *sln;
+ int i;
+
+ if (!env->explored_states)
+ return;
+
+ for (i = 0; i < env->prog->len; i++) {
+ sl = env->explored_states[i];
+
+ if (sl)
+ while (sl != STATE_LIST_MARK) {
+ sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->explored_states);
+}
+
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
+ int ret = -EINVAL;
+
+ if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->prog = *prog;
+
+ /* grab the mutex to protect a few globals used by the verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct verifier_state_list *),
+ GFP_USER);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ if (ret == 0)
+ /* program is valid, convert *(u32*)(ctx + off) accesses */
+ ret = convert_ctx_accesses(env);
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!env->prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(env->prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ env->prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ if (!env->prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
+ *prog = env->prog;
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
+ return ret;
+}
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str)
}
__setup("no_file_caps", file_caps_disable);
+#ifdef CONFIG_MULTIUSER
/*
* More recent versions of libcap are available from:
*
@@ -258,6 +259,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
i++;
}
+ effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -382,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap)
}
EXPORT_SYMBOL(ns_capable);
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
@@ -408,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
- return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
-/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
*/
static bool cgrp_dfl_root_visible;
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
/* some controllers are not supported in the default hierarchy */
-static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
-#ifdef CONFIG_CGROUP_DEBUG
- | (1 << debug_cgrp_id)
-#endif
- ;
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -180,18 +182,18 @@ static u64 css_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -275,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!(cgrp->root->subsys_mask & (1 << ss->id)))
return NULL;
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Use ->child_subsys_mask.
+ */
while (cgroup_parent(cgrp) &&
!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
cgrp = cgroup_parent(cgrp);
@@ -282,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -327,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
return false;
}
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
- const int bits =
- (1 << CGRP_RELEASABLE) |
- (1 << CGRP_NOTIFY_ON_RELEASE);
- return (cgrp->flags & bits) == bits;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -390,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
@@ -494,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -520,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
- if (notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
}
kfree(link);
@@ -533,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
@@ -544,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
return;
down_write(&css_set_rwsem);
- put_css_set_locked(cset, taskexit);
+ put_css_set_locked(cset);
up_write(&css_set_rwsem);
}
@@ -965,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
@@ -1031,12 +1045,81 @@ static void cgroup_get(struct cgroup *cgrp)
css_get(&cgrp->self);
}
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
static void cgroup_put(struct cgroup *cgrp)
{
css_put(&cgrp->self);
}
/**
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
+ * @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
+ */
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned int subtree_control)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp))
+ return cur_ss_mask;
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ cgrp->child_subsys_mask =
+ cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
+}
+
+/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1091,7 +1174,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
*/
- cgroup_get(cgrp);
+ if (!cgroup_tryget(cgrp))
+ return NULL;
kernfs_break_active_protection(kn);
mutex_lock(&cgroup_mutex);
@@ -1208,12 +1292,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
up_write(&css_set_rwsem);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root)
- dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
if (ss->bind)
ss->bind(css);
@@ -1233,8 +1320,6 @@ static int cgroup_show_options(struct seq_file *seq,
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1353,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
bool all_ss = false, one_ss = false;
unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
@@ -1277,6 +1363,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
@@ -1361,37 +1449,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /* Consistency checks */
-
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
- opts->cpuset_clone_children || opts->release_agent ||
- opts->name) {
- pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
- } else {
- /*
- * If the 'all' option was specified select all the
- * subsystems, otherwise if 'none', 'name=' and a subsystem
- * name options were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So
- * all empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ return 0;
}
/*
+ * If the 'all' option was specified, select all the subsystems;
+ * otherwise, if none of 'none', 'name=' or a subsystem name was
+ * specified, default to 'all'.
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
@@ -1399,7 +1483,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
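
The defaulting rule that the hunk above hoists out of the else branch is compact enough to state as a standalone predicate; a minimal sketch with invented names:

#include <stdbool.h>
#include <stdio.h>

/* "all", or the absence of "none", "name=" and any subsystem token,
 * selects every subsystem that isn't disabled */
static unsigned int default_subsys_mask(bool all_ss, bool one_ss, bool none,
					bool has_name, unsigned int enabled)
{
	if (all_ss || (!one_ss && !none && !has_name))
		return enabled;
	return 0;
}

int main(void)
{
	/* a bare legacy mount names nothing -> everything comes up */
	printf("0x%x\n", default_subsys_mask(false, false, false, false, 0x3f));
	return 0;
}
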
@@ -1414,8 +1497,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
@@ -1434,11 +1517,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1527,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1537,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1563,6 +1645,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
struct css_set *cset;
int i, ret;
@@ -1573,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
goto out;
root_cgrp->id = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
if (ret)
goto out;
@@ -1600,7 +1684,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
if (ret)
goto destroy_root;
@@ -1638,7 +1727,7 @@ destroy_root:
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
- percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+ percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
@@ -1672,7 +1761,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
/* look for a matching existing root */
- if (!opts.subsys_mask && !opts.none && !opts.name) {
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
@@ -1730,15 +1819,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto out_unlock;
- } else {
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
* We want to reuse @root whose lifetime is governed by its
@@ -1827,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
*
* And don't kill the default root.
*/
- if (css_has_online_children(&root->cgrp.self) ||
+ if (!list_empty(&root->cgrp.self.children) ||
root == &cgrp_dfl_root)
cgroup_put(&root->cgrp);
else
@@ -1993,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
- set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set_locked(old_cset, false);
+ put_css_set_locked(old_cset);
}
/**
@@ -2015,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset, false);
+ put_css_set_locked(cset);
}
up_write(&css_set_rwsem);
}
@@ -2109,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset, false);
- put_css_set(dst_cset, false);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
continue;
}
@@ -2119,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
- put_css_set(dst_cset, false);
+ put_css_set(dst_cset);
}
list_splice_tail(&csets, preloaded_csets);
@@ -2457,9 +2538,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_puts(seq, "0\n");
return 0;
}
@@ -2496,7 +2575,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
return 0;
}
@@ -2505,7 +2584,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
@@ -2611,6 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2650,50 +2730,27 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
- if (cgrp->child_subsys_mask & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
- /*
- * Because css offlining is asynchronous, userland
- * might try to re-enable the same controller while
- * the previous instance is still around. In such
- * cases, wait till it's gone using offline_waitq.
- */
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
-
/* unavailable or not enabled on the parent? */
if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
(cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
ret = -ENOENT;
goto out_unlock;
}
} else if (disable & (1 << ssid)) {
- if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
- if (child->child_subsys_mask & (1 << ssid)) {
+ if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2707,7 +2764,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Except for the root, child_subsys_mask must be zero for a cgroup
+ * Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2773,122 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Create csses for enables and update child_subsys_mask. This
- * changes cgroup_e_css() results which in turn makes the
- * subsequent cgroup_update_dfl_csses() associate all tasks in the
- * subtree to the updated csses.
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ old_sc = cgrp->subtree_control;
+ old_ss = cgrp->child_subsys_mask;
+ new_sc = (old_sc | enable) & ~disable;
+ new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
+
+ css_enable = ~old_ss & new_ss;
+ css_disable = old_ss & ~new_ss;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Because css offlining is asynchronous, userland might try to
+ * re-enable the same controller while the previous instance is
+ * still around. In such cases, wait till it's gone using
+ * offline_waitq.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(css_enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+ }
+
+ cgrp->subtree_control = new_sc;
+ cgrp->child_subsys_mask = new_ss;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
continue;
cgroup_for_each_live_child(child, cgrp) {
- ret = create_css(child, ss);
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
if (ret)
goto err_undo_css;
}
}
- cgrp->child_subsys_mask |= enable;
- cgrp->child_subsys_mask &= ~disable;
-
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
goto err_undo_css;
- /* all tasks are now migrated away from the old csses, kill them */
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+	 * css must not be actively controlling resources and must be in the
+	 * vanilla state if it's made visible again later. Controllers which
+	 * may be depended upon should provide ->css_reset() for this purpose.
+ */
for_each_subsys(ss, ssid) {
if (!(disable & (1 << ssid)))
continue;
- cgroup_for_each_live_child(child, cgrp)
- kill_css(cgroup_css(child, ss));
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+
+ /*
+ * The effective csses of all the descendants (excluding @cgrp) may
+ * have changed. Subsystems can optionally subscribe to this event
+ * by implementing ->css_e_css_changed() which is invoked if any of
+ * the effective csses seen from the css's cgroup may have changed.
+ */
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+ struct cgroup_subsys_state *css;
+
+ if (!ss->css_e_css_changed || !this_css)
+ continue;
+
+ css_for_each_descendant_pre(css, this_css)
+ if (css != this_css)
+ ss->css_e_css_changed(css);
}
kernfs_activate(cgrp->kn);
@@ -2755,8 +2898,8 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->child_subsys_mask &= ~enable;
- cgrp->child_subsys_mask |= disable;
+ cgrp->subtree_control = old_sc;
+ cgrp->child_subsys_mask = old_ss;
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -2764,8 +2907,14 @@ err_undo_css:
cgroup_for_each_live_child(child, cgrp) {
struct cgroup_subsys_state *css = cgroup_css(child, ss);
- if (css)
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
}
}
goto out_unlock;
@@ -2878,9 +3027,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
/*
* This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * limited. Disallow on the default hierarchy.
*/
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return -EPERM;
/*
@@ -2928,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft), 0, cft->kf_ops, cft,
- NULL, false, key);
+ NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
@@ -2964,9 +3113,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
@@ -3024,6 +3173,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
@@ -3109,7 +3261,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -3135,6 +3287,49 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
}
/**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ /*
+	 * If legacy_files_on_dfl, we want to show the legacy files on the
+ * dfl hierarchy but iff the target subsystem hasn't been updated
+ * for the dfl hierarchy yet.
+ */
+ if (!cgroup_legacy_files_on_dfl ||
+ ss->dfl_cftypes != ss->legacy_cftypes) {
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ }
+
+ return cgroup_add_cftypes(ss, cfts);
+}
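
For reference, here is how a hypothetical controller would be wired up against the split registration above. This is a kernel-context sketch, not buildable on its own; every foo_* identifier is invented, while the cgroup_subsys and cftype fields are the real ones used elsewhere in this diff:

static struct cftype foo_dfl_files[] = {
	{
		.name = "foo.current",
		.seq_show = foo_current_show,	/* invented handler */
	},
	{ }	/* terminate */
};

static struct cftype foo_legacy_files[] = {
	{
		.name = "usage_in_foos",
		.read_u64 = foo_usage_read,	/* invented handler */
	},
	{ }	/* terminate */
};

struct cgroup_subsys foo_cgrp_subsys = {
	.css_alloc	= foo_css_alloc,
	.css_free	= foo_css_free,
	.dfl_cftypes	= foo_dfl_files,	/* shown on the default hierarchy */
	.legacy_cftypes	= foo_legacy_files,	/* shown on legacy hierarchies */
};
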
+
+/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
@@ -3611,10 +3806,7 @@ static void *pidlist_allocate(int count)
static void pidlist_free(void *p)
{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- kfree(p);
+ kvfree(p);
}
/*
@@ -3699,8 +3891,9 @@ after:
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if
- * sane_behavior so that no such expectation exists in the new interface.
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3908,7 @@ static pid_t pid_fry(pid_t pid)
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
@@ -3818,7 +4011,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3827,7 +4020,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
@@ -4004,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
- return seq_printf(s, "%d\n", *(int *)v);
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
@@ -4016,7 +4210,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
@@ -4040,7 +4233,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
return 0;
}
-static struct cftype cgroup_base_files[] = {
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
@@ -4052,46 +4246,52 @@ static struct cftype cgroup_base_files[] = {
.mode = S_IRUGO | S_IWUSR,
},
{
- .name = "cgroup.clone_children",
- .flags = CFTYPE_INSANE,
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_root_controllers_show,
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
- .flags = CFTYPE_ONLY_ON_DFL,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
{
.name = "cgroup.populated",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_populated_show,
},
+ { } /* terminate */
+};
- /*
- * Historical crazy stuff. These don't have "cgroup." prefix and
- * don't exist if sane_behavior. If you're depending on these, be
- * prepared to be burned.
- */
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
{
.name = "tasks",
- .flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4302,12 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "notify_on_release",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
- .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
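
From userspace, the default-hierarchy files defined above are plain text knobs. A hedged example that lists the available controllers and then delegates one to child cgroups; the mount point and the "memory" controller name are assumptions about the local setup:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	fd = open("/sys/fs/cgroup/cgroup.controllers", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n < 0)
		return 1;
	buf[n] = '\0';
	printf("available: %s", buf);

	/* enable the (assumed) memory controller for child cgroups;
	 * cgroup_subtree_control_write() parses "+name"/"-name" tokens */
	fd = open("/sys/fs/cgroup/cgroup.subtree_control", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "+memory", strlen("+memory")) < 0)
		perror("subtree_control");
	close(fd);
	return 0;
}
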
@@ -4173,19 +4372,26 @@ static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
- if (css->ss) {
+ percpu_ref_exit(&css->refcnt);
+
+ if (ss) {
/* css free path */
+ int id = css->id;
+
if (css->parent)
css_put(css->parent);
- css->ss->css_free(css);
+ ss->css_free(css);
+ cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
@@ -4231,11 +4437,22 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
- cgroup_idr_remove(&ss->css_idr, css->id);
+ cgroup_idr_replace(&ss->css_idr, NULL, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
} else {
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
}
mutex_unlock(&cgroup_mutex);
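
The RCU comment above pairs with the lookup side later in the file. A condensed, kernel-context sketch of that reader (cgrp_from_kn_sketch is not a real function; error handling is elided):

static struct cgroup *cgrp_from_kn_sketch(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);	/* NULL once released */
	if (cgrp && !cgroup_tryget(cgrp))
		cgrp = NULL;			/* found it, but it's dying */
	rcu_read_unlock();

	return cgrp;
}
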
@@ -4314,12 +4531,14 @@ static void offline_css(struct cgroup_subsys_state *css)
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created.
- * Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4334,7 +4553,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
init_and_link_css(css, ss, cgrp);
- err = percpu_ref_init(&css->refcnt, css_release);
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
@@ -4343,9 +4562,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
goto err_free_percpu_ref;
css->id = err;
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
- if (err)
- goto err_free_id;
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4593,7 @@ err_list_del:
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
- percpu_ref_cancel_init(&css->refcnt);
+ percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
@@ -4385,8 +4606,14 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
+ struct cftype *base_files;
int ssid, ret;
+	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
@@ -4399,7 +4626,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
goto out_unlock;
}
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
@@ -4455,14 +4682,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
if (ret)
goto out_destroy;
/* let's create and online css's */
for_each_subsys(ss, ssid) {
if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss);
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
if (ret)
goto out_destroy;
}
@@ -4470,10 +4703,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
/*
* On the default hierarchy, a child doesn't automatically inherit
- * child_subsys_mask from the parent. Each is configured manually.
+ * subtree_control from the parent. Each is configured manually.
*/
- if (!cgroup_on_dfl(cgrp))
- cgrp->child_subsys_mask = parent->child_subsys_mask;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
kernfs_activate(kn);
@@ -4483,7 +4718,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
out_free_id:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
- percpu_ref_cancel_init(&cgrp->self.refcnt);
+ percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
out_unlock:
@@ -4621,19 +4856,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /* CSS_ONLINE is clear, remove from ->release_list for the last time */
- raw_spin_lock(&release_list_lock);
- if (!list_empty(&cgrp->release_list))
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
-
/*
* Remove @cgrp directory along with the base files. @cgrp has an
* extra ref on its kn.
*/
kernfs_remove(cgrp->kn);
- set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
check_for_release(cgroup_parent(cgrp));
/* put the base reference */
@@ -4650,23 +4878,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
- cgroup_get(cgrp); /* for @kn->priv clearing */
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
-
- /*
- * There are two control paths which try to determine cgroup from
- * dentry without going through kernfs - cgroupstats_build() and
- * css_tryget_online_from_dir(). Those are supported by RCU
- * protecting clearing of cgrp->kn->priv backpointer, which should
- * happen after all files under it have been removed.
- */
- if (!ret)
- RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
- cgroup_put(cgrp);
return ret;
}
@@ -4736,8 +4951,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts =
- { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
@@ -4775,7 +4989,8 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
mutex_lock(&cgroup_mutex);
@@ -4807,10 +5022,26 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (!ss->disabled) {
- cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
}
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4857,12 +5088,9 @@ core_initcall(cgroup_wq_init);
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *path;
int retval;
struct cgroup_root *root;
@@ -4872,14 +5100,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
- retval = 0;
-
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
@@ -4909,11 +5129,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
}
+ retval = 0;
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- put_task_struct(tsk);
-out_free:
kfree(buf);
out:
return retval;
@@ -4984,7 +5203,7 @@ void cgroup_post_fork(struct task_struct *child)
int i;
/*
- * This may race against cgroup_enable_task_cg_links(). As that
+ * This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
@@ -4999,7 +5218,7 @@ void cgroup_post_fork(struct task_struct *child)
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
- * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
@@ -5083,30 +5302,14 @@ void cgroup_exit(struct task_struct *tsk)
}
if (put_cset)
- put_css_set(cset, true);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
- !css_has_online_children(&cgrp->self)) {
- /*
- * Control Group is currently removeable. If it's not
- * already queued for a userspace notification, queue
- * it now
- */
- int need_schedule_work = 0;
-
- raw_spin_lock(&release_list_lock);
- if (!cgroup_is_dead(cgrp) &&
- list_empty(&cgrp->release_list)) {
- list_add(&cgrp->release_list, &release_list);
- need_schedule_work = 1;
- }
- raw_spin_unlock(&release_list_lock);
- if (need_schedule_work)
- schedule_work(&release_agent_work);
- }
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
}
/*
@@ -5134,52 +5337,39 @@ static void check_for_release(struct cgroup *cgrp)
*/
static void cgroup_release_agent(struct work_struct *work)
{
- BUG_ON(work != &release_agent_work);
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
- raw_spin_lock(&release_list_lock);
- while (!list_empty(&release_list)) {
- char *argv[3], *envp[3];
- int i;
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- struct cgroup *cgrp = list_entry(release_list.next,
- struct cgroup,
- release_list);
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathbuf)
- goto continue_free;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto continue_free;
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!agentbuf)
- goto continue_free;
-
- i = 0;
- argv[i++] = agentbuf;
- argv[i++] = path;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- /* Drop the lock while we invoke the usermode helper,
- * since the exec could involve hitting disk and hence
- * be a slow process */
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- mutex_lock(&cgroup_mutex);
- continue_free:
- kfree(pathbuf);
- kfree(agentbuf);
- raw_spin_lock(&release_list_lock);
- }
- raw_spin_unlock(&release_list_lock);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
}
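
The helper execs whatever binary release_agent_path names, with the empty cgroup's path as argv[1] (see the argv setup above). A minimal agent might look like this; the hierarchy mount point is an assumption:

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];

	if (argc < 2)
		return 1;
	/* argv[1] is the path of the now-empty cgroup within its
	 * hierarchy; remove it under the (assumed) mount point */
	snprintf(buf, sizeof(buf), "/sys/fs/cgroup/memory%s", argv[1]);
	return rmdir(buf) ? 1 : 0;
}
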
static int __init cgroup_disable(char *str)
@@ -5205,6 +5395,14 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_set_legacy_files_on_dfl(char *str)
+{
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+}
+__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5231,7 +5429,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
- * protected for this access. See cgroup_rmdir() for details.
+ * protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
@@ -5255,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return idr_find(&ss->css_idr, id);
+ return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
#ifdef CONFIG_CGROUP_DEBUG
@@ -5359,7 +5557,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
@@ -5399,6 +5598,6 @@ static struct cftype debug_files[] = {
struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .base_cftypes = debug_files,
+ .legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
.css_free = freezer_css_free,
.attach = freezer_attach,
.fork = freezer_fork,
- .base_cftypes = files,
+ .legacy_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f442f8..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
ret = hrtimer_nanosleep_restart(restart);
set_fs(oldfs);
- if (ret) {
+ if (ret == -ERESTART_RESTARTBLOCK) {
rmtp = restart->nanosleep.compat_rmtp;
if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,9 +256,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
- if (ret) {
- struct restart_block *restart
- = &current_thread_info()->restart_block;
+ /*
+ * hrtimer_nanosleep() can only return 0 or
+ * -ERESTART_RESTARTBLOCK here because:
+ *
+	 * - we call it with HRTIMER_MODE_REL and therefore exclude the
+ * -ERESTARTNOHAND return path.
+ *
+	 * - we supply the rmtp argument from the task stack (due to
+	 *   the necessary compat conversion), so the update cannot
+	 *   fail, which excludes the -EFAULT return path as well. If
+	 *   it fails nevertheless, we have a bigger problem and won't
+	 *   reach this place anymore.
+ *
+ * - if the return value is 0, we do not have to update rmtp
+ * because there is no remaining time.
+ *
+	 * We check for -ERESTART_RESTARTBLOCK nevertheless, in case the
+	 * core implementation decides to return random nonsense.
+ */
+ if (ret == -ERESTART_RESTARTBLOCK) {
+ struct restart_block *restart = &current->restart_block;
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
@@ -266,7 +284,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
-
return ret;
}
@@ -842,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = compat_clock_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
}
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_KERNEL_XZ=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..72d59a1a6eb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu)
}
/**
- * context_tracking_user_enter - Inform the context tracking that the CPU is going to
- * enter userspace mode.
+ * context_tracking_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
- * to userspace, when it's guaranteed the remaining kernel instructions
- * to execute won't use any RCU read side critical section because this
- * function sets RCU in extended quiescent state.
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
*/
-void context_tracking_user_enter(void)
+void context_tracking_enter(enum ctx_state state)
{
unsigned long flags;
@@ -75,9 +75,8 @@ void context_tracking_user_enter(void)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
- if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if ( __this_cpu_read(context_tracking.state) != state) {
if (__this_cpu_read(context_tracking.active)) {
- trace_user_enter(0);
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
@@ -85,7 +84,10 @@ void context_tracking_user_enter(void)
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- vtime_user_enter(current);
+ if (state == CONTEXT_USER) {
+ trace_user_enter(0);
+ vtime_user_enter(current);
+ }
rcu_user_enter();
}
/*
@@ -101,64 +103,32 @@ void context_tracking_user_enter(void)
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
- __this_cpu_write(context_tracking.state, IN_USER);
+ __this_cpu_write(context_tracking.state, state);
}
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(context_tracking_enter);
+EXPORT_SYMBOL_GPL(context_tracking_enter);
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+void context_tracking_user_enter(void)
{
- enum ctx_state prev_ctx;
-
- if (likely(!preemptible()))
- return;
-
- /*
- * Need to disable preemption in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- preempt_disable_notrace();
- prev_ctx = exception_enter();
- preempt_enable_no_resched_notrace();
-
- preempt_schedule();
-
- preempt_disable_notrace();
- exception_exit(prev_ctx);
- preempt_enable_notrace();
+ context_tracking_enter(CONTEXT_USER);
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
+NOKPROBE_SYMBOL(context_tracking_user_enter);
/**
- * context_tracking_user_exit - Inform the context tracking that the CPU is
- * exiting userspace mode and entering the kernel.
+ * context_tracking_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
- * This function must be called after we entered the kernel from userspace
- * before any use of RCU read side critical section. This potentially include
- * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ * This function must be called after we entered the kernel from user or
+ * guest space before any use of RCU read side critical section. This
+ * potentially includes any high level kernel code like syscalls, exceptions,
+ * signal handling, etc...
*
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void context_tracking_user_exit(void)
+void context_tracking_exit(enum ctx_state state)
{
unsigned long flags;
@@ -169,20 +139,29 @@ void context_tracking_user_exit(void)
return;
local_irq_save(flags);
- if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (__this_cpu_read(context_tracking.state) == state) {
if (__this_cpu_read(context_tracking.active)) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
- vtime_user_exit(current);
- trace_user_exit(0);
+ if (state == CONTEXT_USER) {
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
}
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(context_tracking_exit);
+
+void context_tracking_user_exit(void)
+{
+ context_tracking_exit(CONTEXT_USER);
+}
NOKPROBE_SYMBOL(context_tracking_user_exit);
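
With the state argument factored out, a guest-side caller reduces to the same shape as the user-mode wrappers above. A sketch, assuming a CONTEXT_GUEST state exists alongside CONTEXT_USER and CONTEXT_KERNEL (it is not part of this hunk):

static inline void guest_enter_sketch(void)
{
	context_tracking_enter(CONTEXT_GUEST);	/* CONTEXT_GUEST assumed */
}

static inline void guest_exit_sketch(void)
{
	context_tracking_exit(CONTEXT_GUEST);
}
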
/**
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <linux/tick.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -58,20 +59,23 @@ static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
+ /* wait queue to wake up the active_writer */
+ wait_queue_head_t wq;
+	/* ensures that no writer can become active while readers are active */
+ struct mutex lock;
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
- int refcount;
+ atomic_t refcount;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
.active_writer = NULL,
+ .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
.dep_map = {.name = "cpu_hotplug.lock" },
#endif
@@ -79,9 +83,12 @@ static struct {
/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire_tryread() \
+ lock_map_acquire_tryread(&cpu_hotplug.dep_map)
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
void get_online_cpus(void)
{
might_sleep();
@@ -89,24 +96,38 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
- cpu_hotplug.refcount++;
+ atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
-
}
EXPORT_SYMBOL_GPL(get_online_cpus);
+bool try_get_online_cpus(void)
+{
+ if (cpu_hotplug.active_writer == current)
+ return true;
+ if (!mutex_trylock(&cpu_hotplug.lock))
+ return false;
+ cpuhp_lock_acquire_tryread();
+ atomic_inc(&cpu_hotplug.refcount);
+ mutex_unlock(&cpu_hotplug.lock);
+ return true;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
+
void put_online_cpus(void)
{
+ int refcount;
+
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
- if (WARN_ON(!cpu_hotplug.refcount))
- cpu_hotplug.refcount++; /* try to fix things up */
+ refcount = atomic_dec_return(&cpu_hotplug.refcount);
+ if (WARN_ON(refcount < 0)) /* try to fix things up */
+ atomic_inc(&cpu_hotplug.refcount);
+
+ if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+ wake_up(&cpu_hotplug.wq);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
cpuhp_lock_release();
}
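
The new scheme is easier to see in a userspace analog: readers bump an atomic count under a mutex, and the writer holds the mutex while draining the count on a wait primitive. A sketch with POSIX threads standing in for the kernel's waitqueue (the active_writer recursion check is omitted):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static atomic_int refcount;

static void get_online_cpus_sketch(void)
{
	pthread_mutex_lock(&lock);	/* blocks while a writer is active */
	atomic_fetch_add(&refcount, 1);
	pthread_mutex_unlock(&lock);
}

static void put_online_cpus_sketch(void)
{
	if (atomic_fetch_sub(&refcount, 1) == 1) {	/* count hit zero */
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&wq);		/* wake the writer */
		pthread_mutex_unlock(&lock);
	}
}

static void hotplug_begin_sketch(void)
{
	pthread_mutex_lock(&lock);	/* no new readers from here on */
	while (atomic_load(&refcount) > 0)
		pthread_cond_wait(&wq, &lock);
	/* returns with the lock held, mirroring cpu_hotplug_begin() */
}

static void hotplug_done_sketch(void)
{
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	get_online_cpus_sketch();
	put_online_cpus_sketch();
	hotplug_begin_sketch();
	hotplug_done_sketch();
	puts("ok");
	return 0;
}
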
@@ -136,17 +157,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
*/
void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ DEFINE_WAIT(wait);
+ cpu_hotplug.active_writer = current;
cpuhp_lock_acquire();
+
for (;;) {
mutex_lock(&cpu_hotplug.lock);
- if (likely(!cpu_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (likely(!atomic_read(&cpu_hotplug.refcount)))
+ break;
mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+ finish_wait(&cpu_hotplug.wq, &wait);
}
void cpu_hotplug_done(void)
@@ -274,21 +298,28 @@ void clear_tasks_mm_cpumask(int cpu)
rcu_read_unlock();
}
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
{
- struct task_struct *p;
- cputime_t utime, stime;
+ struct task_struct *g, *p;
- write_lock_irq(&tasklist_lock);
- for_each_process(p) {
- task_cputime(p, &utime, &stime);
- if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (utime || stime))
- pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
- p->comm, task_pid_nr(p), cpu,
- p->state, p->flags);
- }
- write_unlock_irq(&tasklist_lock);
+ read_lock_irq(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!p->on_rq)
+ continue;
+ /*
+ * We do the check with unlocked task_rq(p)->lock.
+			 * Order the reads so that we do not warn about a
+			 * task which was running on this cpu in the past
+			 * and has just been woken on another cpu.
+ */
+ rmb();
+ if (task_cpu(p) != dead_cpu)
+ continue;
+
+ pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+ p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+ } while_each_thread(g, p);
+ read_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
@@ -308,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Give up timekeeping duties */
+ tick_handover_do_timer();
/* Park the stopper thread */
kthread_park(current);
return 0;
@@ -378,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
*
* Wait for the stop thread to go away.
*/
- while (!idle_cpu(cpu))
+ while (!per_cpu(cpu_dead_idle, cpu))
cpu_relax();
+ smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+ per_cpu(cpu_dead_idle, cpu) = false;
+ hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
+ tick_cleanup_dead_cpu(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
@@ -416,6 +453,37 @@ out:
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
+/*
+ * Unpark per-CPU smpboot kthreads at CPU-online time.
+ */
+static int smpboot_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_ONLINE:
+ smpboot_unpark_threads(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block smpboot_thread_notifier = {
+ .notifier_call = smpboot_thread_call,
+ .priority = CPU_PRI_SMPBOOT,
+};
+
+void smpboot_thread_init(void)
+{
+ register_cpu_notifier(&smpboot_thread_notifier);
+}
+
/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
@@ -455,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- /* Wake the per cpu threads */
- smpboot_unpark_threads(cpu);
-
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+	 * The effective masks are the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+	 * On legacy hierarchy:
+ *
+	 * The user-configured masks are always the same as the effective masks.
+ */
+
+	/* user-configured CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+	/* effective CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
/*
* This is old Memory Nodes tasks took on.
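
The effective-mask rule documented in the comment block above is compact enough to model directly; a toy with plain unsigned ints standing in for cpumasks:

#include <stdio.h>

/* effective = configured & parent's effective; if that intersection is
 * empty, inherit the parent's effective mask instead */
static unsigned int effective_cpus(unsigned int configured,
				   unsigned int parent_effective)
{
	unsigned int eff = configured & parent_effective;

	return eff ? eff : parent_effective;
}

int main(void)
{
	printf("0x%x\n", effective_cpus(0x3, 0xc));	/* empty -> 0xc */
	printf("0x%x\n", effective_cpus(0x6, 0xc));	/* 0x4 */
	return 0;
}
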
@@ -222,34 +248,34 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
*
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -258,7 +284,7 @@ static struct cpuset top_cpuset = {
*/
static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
/*
* CPU / memory hotplug is handled asynchronously.
@@ -303,13 +329,13 @@ static struct file_system_type cpuset_fs_type = {
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
/*
@@ -321,31 +347,32 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
{
if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
+ task_set_spread_page(tsk);
else
- tsk->flags &= ~PF_SPREAD_PAGE;
+ task_clear_spread_page(tsk);
+
if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
+ task_set_spread_slab(tsk);
else
- tsk->flags &= ~PF_SPREAD_SLAB;
+ task_clear_spread_slab(tsk);
}
/*
@@ -376,13 +403,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
}
/**
@@ -391,6 +425,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
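
(Taken together, alloc_trial_cpuset()/free_trial_cpuset() enable a
copy-validate-commit pattern: mutate a snapshot, validate it against the
hierarchy, and only then publish under callback_lock. A hedged sketch of a
setter built on these helpers — the function name is invented; the real flow
is update_cpumask() later in this patch:)

	/* Illustrative only; caller must hold cpuset_mutex. */
	static int example_set_cpus(struct cpuset *cs, const char *buf)
	{
		struct cpuset *trialcs;
		int retval;

		trialcs = alloc_trial_cpuset(cs);	/* snapshot current state */
		if (!trialcs)
			return -ENOMEM;

		retval = cpulist_parse(buf, trialcs->cpus_allowed); /* mutate the copy */
		if (retval)
			goto out;

		retval = validate_change(cs, trialcs);	/* check against hierarchy */
		if (retval)
			goto out;

		spin_lock_irq(&callback_lock);		/* commit atomically */
		cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
		spin_unlock_irq(&callback_lock);
	out:
		free_trial_cpuset(trialcs);
		return retval;
	}
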
@@ -436,9 +471,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
- /* We must be a subset of our parent cpuset */
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -471,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
goto out;
}
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) &&
+ !cpuset_cpumask_can_shrink(cur->cpus_allowed,
+ trial->cpus_allowed))
+ goto out;
+
ret = 0;
out:
rcu_read_unlock();
@@ -480,11 +525,11 @@ out:
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
@@ -503,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs)
- continue;
-
/* skip the whole subtree if @cp doesn't have any CPU */
if (cpumask_empty(cp->cpus_allowed)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -580,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -589,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
+ if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+ goto done;
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -601,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ non_isolated_cpus);
goto done;
}
@@ -624,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
* the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- !is_sched_load_balance(cp))
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
continue;
if (is_sched_load_balance(cp))
@@ -705,7 +754,8 @@ restart:
struct cpuset *b = csa[j];
if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
+ cpumask_and(dp, dp, non_isolated_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -718,6 +768,7 @@ restart:
BUG_ON(nslot != ndoms);
done:
+ free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -757,7 +808,7 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -781,45 +832,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +842,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}
/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When the configured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}
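
(The loop above uses a common cgroup idiom: the walk runs under RCU, but
before doing work that can sleep it pins the css with css_tryget_online(),
drops rcu_read_lock(), does the work, then re-enters RCU and releases the
reference. A sketch of the skeleton, stripped of the cpuset-specific work;
illustrative only:)

	static void example_walk(struct cpuset *root)
	{
		struct cpuset *cp;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cp, pos_css, root) {
			if (!css_tryget_online(&cp->css))
				continue;	/* being destroyed: skip it */
			rcu_read_unlock();	/* now safe to sleep */

			/* ... potentially sleeping work on @cp goes here ... */

			rcu_read_lock();	/* resume the RCU-protected walk */
			css_put(&cp->css);
		}
		rcu_read_unlock();
	}
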
/**
@@ -889,7 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -908,7 +946,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}
@@ -920,16 +959,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- is_load_balanced = is_sched_load_balance(trialcs);
-
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
- update_tasks_cpumask_hier(cs, true);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}
@@ -951,15 +986,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}
@@ -1028,13 +1061,12 @@ static void *cpuset_being_rebound;
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1109,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
}
/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When the configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hierarchy, effective_mems will be the same as mems_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ spin_lock_irq(&callback_lock);
+ cp->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
update_tasks_nodemask(cp);
rcu_read_lock();
@@ -1123,7 +1171,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1156,8 +1204,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
@@ -1170,11 +1218,12 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
- update_tasks_nodemask_hier(cs, true);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
@@ -1262,9 +1311,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1389,27 +1438,15 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
mutex_lock(&cpuset_mutex);
- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
cgroup_taskset_for_each(task, tset) {
- /*
- * Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
- * affinity and isolating such threads by their set of
- * allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for
- * success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
- */
- ret = -EINVAL;
- if (task->flags & PF_NO_SETAFFINITY)
+ ret = task_can_attach(task, cs->cpus_allowed);
+ if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
@@ -1452,8 +1489,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct task_struct *leader = cgroup_taskset_first(tset);
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
mutex_lock(&cpuset_mutex);
@@ -1461,9 +1496,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, tset) {
/*
@@ -1480,11 +1515,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
@@ -1495,7 +1528,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
@@ -1516,6 +1549,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1678,35 +1713,28 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
- ssize_t count;
- char *buf, *s;
int ret = 0;
- count = seq_get_buf(sf, &buf);
- s = buf;
-
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
- s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
break;
case FILE_MEMLIST:
- s += nodelist_scnprintf(s, count, cs->mems_allowed);
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
+ break;
+ case FILE_EFFECTIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
default:
ret = -EINVAL;
- goto out_unlock;
}
- if (s < buf + count - 1) {
- *s++ = '\n';
- seq_commit(sf, s - buf);
- } else {
- seq_commit(sf, -1);
- }
-out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
return ret;
}
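
(The %*pbl and %*pb conversions above rely on the bitmap printf extension:
the field width carries the number of bits and the pointer the bitmap
itself, which is exactly the pair cpumask_pr_args()/nodemask_pr_args()
expand to. A minimal illustrative helper, not from the patch:)

	/* Emits a ranged list such as "0-3,8" plus a newline. */
	static void example_show_cpus(struct seq_file *sf, const struct cpumask *mask)
	{
		/* cpumask_pr_args(mask) expands to: nr_cpu_ids, cpumask_bits(mask) */
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(mask));
	}
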
@@ -1779,6 +1807,18 @@ static struct cftype files[] = {
},
{
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
@@ -1869,18 +1909,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1951,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
+ spin_lock_irq(&callback_lock);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ spin_unlock_irq(&callback_lock);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1928,10 +1983,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -1962,20 +2019,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ spin_lock_irq(&callback_lock);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ mutex_unlock(&cpuset_mutex);
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .base_cftypes = files,
- .early_init = 1,
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
};
/**
@@ -1990,9 +2067,13 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2116,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources.
+ * This is a full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -2045,11 +2186,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2064,51 +2204,20 @@ retry:
goto retry;
}
- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs);
-
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}
/**
@@ -2132,6 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
mutex_lock(&cpuset_mutex);
@@ -2139,22 +2249,26 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
+ spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
+ spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -2228,6 +2342,9 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
@@ -2244,23 +2361,19 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
+ unsigned long flags;
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
+ guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();
/*
@@ -2282,7 +2395,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
*/
}
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
}
@@ -2299,15 +2412,14 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;
+ unsigned long flags;
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
+ guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -2326,7 +2438,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2337,44 +2449,28 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
- * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
* unless the task has been OOM killed as is marked TIF_MEMDIE.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2391,20 +2487,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
*/
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ if (in_interrupt())
return 1;
- might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2420,55 +2511,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
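
(A hedged sketch of the allocator-side use the comment above describes —
the helper name is invented; the real caller is get_page_from_freelist()
in mm/page_alloc.c:)

	/* Invented helper showing how a zone scan consumes the predicate above. */
	static bool example_node_usable(int nid, gfp_t gfp_mask)
	{
		/*
		 * GFP_USER carries __GFP_HARDWALL and so stays inside
		 * current->mems_allowed; GFP_KERNEL does not, and may be
		 * satisfied from the nearest hardwalled ancestor cpuset.
		 */
		return __cpuset_node_allowed(nid, gfp_mask);
	}
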
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- return 0;
-}
-
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
@@ -2544,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
-#define CPUSET_NODELIST_LEN (256)
-
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @tsk: pointer to task_struct of some task.
@@ -2555,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- /* Statically allocated to prevent using excess stack. */
- static char cpuset_nodelist[CPUSET_NODELIST_LEN];
- static DEFINE_SPINLOCK(cpuset_buffer_lock);
struct cgroup *cgrp;
- spin_lock(&cpuset_buffer_lock);
rcu_read_lock();
cgrp = task_cs(tsk)->css.cgroup;
- nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
- tsk->mems_allowed);
pr_info("%s cpuset=", tsk->comm);
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+ pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
rcu_read_unlock();
- spin_unlock(&cpuset_buffer_lock);
}
/*
@@ -2617,10 +2661,9 @@ void __cpuset_memory_pressure_bump(void)
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
@@ -2630,24 +2673,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
if (!p)
- goto out_put_task;
+ goto out_free;
seq_puts(m, p);
seq_putc(m, '\n');
retval = 0;
-out_put_task:
- put_task_struct(tsk);
out_free:
kfree(buf);
out:
@@ -2658,10 +2693,8 @@ out:
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_puts(m, "Mems_allowed:\t");
- seq_nodemask(m, &task->mems_allowed);
- seq_puts(m, "\n");
- seq_puts(m, "Mems_allowed_list:\t");
- seq_nodemask_list(m, &task->mems_allowed);
- seq_puts(m, "\n");
+ seq_printf(m, "Mems_allowed:\t%*pb\n",
+ nodemask_pr_args(&task->mems_allowed));
+ seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+ nodemask_pr_args(&task->mems_allowed));
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
* it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
*/
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+EXPORT_SYMBOL_GPL(elfcorehdr_addr);
/*
* stores the size of elf header of crash image
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@
static struct kmem_cache *cred_jar;
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
/*
* The initial credentials for the initial task
*/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..0874e2edd275 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
* version 2. This program is licensed "as is" without any warranty of any
* kind, whether express or implied.
*/
+
+#define pr_fmt(fmt) "KGDB: " fmt
+
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
#include <linux/serial_core.h>
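
(An aside on the pr_fmt() define added above: pr_err(), pr_info() and
friends expand their format string through pr_fmt() before it reaches
printk(), which is why the hand-written "KGDB: " prefixes can be dropped
from every call site below. A standalone illustration, not from the patch:)

	/* pr_fmt() must be defined before printk.h is pulled in. */
	#define pr_fmt(fmt) "KGDB: " fmt

	#include <linux/printk.h>

	static void example(void)
	{
		/* Emits "KGDB: BP install failed: deadbeef" at KERN_ERR level. */
		pr_err("BP install failed: %lx\n", 0xdeadbeefUL);
	}
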
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
return err;
err = kgdb_arch_remove_breakpoint(&tmp);
if (err)
- printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
- "memory destroyed at: %lx", addr);
+ pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
+ addr);
return err;
}
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
if (error) {
ret = error;
- printk(KERN_INFO "KGDB: BP install failed: %lx",
- kgdb_break[i].bpt_addr);
+ pr_info("BP install failed: %lx\n",
+ kgdb_break[i].bpt_addr);
continue;
}
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
continue;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error) {
- printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
- kgdb_break[i].bpt_addr);
+ pr_info("BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
ret = error;
}
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
goto setundefined;
error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
if (error)
- printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+ pr_err("breakpoint remove failed: %lx\n",
kgdb_break[i].bpt_addr);
setundefined:
kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
if (print_wait) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+ pr_crit("waiting... or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+ pr_crit("Waiting for remote debugger\n");
#endif
}
return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
exception_level = 0;
kgdb_skipexception(ks->ex_vector, ks->linux_regs);
dbg_activate_sw_breakpoints();
- printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
- addr);
+ pr_crit("re-enter error: breakpoint removed %lx\n", addr);
WARN_ON_ONCE(1);
return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
panic("Recursive entry to debugger");
}
- printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+ pr_crit("re-enter exception: ALL breakpoints killed\n");
#ifdef CONFIG_KGDB_KDB
/* Allow kdb to debug itself one level */
return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
@@ -689,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
+ /*
+ * Avoid entering the debugger if we were triggered due to an oops
+ * but panic_timeout indicates the system should automatically
+ * reboot on panic. We don't want to get stuck waiting for input
+ * on such systems, especially if it's "just" an oops.
+ */
+ if (signo != SIGTRAP && panic_timeout)
+ return 1;
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
@@ -795,15 +810,15 @@ static struct console kgdbcons = {
static void sysrq_handle_dbg(int key)
{
if (!dbg_io_ops) {
- printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+ pr_crit("ERROR: No KGDB I/O module available\n");
return;
}
if (!kgdb_connected) {
#ifdef CONFIG_KGDB_KDB
if (!dbg_kdb_mode)
- printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+ pr_crit("KGDB or $3#33 for KDB\n");
#else
- printk(KERN_CRIT "Entering KGDB\n");
+ pr_crit("Entering KGDB\n");
#endif
}
@@ -821,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self,
unsigned long val,
void *data)
{
+ /*
+ * Avoid entering the debugger if we were triggered due to a panic.
+ * We don't want to get stuck waiting for input from the user in that case.
+ * panic_timeout indicates the system should automatically
+ * reboot on panic.
+ */
+ if (panic_timeout)
+ return NOTIFY_DONE;
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", (char *)data);
kgdb_breakpoint();
@@ -945,7 +969,7 @@ static void kgdb_initial_breakpoint(void)
{
kgdb_break_asap = 0;
- printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+ pr_crit("Waiting for connection from remote gdb...\n");
kgdb_breakpoint();
}
@@ -964,8 +988,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
if (dbg_io_ops) {
spin_unlock(&kgdb_registration_lock);
- printk(KERN_ERR "kgdb: Another I/O driver is already "
- "registered with KGDB.\n");
+ pr_err("Another I/O driver is already registered with KGDB\n");
return -EBUSY;
}
@@ -981,8 +1004,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
- new_dbg_io_ops->name);
+ pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
@@ -1017,8 +1039,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
- printk(KERN_INFO
- "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+ pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 70a504601dc3..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
bp->bph_length = 1;
if ((argc + 1) != nextarg) {
- if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
bp->bp_type = BP_ACCESS_WATCHPOINT;
- else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
bp->bp_type = BP_WRITE_WATCHPOINT;
- else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
bp->bp_type = BP_HARDWARE_BREAKPOINT;
else
return KDB_ARGCOUNT;
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0, KDB_REPEAT_NONE);
-
- kdb_register_repeat("ss", kdb_ss, "",
- "Single Step", 1, KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+
+ kdb_register_flags("ss", kdb_ss, "",
+ "Single Step", 1,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
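
(For reference, the shape of a conversion target: kdb_register_flags()
takes the old kdb_register_repeat() arguments, but the final parameter now
carries both permission and repeat bits. A hedged sketch registering an
invented command — "mycmd" and its handler do not exist in the tree:)

	static int kdb_mycmd(int argc, const char **argv)
	{
		kdb_printf("mycmd ran with %d argument(s)\n", argc);
		return 0;
	}

	static void example_register(void)
	{
		/* Allowed wherever memory reads are permitted; repeats w/o args. */
		kdb_register_flags("mycmd", kdb_mycmd, "[<vaddr>]",
				   "Example command", 0,
				   KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
	}
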
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..fc1ef736253c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -439,7 +439,7 @@ poll_again:
* substituted for %d, %x or %o in the prompt.
*/
-char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
@@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
-int vkdb_printf(const char *fmt, va_list ap)
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
int linecount;
@@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap)
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ /*
+ * This was an interactive search (using '/' at the more
+ * prompt) and it has completed. Clear the flag.
+ */
+ kdb_grepping_flag = 0;
/*
* at this point the string is a full line and
* should be printed, up to the null.
@@ -691,19 +697,20 @@ kdb_printit:
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
+ cp = (char *) printk_skip_level(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
- gdbstub_msg_write(kdb_buffer, retlen);
+ gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen;
- cp = kdb_buffer;
+ len = retlen - (cp - kdb_buffer);
+ cp2 = cp;
while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
+ dbg_io_ops->write_char(*cp2);
+ cp2++;
}
}
while (c) {
- c->write(c, kdb_buffer, retlen);
+ c->write(c, cp, retlen - (cp - kdb_buffer));
touch_nmi_watchdog();
c = c->next;
}
@@ -711,7 +718,10 @@ kdb_printit:
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
- printk(KERN_INFO "%s", kdb_buffer);
+ if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+ printk("%s", kdb_buffer);
+ else
+ pr_info("%s", kdb_buffer);
}
if (KDB_STATE(PAGER)) {
@@ -794,11 +804,23 @@ kdb_printit:
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ kdb_printf("\r");
+ kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+ kdbgetenv("SEARCHPROMPT") ?: "search> ");
+ *strchrnul(kdb_grep_string, '\n') = '\0';
+ kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+ suspend_grep = 1; /* for this recursion */
} else if (buf1[0] && buf1[0] != '\n') {
/* user hit something other than enter */
suspend_grep = 1; /* for this recursion */
- kdb_printf("\nOnly 'q' or 'Q' are processed at more "
- "prompt, input ignored\n");
+ if (buf1[0] != '/')
+ kdb_printf(
+ "\nOnly 'q', 'Q' or '/' are processed at "
+ "more prompt, input ignored\n");
+ else
+ kdb_printf("\n'/' cannot be used during | "
+ "grep filtering, input ignored\n");
} else if (kdb_grepping_flag) {
/* user hit enter */
suspend_grep = 1; /* for this recursion */
@@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...)
int r;
va_start(ap, fmt);
- r = vkdb_printf(fmt, ap);
+ r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
va_end(ap);
return r;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760305ca..4121345498e0 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
*/
#include <linux/ctype.h>
+#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
@@ -42,8 +44,13 @@
#include <linux/slab.h>
#include "kdb_private.h"
-#define GREP_LEN 256
-char kdb_grep_string[GREP_LEN];
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "kdb."
+
+static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
+module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
+
+char kdb_grep_string[KDB_GREP_STRLEN];
int kdb_grepping_flag;
EXPORT_SYMBOL(kdb_grepping_flag);
int kdb_grep_leading;
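
(The MODULE_PARAM_PREFIX override above namespaces the knob, so it shows up
as kdb.cmd_enable on the kernel command line and as
/sys/module/kdb/parameters/cmd_enable at run time. A standalone illustration
with invented names:)

	#include <linux/moduleparam.h>

	#undef MODULE_PARAM_PREFIX
	#define MODULE_PARAM_PREFIX "mydbg."

	static int mydbg_enable;
	/* Set with mydbg.enable=1 at boot, or via /sys/module/mydbg/parameters/enable */
	module_param_named(enable, mydbg_enable, int, 0600);
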
@@ -121,6 +128,7 @@ static kdbmsg_t kdbmsgs[] = {
KDBMSG(BADLENGTH, "Invalid length field"),
KDBMSG(NOBP, "No Breakpoint exists"),
KDBMSG(BADADDR, "Invalid address"),
+ KDBMSG(NOPERM, "Permission denied"),
};
#undef KDBMSG
@@ -188,6 +196,26 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
+ * Check whether the flags of the current command and the permissions
+ * of the kdb console allow a command to be run.
+ */
+static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+ bool no_args)
+{
+ /* permissions comes from userspace so needs massaging slightly */
+ permissions &= KDB_ENABLE_MASK;
+ permissions |= KDB_ENABLE_ALWAYS_SAFE;
+
+ /* some commands change group when launched with no arguments */
+ if (no_args)
+ permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
+
+ flags |= KDB_ENABLE_ALL;
+
+ return permissions & flags;
+}
+
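+/*
+ * (Worked illustration of the check above, not part of the patch: suppose
+ * the console permits only memory reads. After the massaging, permissions
+ * is KDB_ENABLE_MEM_READ | KDB_ENABLE_ALWAYS_SAFE, and every command's
+ * flags gain KDB_ENABLE_ALL, so an inspection command passes while a
+ * flow-control command is refused:
+ *
+ *	int perm = KDB_ENABLE_MEM_READ;		// what the console allows
+ *
+ *	// memory-read command: shared bit -> allowed (returns true)
+ *	kdb_check_flags(KDB_ENABLE_MEM_READ, perm, false);
+ *
+ *	// breakpoint-style command: no shared bit -> denied (returns false)
+ *	kdb_check_flags(KDB_ENABLE_FLOW_CTRL, perm, false);
+ */
+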
+/*
* kdbgetenv - This function will return the character string value of
* an environment variable.
* Parameters:
@@ -476,6 +504,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
kdb_symtab_t symtab;
/*
+ * If the enable flags prohibit both arbitrary memory access
+ * and flow control then there are no reasonable grounds to
+ * provide symbol lookup.
+ */
+ if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
+ kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
* Process arguments which follow the following syntax:
*
* symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +678,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
if (!s->count)
s->usable = 0;
if (s->usable)
- kdb_register(s->name, kdb_exec_defcmd,
- s->usage, s->help, 0);
+ /* macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is
+ * safety checked individually.
+ */
+ kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
+ s->help, 0,
+ KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
if (!s->usable)
@@ -827,7 +869,7 @@ static void parse_grep(const char *str)
len = strlen(cp);
if (!len)
return;
- if (len >= GREP_LEN) {
+ if (len >= KDB_GREP_STRLEN) {
kdb_printf("search string too long\n");
return;
}
@@ -872,13 +914,12 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep;
+ int i, escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
*/
cp = (char *)cmdstr;
- kdb_grepping_flag = check_grep = 0;
if (KDB_FLAG(CMD_INTERRUPT)) {
/* Previous command was interrupted, newline must not
@@ -1003,25 +1044,22 @@ int kdb_parse(const char *cmdstr)
if (i < kdb_max_commands) {
int result;
+
+ if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ return KDB_NOPERM;
+
KDB_STATE_SET(CMD);
result = (*tp->cmd_func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- switch (tp->cmd_repeat) {
- case KDB_REPEAT_NONE:
- argc = 0;
- if (argv[0])
- *(argv[0]) = '\0';
- break;
- case KDB_REPEAT_NO_ARGS:
- argc = 1;
- if (argv[1])
- *(argv[1]) = '\0';
- break;
- case KDB_REPEAT_WITH_ARGS:
- break;
- }
+
+ if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ return result;
+
+ argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ if (argv[argc])
+ *(argv[argc]) = '\0';
return result;
}
@@ -1207,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
instruction_pointer(regs));
- kdb_dumpregs(regs);
break;
case KDB_REASON_SSTEP:
case KDB_REASON_BREAK:
@@ -1241,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
kdb_nextline = 1;
KDB_STATE_CLEAR(SUPPRESS);
+ kdb_grepping_flag = 0;
+ /* ensure the old search does not leak into '/' commands */
+ kdb_grep_string[0] = '\0';
cmdbuf = cmd_cur;
*cmdbuf = '\0';
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
*/
static int kdb_sr(int argc, const char **argv)
{
+ bool check_mask =
+ !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
+
if (argc != 1)
return KDB_ARGCOUNT;
+
kdb_trap_printk++;
- __handle_sysrq(*argv[1], false);
+ __handle_sysrq(*argv[1], check_mask);
kdb_trap_printk--;
return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4ld ", module_refcount(mod));
+ kdb_printf("%4d ", module_refcount(mod));
#endif
if (mod->state == MODULE_STATE_GOING)
kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
return 0;
if (!kt->cmd_name)
continue;
+ if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ continue;
if (strlen(kt->cmd_usage) > 20)
space = "\n ";
kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2472,7 +2520,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
static void kdb_sysinfo(struct sysinfo *val)
{
struct timespec uptime;
- do_posix_clock_monotonic_gettime(&uptime);
+ ktime_get_ts(&uptime);
memset(val, 0, sizeof(*val));
val->uptime = uptime.tv_sec;
val->loads[0] = avenrun[0];
@@ -2535,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv)
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
"Buffers: %8lu kB\n",
- val.totalram, val.freeram, val.bufferram);
+ K(val.totalram), K(val.freeram), K(val.bufferram));
return 0;
}
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
}
/*
- * kdb_register_repeat - This function is used to register a kernel
+ * kdb_register_flags - This function is used to register a kernel
* debugger command.
* Inputs:
* cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
* zero for success, one if a duplicate command.
*/
#define kdb_command_extend 50 /* arbitrary */
-int kdb_register_repeat(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_repeat_t repeat)
+int kdb_register_flags(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_cmdflags_t flags)
{
int i;
kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
kp->cmd_func = func;
kp->cmd_usage = usage;
kp->cmd_help = help;
- kp->cmd_flags = 0;
kp->cmd_minlen = minlen;
- kp->cmd_repeat = repeat;
+ kp->cmd_flags = flags;
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_repeat);
+EXPORT_SYMBOL_GPL(kdb_register_flags);
/*
* kdb_register - Compatibility register function for commands that do
* not need to specify a repeat state. Equivalent to
- * kdb_register_repeat with KDB_REPEAT_NONE.
+ * kdb_register_flags with flags set to 0.
* Inputs:
* cmd Command name
* func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
char *help,
short minlen)
{
- return kdb_register_repeat(cmd, func, usage, help, minlen,
- KDB_REPEAT_NONE);
+ return kdb_register_flags(cmd, func, usage, help, minlen, 0);
}
EXPORT_SYMBOL_GPL(kdb_register);
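For external callers the conversion is mechanical. A hypothetical module command that only reads memory might now be registered like this (a sketch with invented mymod_* names, not part of this patch):

#include <linux/kdb.h>
#include <linux/module.h>

/* Hypothetical command handler: all names below are invented. */
static int kdb_mystats(int argc, const char **argv)
{
	kdb_printf("mystats: called with %d argument(s)\n", argc);
	return 0;
}

static int __init mymod_init(void)
{
	/* The command reads memory, so gate it behind the MEM_READ grant. */
	return kdb_register_flags("mystats", kdb_mystats, "",
				  "Dump mymod statistics", 0,
				  KDB_ENABLE_MEM_READ);
}

static void __exit mymod_exit(void)
{
	kdb_unregister("mystats");
}

module_init(mymod_init);
module_exit(mymod_exit);
MODULE_LICENSE("GPL");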
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
for_each_kdbcmd(kp, i)
kp->cmd_name = NULL;
- kdb_register_repeat("md", kdb_md, "<vaddr>",
+ kdb_register_flags("md", kdb_md, "<vaddr>",
"Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("rd", kdb_rd, "",
- "Display Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("btt", kdb_bt, "<vaddr>",
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0,
+ KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1,
+ KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("rd", kdb_rd, "",
+ "Display Registers", 0,
+ KDB_ENABLE_REG_READ);
+ kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0,
+ KDB_ENABLE_REG_WRITE);
+ kdb_register_flags("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1,
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
- KDB_REPEAT_NONE);
- kdb_register_repeat("env", kdb_env, "",
- "Show environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("set", kdb_set, "",
- "Set environment variables", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("help", kdb_help, "",
- "Display Help Message", 1, KDB_REPEAT_NONE);
- kdb_register_repeat("?", kdb_help, "",
- "Display Help Message", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("env", kdb_env, "",
+ "Show environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("set", kdb_set, "",
+ "Set environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("help", kdb_help, "",
+ "Display Help Message", 1,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("?", kdb_help, "",
+ "Display Help Message", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0,
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, 0);
+ kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0,
+ KDB_ENABLE_REBOOT);
#if defined(CONFIG_MODULES)
- kdb_register_repeat("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0,
+ KDB_ENABLE_INSPECT);
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_repeat("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
#endif
if (arch_kgdb_ops.enable_nmi) {
- kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
- }
- kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("summary", kdb_summary, "",
- "Summarize the system", 4, KDB_REPEAT_NONE);
- kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3, KDB_REPEAT_NONE);
- kdb_register_repeat("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ }
+ kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0,
+ KDB_ENABLE_SIGNAL);
+ kdb_register_flags("summary", kdb_summary, "",
+ "Summarize the system", 4,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..75014d7f4568 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
kdb_func_t cmd_func; /* Function to execute command */
char *cmd_usage; /* Usage String for this command */
char *cmd_help; /* Help message for this command */
- short cmd_flags; /* Parsing flags */
short cmd_minlen; /* Minimum legal # command
* chars required */
- kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
+ kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
} kdbtab_t;
extern int kdb_bt(int, const char **); /* KDB display back trace */
@@ -197,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
/* Miscellaneous functions and data areas */
extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
@@ -210,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
-extern char *kdb_getstr(char *, size_t, char *);
+extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
/* Defines for kdb_symbol_print */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
}
/*
- * Start accounting for a delay statistic using
- * its starting timestamp (@start)
+ * Finish delay accounting for a statistic using its start timestamp (@start),
+ * accumulator (@total) and counter (@count)
*/
-
-static inline void delayacct_start(struct timespec *start)
+static void delayacct_end(u64 *start, u64 *total, u32 *count)
{
- do_posix_clock_monotonic_gettime(start);
-}
-
-/*
- * Finish delay accounting for a statistic using
- * its timestamps (@start, @end), accumalator (@total) and @count
- */
-
-static void delayacct_end(struct timespec *start, struct timespec *end,
- u64 *total, u32 *count)
-{
- struct timespec ts;
- s64 ns;
+ s64 ns = ktime_get_ns() - *start;
unsigned long flags;
- do_posix_clock_monotonic_gettime(end);
- ts = timespec_sub(*end, *start);
- ns = timespec_to_ns(&ts);
- if (ns < 0)
- return;
-
- spin_lock_irqsave(&current->delays->lock, flags);
- *total += ns;
- (*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ if (ns > 0) {
+ spin_lock_irqsave(&current->delays->lock, flags);
+ *total += ns;
+ (*count)++;
+ spin_unlock_irqrestore(&current->delays->lock, flags);
+ }
}
void __delayacct_blkio_start(void)
{
- delayacct_start(&current->delays->blkio_start);
+ current->delays->blkio_start = ktime_get_ns();
}
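The conversion above reduces delay accounting to a bare u64 nanosecond pair. The same idiom in miniature, as a userspace sketch (clock_gettime() standing in for ktime_get_ns()):

#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)            /* stand-in for ktime_get_ns() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

struct delays { uint64_t start, total; uint32_t count; };

static void delay_start(struct delays *d)
{
	d->start = now_ns();            /* one stamp, no struct timespec */
}

static void delay_end(struct delays *d)
{
	int64_t ns = (int64_t)(now_ns() - d->start);

	if (ns > 0) {                   /* mirrors the ns > 0 guard above */
		d->total += ns;
		d->count++;
	}
}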
void __delayacct_blkio_end(void)
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
/* Swapin block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->swapin_delay,
&current->delays->swapin_count);
else /* Other block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->blkio_delay,
&current->delays->blkio_count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- s64 tmp;
- unsigned long t1;
- unsigned long long t2, t3;
- unsigned long flags;
- struct timespec ts;
cputime_t utime, stime, stimescaled, utimescaled;
+ unsigned long long t2, t3;
+ unsigned long flags, t1;
+ s64 tmp;
- tmp = (s64)d->cpu_run_real_total;
task_cputime(tsk, &utime, &stime);
- cputime_to_timespec(utime + stime, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_run_real_total;
+ tmp += cputime_to_nsecs(utime + stime);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
- tmp = (s64)d->cpu_scaled_run_real_total;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
- cputime_to_timespec(utimescaled + stimescaled, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ tmp += cputime_to_nsecs(utimescaled + stimescaled);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
void __delayacct_freepages_start(void)
{
- delayacct_start(&current->delays->freepages_start);
+ current->delays->freepages_start = ktime_get_ns();
}
void __delayacct_freepages_end(void)
{
delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_end,
&current->delays->freepages_delay,
&current->delays->freepages_count);
}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = -pg
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df8fbfe..d659487254d5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@ static void release_callchain_buffers(void)
struct callchain_cpus_entries *entries;
entries = callchain_cpus_entries;
- rcu_assign_pointer(callchain_cpus_entries, NULL);
+ RCU_INIT_POINTER(callchain_cpus_entries, NULL);
call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}
@@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
int cpu;
struct callchain_cpus_entries *entries;
- *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
if (*rctx == -1)
return NULL;
@@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
static void
put_callchain_entry(int rctx)
{
- put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+ put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6b17ac1b0c2a..81aa3a4ece9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,18 +34,23 @@
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
-#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/mman.h>
+#include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include "internal.h"
#include <asm/irq_regs.h>
+static struct workqueue_struct *perf_wq;
+
struct remote_function_call {
struct task_struct *p;
int (*func)(void *info);
@@ -119,6 +124,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
return data.ret;
}
+#define EVENT_OWNER_KERNEL ((void *) -1)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+ return event->owner == EVENT_OWNER_KERNEL;
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -143,7 +155,7 @@ enum event_type_t {
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -239,7 +251,7 @@ static void perf_duration_warn(struct irq_work *w)
u64 avg_local_sample_len;
u64 local_samples_len;
- local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len = __this_cpu_read(running_sample_length);
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
printk_ratelimited(KERN_WARNING
@@ -261,10 +273,10 @@ void perf_sample_event_took(u64 sample_len_ns)
return;
/* decay the counter by 1 average sample */
- local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len = __this_cpu_read(running_sample_length);
local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
local_samples_len += sample_len_ns;
- __get_cpu_var(running_sample_length) = local_samples_len;
+ __this_cpu_write(running_sample_length, local_samples_len);
/*
 * note: this will be biased artificially low until we have
@@ -317,6 +329,11 @@ static inline u64 perf_clock(void)
return local_clock();
}
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+ return event->clock();
+}
+
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
@@ -341,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
- u64 time;
- u64 timestamp;
-};
-
-struct perf_cgroup {
- struct cgroup_subsys_state css;
- struct perf_cgroup_info __percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
- return container_of(task_css(task, perf_event_cgrp_id),
- struct perf_cgroup, css);
-}
-
static inline bool
perf_cgroup_match(struct perf_event *event)
{
@@ -391,14 +382,9 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline void perf_put_cgroup(struct perf_event *event)
-{
- css_put(&event->cgrp->css);
-}
-
static inline void perf_detach_cgroup(struct perf_event *event)
{
- perf_put_cgroup(event);
+ css_put(&event->cgrp->css);
event->cgrp = NULL;
}
@@ -609,7 +595,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_dentry,
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
@@ -867,22 +853,32 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, rotation_list);
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx_{activate,deactivate}() are called with IRQs
+ * disabled, while perf_event_task_tick is called from IRQ context.
*/
-static void perf_pmu_rotate_start(struct pmu *pmu)
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- struct list_head *head = &__get_cpu_var(rotation_list);
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
- list_add(&cpuctx->rotation_list, head);
+ WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+ list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+ WARN_ON(!irqs_disabled());
+
+ WARN_ON(list_empty(&ctx->active_ctx_list));
+
+ list_del_init(&ctx->active_ctx_list);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -890,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_event_context *ctx;
+
+ ctx = container_of(head, struct perf_event_context, rcu_head);
+ kfree(ctx->task_ctx_data);
+ kfree(ctx);
+}
+
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
@@ -897,17 +902,105 @@ static void put_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
- kfree_rcu(ctx, rcu_head);
+ call_rcu(&ctx->rcu_head, free_ctx);
}
}
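The switch from kfree_rcu() to an explicit call_rcu() callback is forced by the new task_ctx_data member: kfree_rcu() can free only the context itself, whereas free_ctx() above releases both allocations once the grace period has elapsed.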
-static void unclone_ctx(struct perf_event_context *ctx)
+/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
+ * perf_pmu_migrate_context() we need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests and that is put_event(). That nesting,
+ * however, is a parent<->child context relation, and migration does not
+ * affect children, so these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
+ * concerned with cpuctx and that doesn't have children.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ * perf_remove_from_context();
+ * synchronize_rcu();
+ * perf_install_in_context();
+ *
+ * to effect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However, because event->ctx can change while we're waiting to acquire
+ * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ * task_struct::perf_event_mutex
+ * perf_event_context::mutex
+ * perf_event_context::lock
+ * perf_event::child_mutex;
+ * perf_event::mmap_mutex
+ * mmap_sem
+ */
+static struct perf_event_context *
+perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
- if (ctx->parent_ctx) {
- put_ctx(ctx->parent_ctx);
- ctx->parent_ctx = NULL;
+ struct perf_event_context *ctx;
+
+again:
+ rcu_read_lock();
+ ctx = ACCESS_ONCE(event->ctx);
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
}
+ rcu_read_unlock();
+
+ mutex_lock_nested(&ctx->mutex, nesting);
+ if (event->ctx != ctx) {
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+
+ return ctx;
+}
+
+static inline struct perf_event_context *
+perf_event_ctx_lock(struct perf_event *event)
+{
+ return perf_event_ctx_lock_nested(event, 0);
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+}
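The retry loop in perf_event_ctx_lock_nested() is the classic lock-through-an-unstable-pointer idiom: pin, lock, revalidate. A reduced userspace sketch of the same shape (the RCU read side that makes the initial load safe is elided):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcount;
	pthread_mutex_t mutex;
};

static bool get_ref_unless_zero(struct obj *o) /* ~atomic_inc_not_zero() */
{
	int r = atomic_load(&o->refcount);

	while (r != 0)
		if (atomic_compare_exchange_weak(&o->refcount, &r, r + 1))
			return true;
	return false;
}

/* Lock whatever *slot currently names, rechecking after the lock is
 * held, just as perf_event_ctx_lock_nested() does with event->ctx. */
static struct obj *lock_current(struct obj *_Atomic *slot)
{
	struct obj *o;

	for (;;) {
		o = atomic_load(slot);
		if (!get_ref_unless_zero(o))     /* object dying: retry */
			continue;
		pthread_mutex_lock(&o->mutex);
		if (atomic_load(slot) == o)      /* still current: done */
			return o;
		pthread_mutex_unlock(&o->mutex); /* moved under us: retry */
		atomic_fetch_sub(&o->refcount, 1);
	}
}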
+
+/*
+ * This must be done under the ctx->lock, such as to serialize against
+ * context_equiv(), therefore we cannot call put_ctx() since that might end up
+ * calling scheduler related locks and ctx->lock nests inside those.
+ */
+static __must_check struct perf_event_context *
+unclone_ctx(struct perf_event_context *ctx)
+{
+ struct perf_event_context *parent_ctx = ctx->parent_ctx;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (parent_ctx)
+ ctx->parent_ctx = NULL;
ctx->generation++;
+
+ return parent_ctx;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,12 +1229,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
- if (has_branch_stack(event))
- ctx->nr_branch_stack++;
-
list_add_rcu(&event->event_entry, &ctx->event_list);
- if (!ctx->nr_events)
- perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1260,6 +1348,8 @@ static void perf_group_attach(struct perf_event *event)
if (group_leader == event)
return;
+ WARN_ON_ONCE(group_leader->ctx != event->ctx);
+
if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
!is_software_event(event))
group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1281,6 +1371,10 @@ static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1301,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
- if (has_branch_stack(event))
- ctx->nr_branch_stack--;
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1365,6 +1456,8 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
+
+ WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
@@ -1374,6 +1467,45 @@ out:
perf_event__header_size(tmp);
}
+/*
+ * User event without the task.
+ */
+static bool is_orphaned_event(struct perf_event *event)
+{
+ return event && !is_kernel_event(event) && !event->owner;
+}
+
+/*
+ * Event has a parent but parent's task finished and it's
+ * alive only because of children holding refference.
+ */
+static bool is_orphaned_child(struct perf_event *event)
+{
+ return is_orphaned_event(event->parent);
+}
+
+static void orphans_remove_work(struct work_struct *work);
+
+static void schedule_orphans_remove(struct perf_event_context *ctx)
+{
+ if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+ return;
+
+ if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
+ get_ctx(ctx);
+ ctx->orphans_remove_sched = true;
+ }
+}
+
+static int __init perf_workqueue_init(void)
+{
+ perf_wq = create_singlethread_workqueue("perf");
+ WARN(!perf_wq, "failed to create perf workqueue\n");
+ return perf_wq ? 0 : -1;
+}
+
+core_initcall(perf_workqueue_init);
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -1388,6 +1520,10 @@ event_sched_out(struct perf_event *event,
{
u64 tstamp = perf_event_time(event);
u64 delta;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* An event which could not be activated because of
* filter mismatch still needs to have its timings
@@ -1417,12 +1553,16 @@ event_sched_out(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu--;
- ctx->nr_active--;
+ if (!--ctx->nr_active)
+ perf_event_ctx_deactivate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
perf_pmu_enable(event->pmu);
}
@@ -1505,8 +1645,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
if (!task) {
/*
- * Per cpu events are removed via an smp call and
- * the removal is always successful.
+ * Per cpu events are removed via an smp call. The removal can
+ * fail if the CPU is currently offline, but in that case we
+ * already called __perf_remove_from_context from
+ * perf_event_exit_cpu.
*/
cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
@@ -1523,6 +1665,11 @@ retry:
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -1590,7 +1737,7 @@ int __perf_event_disable(void *info)
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -1631,6 +1778,19 @@ retry:
}
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * Strictly speaking kernel users cannot create groups and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_disable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_disable);
static void perf_set_shadow_time(struct perf_event *event,
@@ -1671,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
static int
event_sched_in(struct perf_event *event,
@@ -1705,6 +1866,12 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
+ perf_set_shadow_time(event, ctx, tstamp);
+
+ perf_log_itrace_start(event);
+
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
@@ -1712,19 +1879,19 @@ event_sched_in(struct perf_event *event,
goto out;
}
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
- perf_set_shadow_time(event, ctx, tstamp);
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
- ctx->nr_active++;
+ if (!ctx->nr_active++)
+ perf_event_ctx_activate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
cpuctx->exclusive = 1;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
out:
perf_pmu_enable(event->pmu);
@@ -1966,6 +2133,11 @@ retry:
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -2086,7 +2258,7 @@ unlock:
* perf_event_for_each_child or perf_event_for_each as described
* for perf_event_disable.
*/
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -2142,9 +2314,21 @@ retry:
out:
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * See perf_event_disable();
+ */
+void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_enable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_enable);
-int perf_event_refresh(struct perf_event *event, int refresh)
+static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
* not supported on inherited events
@@ -2153,10 +2337,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
return -EINVAL;
atomic_add(refresh, &event->event_limit);
- perf_event_enable(event);
+ _perf_event_enable(event);
return 0;
}
+
+/*
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_refresh(event, refresh);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(perf_event_refresh);
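This completes a consistent split: each externally visible entry point (perf_event_enable(), perf_event_disable(), perf_event_refresh(), and further down perf_read() and perf_ioctl()) becomes a thin wrapper that takes perf_event_ctx_lock() and calls an underscore-prefixed worker, so the workers may assume ctx->mutex is held and call one another without re-locking.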
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2199,6 +2398,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
+ lockdep_assert_held(&ctx1->lock);
+ lockdep_assert_held(&ctx2->lock);
+
/* Pinning disables the swap optimization */
if (ctx1->pin_count || ctx2->pin_count)
return 0;
@@ -2320,7 +2522,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
- if (!parent || !next_parent)
+ if (!parent && !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2344,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next->perf_event_ctxp[ctxn] = ctx;
ctx->task = next;
next_ctx->task = task;
+
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
@@ -2362,6 +2567,56 @@ unlock:
}
}
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+ this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+ this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (prev == next)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->sched_task) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
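A PMU driver opts in by bracketing its hardware state with perf_sched_cb_inc()/perf_sched_cb_dec() and supplying a sched_task method. A minimal sketch with invented mypmu_* names:

/* Sketch only: a PMU that needs per-task flush/restore on switch. */
static void mypmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	/* e.g. flush hardware buffers on sched-out, reprogram on sched-in */
}

static int mypmu_add(struct perf_event *event, int flags)
{
	perf_sched_cb_inc(event->pmu);  /* arm the callback while needed */
	return 0;
}

static void mypmu_del(struct perf_event *event, int flags)
{
	perf_sched_cb_dec(event->pmu);
}

static struct pmu mypmu = {
	.add        = mypmu_add,
	.del        = mypmu_del,
	.sched_task = mypmu_sched_task,
	/* .event_init etc. omitted from the sketch */
};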
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
@@ -2381,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
@@ -2389,7 +2647,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_out(task, next);
}
@@ -2537,70 +2795,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
-
- /*
- * Since these rotations are per-cpu, we need to ensure the
- * cpu-context we got scheduled on is actually rotating.
- */
- perf_pmu_rotate_start(ctx->pmu);
-}
-
-/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- /* no need to flush branch stack if not changing task */
- if (prev == task)
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- /*
- * check if the context has at least one
- * event using PERF_SAMPLE_BRANCH_STACK
- */
- if (cpuctx->ctx.nr_branch_stack > 0
- && pmu->flush_branch_stack) {
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
- perf_pmu_disable(pmu);
-
- pmu->flush_branch_stack();
-
- perf_pmu_enable(pmu);
-
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
- }
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
}
/*
@@ -2632,12 +2826,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
* to check if we have to switch in PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
- /* check for system-wide branch_stack events */
- if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
- perf_branch_stack_sched_in(prev, task);
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2830,25 +3023,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
list_rotate_left(&ctx->flexible_groups);
}
-/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
- */
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
- int rotate = 0, remove = 1;
+ int rotate = 0;
if (cpuctx->ctx.nr_events) {
- remove = 0;
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
rotate = 1;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
- remove = 0;
if (ctx->nr_events != ctx->nr_active)
rotate = 1;
}
@@ -2872,8 +3058,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:
- if (remove)
- list_del_init(&cpuctx->rotation_list);
return rotate;
}
@@ -2891,9 +3075,8 @@ bool perf_event_can_stop_tick(void)
void perf_event_task_tick(void)
{
- struct list_head *head = &__get_cpu_var(rotation_list);
- struct perf_cpu_context *cpuctx, *tmp;
- struct perf_event_context *ctx;
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
+ struct perf_event_context *ctx, *tmp;
int throttled;
WARN_ON(!irqs_disabled());
@@ -2901,14 +3084,8 @@ void perf_event_task_tick(void)
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
- list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
- ctx = &cpuctx->ctx;
+ list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- ctx = cpuctx->task_ctx;
- if (ctx)
- perf_adjust_freq_unthr_context(ctx, throttled);
- }
}
static int event_enable_on_exec(struct perf_event *event,
@@ -2932,6 +3109,7 @@ static int event_enable_on_exec(struct perf_event *event,
*/
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
@@ -2963,7 +3141,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
* Unclone this context if we enabled any event.
*/
if (enabled)
- unclone_ctx(ctx);
+ clone_ctx = unclone_ctx(ctx);
raw_spin_unlock(&ctx->lock);
@@ -2973,6 +3151,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
}
void perf_event_exec(void)
@@ -3023,7 +3204,10 @@ static void __perf_event_read(void *info)
static inline u64 perf_event_count(struct perf_event *event)
{
- return local64_read(&event->count) + atomic64_read(&event->child_count);
+ if (event->pmu->count)
+ return event->pmu->count(event);
+
+ return __perf_event_count(event);
}
static u64 perf_event_read(struct perf_event *event)
@@ -3063,10 +3247,12 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
+ INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3122,12 +3308,15 @@ errout:
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task,
+ struct perf_event *event)
{
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
+ void *task_ctx_data = NULL;
unsigned long flags;
int ctxn, err;
+ int cpu = event->cpu;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -3155,18 +3344,39 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
if (ctxn < 0)
goto errout;
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+ if (!task_ctx_data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ }
+
retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
- unclone_ctx(ctx);
+ clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
+
+ if (task_ctx_data && !ctx->task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
} else {
ctx = alloc_perf_context(pmu, task);
err = -ENOMEM;
if (!ctx)
goto errout;
+ if (task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
+
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -3193,13 +3403,16 @@ retry:
}
}
+ kfree(task_ctx_data);
return ctx;
errout:
+ kfree(task_ctx_data);
return ERR_PTR(err);
}
static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
@@ -3209,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
+ perf_event_free_bpf_prog(event);
kfree(event);
}
-static void ring_buffer_put(struct ring_buffer *rb);
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
@@ -3221,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;
- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
@@ -3252,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
unaccount_event_cpu(event, event->cpu);
}
+/*
+ * The following implement mutual exclusion of events on "exclusive" pmus
+ * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
+ * at a time, so we disallow creating events that might conflict, namely:
+ *
+ * 1) cpu-wide events in the presence of per-task events,
+ * 2) per-task events in the presence of cpu-wide events,
+ * 3) two matching events on the same context.
+ *
+ * The former two cases are handled in the allocation path (perf_event_alloc(),
+ * __free_event()), the latter -- before the first perf_install_in_context().
+ */
+static int exclusive_event_init(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return 0;
+
+ /*
+ * Prevent co-existence of per-task and cpu-wide events on the
+ * same exclusive pmu.
+ *
+ * Negative pmu::exclusive_cnt means there are cpu-wide
+ * events on this "exclusive" pmu, positive means there are
+ * per-task events.
+ *
+ * Since this is called in the perf_event_alloc() path, event::ctx
+ * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
+ * to mean "per-task event", because unlike other attach states it
+ * never gets cleared.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
+ return -EBUSY;
+ } else {
+ if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void exclusive_event_destroy(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return;
+
+ /* see comment in exclusive_event_init() */
+ if (event->attach_state & PERF_ATTACH_TASK)
+ atomic_dec(&pmu->exclusive_cnt);
+ else
+ atomic_inc(&pmu->exclusive_cnt);
+}
+
+static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+{
+ if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ (e1->cpu == e2->cpu ||
+ e1->cpu == -1 ||
+ e2->cpu == -1))
+ return true;
+ return false;
+}
+
+/* Called under the same ctx::mutex as perf_install_in_context() */
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *iter_event;
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return true;
+
+ list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+ if (exclusive_event_match(iter_event, event))
+ return false;
+ }
+
+ return true;
+}
+
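The signed exclusive_cnt deserves a worked example; the same semantics as a standalone sketch (toy code, not the kernel atomics):

/* pmu->exclusive_cnt as a toy:
 *   > 0  that many per-task events exist,
 *   < 0  that many cpu-wide events exist (negated),
 *   == 0 neither kind yet; either side may claim the pmu.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_unless_negative(atomic_int *v)
{
	int c = atomic_load(v);

	while (c >= 0)
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	return false;
}

static bool dec_unless_positive(atomic_int *v)
{
	int c = atomic_load(v);

	while (c <= 0)
		if (atomic_compare_exchange_weak(v, &c, c - 1))
			return true;
	return false;
}

int main(void)
{
	atomic_int cnt = 0;

	printf("%d\n", inc_unless_negative(&cnt)); /* per-task #1: 1 */
	printf("%d\n", inc_unless_negative(&cnt)); /* per-task #2: 1 */
	printf("%d\n", dec_unless_positive(&cnt)); /* cpu-wide: 0, refused */
	return 0;
}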
static void __free_event(struct perf_event *event)
{
if (!event->parent) {
@@ -3265,8 +3559,10 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
+ if (event->pmu) {
+ exclusive_event_destroy(event);
module_put(event->pmu->module);
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -3312,16 +3608,12 @@ static void free_event(struct perf_event *event)
}
/*
- * Called when the last reference to the file is gone.
+ * Remove a user event from its owner task.
*/
-static void put_event(struct perf_event *event)
+static void perf_remove_from_owner(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
- if (!atomic_long_dec_and_test(&event->refcount))
- return;
-
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
/*
@@ -3342,7 +3634,16 @@ static void put_event(struct perf_event *event)
rcu_read_unlock();
if (owner) {
- mutex_lock(&owner->perf_event_mutex);
+ /*
+ * If we're here through perf_event_exit_task() we're already
+ * holding ctx->mutex which would be an inversion wrt. the
+ * normal lock order.
+ *
+	 * However, we can safely take this lock because it's the child
+ * ctx->mutex.
+ */
+ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
/*
* We have to re-check the event->owner field, if it is cleared
* we raced with perf_event_exit_task(), acquiring the mutex
@@ -3354,8 +3655,21 @@ static void put_event(struct perf_event *event)
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static void put_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
- WARN_ON_ONCE(ctx->parent_ctx);
/*
* There are two ways this annotation is useful:
*
@@ -3368,9 +3682,10 @@ static void put_event(struct perf_event *event)
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
*/
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
+ WARN_ON_ONCE(ctx->parent_ctx);
perf_remove_from_context(event, true);
- mutex_unlock(&ctx->mutex);
+ perf_event_ctx_unlock(event, ctx);
_free_event(event);
}
@@ -3388,6 +3703,42 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
+/*
+ * Remove all orphaned events from the context.
+ */
+static void orphans_remove_work(struct work_struct *work)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+
+ ctx = container_of(work, struct perf_event_context,
+ orphans_remove.work);
+
+ mutex_lock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ struct perf_event *parent_event = event->parent;
+
+ if (!is_orphaned_child(event))
+ continue;
+
+ perf_remove_from_context(event, true);
+
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ free_event(event);
+ put_event(parent_event);
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ ctx->orphans_remove_sched = false;
+ raw_spin_unlock_irq(&ctx->lock);
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -3418,12 +3769,13 @@ static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
- int n = 0, size = 0, ret = -EFAULT;
struct perf_event_context *ctx = leader->ctx;
- u64 values[5];
+ int n = 0, size = 0, ret;
u64 count, enabled, running;
+ u64 values[5];
+
+ lockdep_assert_held(&ctx->mutex);
- mutex_lock(&ctx->mutex);
count = perf_event_read_value(leader, &enabled, &running);
values[n++] = 1 + leader->nr_siblings;
@@ -3438,7 +3790,7 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf, values, size))
- goto unlock;
+ return -EFAULT;
ret = size;
@@ -3452,14 +3804,11 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf + ret, values, size)) {
- ret = -EFAULT;
- goto unlock;
+ return -EFAULT;
}
ret += size;
}
-unlock:
- mutex_unlock(&ctx->mutex);
return ret;
}
@@ -3485,6 +3834,19 @@ static int perf_event_read_one(struct perf_event *event,
return n * sizeof(u64);
}
+static bool is_event_hup(struct perf_event *event)
+{
+ bool no_children;
+
+ if (event->state != PERF_EVENT_STATE_EXIT)
+ return false;
+
+ mutex_lock(&event->child_mutex);
+ no_children = list_empty(&event->child_list);
+ mutex_unlock(&event->child_mutex);
+ return no_children;
+}
+
/*
* Read the performance event - simple non blocking version for now
*/
@@ -3518,15 +3880,26 @@ static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = perf_read_hw(event, buf, count);
+ perf_event_ctx_unlock(event, ctx);
- return perf_read_hw(event, buf, count);
+ return ret;
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
- unsigned int events = POLL_HUP;
+ unsigned int events = POLLHUP;
+
+ poll_wait(file, &event->waitq, wait);
+
+ if (is_event_hup(event))
+ return events;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3537,13 +3910,10 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
if (rb)
events = atomic_xchg(&rb->poll, 0);
mutex_unlock(&event->mmap_mutex);
-
- poll_wait(file, &event->waitq, wait);
-
return events;
}
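Two fixes hide in this hunk: POLL_HUP, which is actually a SIGPOLL si_code and not a poll(2) bit, becomes the intended POLLHUP, and poll_wait() now runs before any early return so the file is always added to the waitqueue even when the event has already hung up.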
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
local64_set(&event->count, 0);
@@ -3562,6 +3932,7 @@ static void perf_event_for_each_child(struct perf_event *event,
struct perf_event *child;
WARN_ON_ONCE(event->ctx->parent_ctx);
+
mutex_lock(&event->child_mutex);
func(event);
list_for_each_entry(child, &event->child_list, child_list)
@@ -3575,14 +3946,13 @@ static void perf_event_for_each(struct perf_event *event,
struct perf_event_context *ctx = event->ctx;
struct perf_event *sibling;
- WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
+ lockdep_assert_held(&ctx->mutex);
+
event = event->group_leader;
perf_event_for_each_child(event, func);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
perf_event_for_each_child(sibling, func);
- mutex_unlock(&ctx->mutex);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3651,26 +4021,26 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
- struct perf_event *event = file->private_data;
void (*func)(struct perf_event *);
u32 flags = arg;
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
- func = perf_event_enable;
+ func = _perf_event_enable;
break;
case PERF_EVENT_IOC_DISABLE:
- func = perf_event_disable;
+ func = _perf_event_disable;
break;
case PERF_EVENT_IOC_RESET:
- func = perf_event_reset;
+ func = _perf_event_reset;
break;
case PERF_EVENT_IOC_REFRESH:
- return perf_event_refresh(event, arg);
+ return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
@@ -3705,6 +4075,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
+ case PERF_EVENT_IOC_SET_BPF:
+ return perf_event_set_bpf_prog(event, arg);
+
default:
return -ENOTTY;
}
@@ -3717,13 +4090,50 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ long ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_ioctl(event, cmd, arg);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+ case _IOC_NR(PERF_EVENT_IOC_ID):
+ /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
+ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+ cmd &= ~IOCSIZE_MASK;
+ cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+ }
+ break;
+ }
+ return perf_ioctl(file, cmd, arg);
+}
+#else
+# define perf_compat_ioctl NULL
+#endif
+
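For illustration, the same size-field rewrite can be exercised from userspace; a minimal sketch, assuming a 64-bit host and the asm-generic ioctl encoding (fix_pointer_size() is a hypothetical name):

	/*
	 * A 32-bit caller encodes PERF_EVENT_IOC_SET_FILTER with a 4-byte
	 * pointer argument; on a 64-bit kernel the switch in _perf_ioctl()
	 * only matches the 8-byte encoding, hence the rewrite.
	 */
	#include <stdio.h>
	#include <linux/ioctl.h>

	static unsigned int fix_pointer_size(unsigned int cmd)
	{
		if (_IOC_SIZE(cmd) == 4) {		/* sizeof(compat_uptr_t) */
			cmd &= ~IOCSIZE_MASK;		/* clear the size bits */
			cmd |= sizeof(void *) << IOCSIZE_SHIFT;	/* re-encode as 8 */
		}
		return cmd;
	}

	int main(void)
	{
		/* PERF_EVENT_IOC_SET_FILTER ('$', nr 6) as a 32-bit task encodes it */
		unsigned int cmd32 = _IOC(_IOC_WRITE, '$', 6, 4);

		printf("%#x -> %#x\n", cmd32, fix_pointer_size(cmd32));
		return 0;
	}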
int perf_event_task_enable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_enable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_enable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -3731,11 +4141,15 @@ int perf_event_task_enable(void)
int perf_event_task_disable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_disable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_disable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -3780,12 +4194,15 @@ static void perf_event_init_userpage(struct perf_event *event)
/* Allow new userspace to detect that bit 0 is deprecated */
userpg->cap_bit0_is_deprecated = 1;
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+ userpg->data_offset = PAGE_SIZE;
+ userpg->data_size = perf_data_size(rb);
unlock:
rcu_read_unlock();
}
-void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(
+ struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3835,7 +4252,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
- arch_perf_update_userpage(userpg, now);
+ arch_perf_update_userpage(event, userpg, now);
barrier();
++userpg->lock;
@@ -3946,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
rb_free(rb);
}
-static struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct ring_buffer *ring_buffer_get(struct perf_event *event)
{
struct ring_buffer *rb;
@@ -3961,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
{
if (!atomic_dec_and_test(&rb->refcount))
return;
@@ -3977,6 +4394,12 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (vma->vm_pgoff)
+ atomic_inc(&event->rb->aux_mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}
/*
@@ -3996,6 +4419,23 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
+ /*
+ * rb->aux_mmap_count will always drop before rb->mmap_count and
+ * event->mmap_count, so it is ok to use event->mmap_mutex to
+ * serialize with perf_mmap here.
+ */
+ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+ rb_free_aux(rb);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4069,7 +4509,7 @@ out_put:
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close,
+ .close = perf_mmap_close, /* non-mergeable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
};
@@ -4080,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
- struct ring_buffer *rb;
+ struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
- long user_extra, extra;
+ long user_extra = 0, extra = 0;
int ret = 0, flags = 0;
/*
@@ -4098,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
- nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ if (vma->vm_pgoff == 0) {
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+ } else {
+ /*
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped; all subsequent mappings must have the same size
+ * and offset, and must sit above the normal perf buffer.
+ */
+ u64 aux_offset, aux_size;
+
+ if (!event->rb)
+ return -EINVAL;
+
+ nr_pages = vma_size / PAGE_SIZE;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
+
+ rb = event->rb;
+ if (!rb)
+ goto aux_unlock;
+
+ aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+ aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ goto aux_unlock;
+
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ goto aux_unlock;
+
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ goto aux_unlock;
+
+ if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+ goto aux_unlock;
+
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ goto aux_unlock;
+
+ if (!is_power_of_2(nr_pages))
+ goto aux_unlock;
+
+ if (!atomic_inc_not_zero(&rb->mmap_count))
+ goto aux_unlock;
+
+ if (rb_has_aux(rb)) {
+ atomic_inc(&rb->aux_mmap_count);
+ ret = 0;
+ goto unlock;
+ }
+
+ atomic_set(&rb->aux_mmap_count, 1);
+ user_extra = nr_pages;
+
+ goto accounting;
+ }
/*
* If we have rb pages ensure they're a power-of-two number, so we
@@ -4110,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_size != PAGE_SIZE * (1 + nr_pages))
return -EINVAL;
- if (vma->vm_pgoff != 0)
- return -EINVAL;
-
WARN_ON_ONCE(event->ctx->parent_ctx);
again:
mutex_lock(&event->mmap_mutex);
@@ -4136,6 +4632,8 @@ again:
}
user_extra = nr_pages + 1;
+
+accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
@@ -4145,7 +4643,6 @@ again:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
- extra = 0;
if (user_locked > user_lock_limit)
extra = user_locked - user_lock_limit;
@@ -4159,35 +4656,46 @@ again:
goto unlock;
}
- WARN_ON(event->rb);
+ WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
-
if (!rb) {
- ret = -ENOMEM;
- goto unlock;
- }
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, flags);
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_locked = extra;
- rb->mmap_user = get_current_user();
+ if (!rb) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
- ring_buffer_attach(event, rb);
+ ring_buffer_attach(event, rb);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+ } else {
+ ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+ event->attr.aux_watermark, flags);
+ if (!ret)
+ rb->aux_mmap_locked = extra;
+ }
unlock:
- if (!ret)
+ if (!ret) {
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->pinned_vm += extra;
+
atomic_inc(&event->mmap_count);
+ } else if (rb) {
+ atomic_dec(&rb->mmap_count);
+ }
+aux_unlock:
mutex_unlock(&event->mmap_mutex);
/*
@@ -4197,6 +4705,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;
}
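The checks above amount to a small mapping protocol for the AUX area. A userspace sketch of that protocol, assuming "fd" comes from perf_event_open() on an AUX-capable PMU (map_aux() is a hypothetical helper):

	/*
	 * Map the normal buffer first, publish aux_offset/aux_size through
	 * the user page, then mmap the AUX area at exactly that offset with
	 * a power-of-two number of pages.
	 */
	#include <stddef.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/perf_event.h>

	static void *map_aux(int fd, size_t data_pages, size_t aux_pages)
	{
		size_t ps = (size_t)sysconf(_SC_PAGESIZE);
		struct perf_event_mmap_page *up;
		void *aux;

		/* user page + 2^n data pages, at offset 0 */
		up = mmap(NULL, (data_pages + 1) * ps, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
		if (up == MAP_FAILED)
			return NULL;

		/* AUX area must sit above the data area */
		up->aux_offset = (data_pages + 1) * ps;
		up->aux_size   = aux_pages * ps;

		aux = mmap(NULL, aux_pages * ps, PROT_READ | PROT_WRITE,
			   MAP_SHARED, fd, (off_t)up->aux_offset);
		return aux == MAP_FAILED ? NULL : aux;
	}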
@@ -4222,7 +4733,7 @@ static const struct file_operations perf_fops = {
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
- .compat_ioctl = perf_ioctl,
+ .compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
};
@@ -4248,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry)
{
struct perf_event *event = container_of(entry,
struct perf_event, pending);
+ int rctx;
+
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * If we 'fail' here, that's OK; it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
if (event->pending_disable) {
event->pending_disable = 0;
@@ -4258,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry)
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
}
/*
@@ -4296,22 +4817,29 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
}
-static void perf_sample_regs_user(struct perf_regs_user *regs_user,
- struct pt_regs *regs)
+static void perf_sample_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
{
- if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
+ if (user_mode(regs)) {
+ regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- regs_user->abi = perf_reg_abi(current);
+ } else if (current->mm) {
+ perf_get_regs_user(regs_user, regs, regs_user_copy);
+ } else {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
}
}
+static void perf_sample_regs_intr(struct perf_regs *regs_intr,
+ struct pt_regs *regs)
+{
+ regs_intr->regs = regs;
+ regs_intr->abi = perf_reg_abi(current);
+}
+
/*
* Get remaining task size from user stack pointer.
*
@@ -4423,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_TIME)
- data->time = perf_clock();
+ data->time = perf_event_clock(event);
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
@@ -4693,6 +5221,23 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ u64 abi = data->regs_intr.abi;
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ perf_output_sample_regs(handle,
+ data->regs_intr.regs,
+ mask);
+ }
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -4758,12 +5303,14 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ perf_sample_regs_user(&data->regs_user, regs,
+ &data->regs_user_copy);
+
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_user(&data->regs_user, regs);
-
if (data->regs_user.regs) {
u64 mask = event->attr.sample_regs_user;
size += hweight64(mask) * sizeof(u64);
@@ -4779,15 +5326,11 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- struct perf_regs_user *uregs = &data->regs_user;
u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
- if (!uregs->abi)
- perf_sample_regs_user(uregs, regs);
-
stack_size = perf_sample_ustack_size(stack_size, header->size,
- uregs->regs);
+ data->regs_user.regs);
/*
* If there is something to dump, add space for the dump
@@ -4800,6 +5343,21 @@ void perf_prepare_sample(struct perf_event_header *header,
data->stack_user_size = stack_size;
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_intr(&data->regs_intr, regs);
+
+ if (data->regs_intr.regs) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -4971,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
+ task_event->event_id.time = perf_event_clock(event);
+
perf_output_put(&handle, task_event->event_id);
perf_event__output_id_sample(event, &handle, &sample);
@@ -5004,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
- .time = perf_clock(),
+ /* .time */
},
};
@@ -5266,6 +5826,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = (char *) vma->vm_ops->name(vma);
+ if (name)
+ goto cpy_name;
+ }
+
name = (char *)arch_vma_name(vma);
if (name)
goto cpy_name;
@@ -5353,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
perf_event_mmap_event(&mmap_event);
}
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 offset;
+ u64 size;
+ u64 flags;
+ } rec = {
+ .header = {
+ .type = PERF_RECORD_AUX,
+ .misc = 0,
+ .size = sizeof(rec),
+ },
+ .offset = head,
+ .size = size,
+ .flags = flags,
+ };
+ int ret;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
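For reference when reading the output stream, a consumer-side sketch of the record this emits (illustrative; the optional trailing sample_id fields depend on the event's sample_type):

	#include <stdint.h>
	#include <linux/perf_event.h>

	struct aux_record {
		struct perf_event_header header;  /* .type == PERF_RECORD_AUX */
		uint64_t offset;                  /* position in the AUX buffer */
		uint64_t size;                    /* amount of newly written data */
		uint64_t flags;                   /* PERF_AUX_FLAG_TRUNCATED, ... */
		/* optional sample_id fields follow */
	};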
/*
* IRQ throttle logging
*/
@@ -5374,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = perf_clock(),
+ .time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
@@ -5394,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+static void perf_log_itrace_start(struct perf_event *event)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
+ event->hw.itrace_started)
+ return;
+
+ event->hw.itrace_started = 1;
+
+ rec.header.type = PERF_RECORD_ITRACE_START;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.pid = perf_event_pid(event, current);
+ rec.tid = perf_event_tid(event, current);
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -5665,7 +6303,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct perf_event *event;
struct hlist_head *head;
@@ -5682,9 +6320,11 @@ end:
rcu_read_unlock();
}
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
int perf_swevent_get_recursion_context(void)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
return get_recursion_context(swhash->recursion);
}
@@ -5692,26 +6332,35 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
inline void perf_swevent_put_recursion_context(int rctx)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
put_recursion_context(swhash->recursion, rctx);
}
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data;
- int rctx;
- preempt_disable_notrace();
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
+ if (WARN_ON_ONCE(!regs))
return;
perf_sample_data_init(&data, addr, 0);
-
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
perf_swevent_put_recursion_context(rctx);
+fail:
preempt_enable_notrace();
}
@@ -5721,7 +6370,7 @@ static void perf_swevent_read(struct perf_event *event)
static int perf_swevent_add(struct perf_event *event, int flags)
{
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
@@ -5743,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
}
hlist_add_head_rcu(&event->hlist_entry, head);
+ perf_event_update_userpage(event);
return 0;
}
@@ -5777,7 +6427,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
if (!hlist)
return;
- rcu_assign_pointer(swhash->swevent_hlist, NULL);
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
kfree_rcu(hlist, rcu_head);
}
@@ -5903,22 +6553,17 @@ static int perf_swevent_init(struct perf_event *event)
return 0;
}
-static int perf_swevent_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
#ifdef CONFIG_EVENT_TRACING
@@ -6036,8 +6681,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
static inline void perf_tp_register(void)
@@ -6068,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ /* bpf programs can only be attached to kprobes */
+ return -EINVAL;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
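Usage from userspace is a single ioctl once the program is loaded; a minimal sketch (attach_bpf() is a hypothetical helper):

	/*
	 * "prog_fd" must reference a loaded BPF_PROG_TYPE_KPROBE program and
	 * "evt_fd" a PERF_TYPE_TRACEPOINT event backed by a kprobe. The
	 * kernel takes its own reference on the program; the ioctl fails
	 * with EEXIST if one is already attached.
	 */
	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	static int attach_bpf(int evt_fd, int prog_fd)
	{
		return ioctl(evt_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
	}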
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
#else
static inline void perf_tp_register(void)
@@ -6083,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
#endif /* CONFIG_EVENT_TRACING */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6221,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
cpu_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
return 0;
}
@@ -6257,14 +6952,14 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
/*
@@ -6297,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
task_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
return 0;
}
@@ -6337,14 +7033,14 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6374,7 +7070,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}
/*
@@ -6582,12 +7278,10 @@ skip_type:
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
__perf_cpu_hrtimer_init(cpuctx, cpu);
- INIT_LIST_HEAD(&cpuctx->rotation_list);
cpuctx->unique_pmu = pmu;
}
@@ -6618,6 +7312,7 @@ got_cpu_context:
pmu->event_idx = perf_event_idx_default;
list_add_rcu(&pmu->entry, &pmus);
+ atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
@@ -6660,6 +7355,31 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx = NULL;
+ int ret;
+
+ if (!try_module_get(pmu->module))
+ return -ENODEV;
+
+ if (event->group_leader != event) {
+ ctx = perf_event_ctx_lock(event->group_leader);
+ BUG_ON(!ctx);
+ }
+
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+
+ if (ctx)
+ perf_event_ctx_unlock(event->group_leader, ctx);
+
+ if (ret)
+ module_put(pmu->module);
+
+ return ret;
+}
+
struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
@@ -6672,24 +7392,14 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (ret)
pmu = ERR_PTR(ret);
goto unlock;
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -6710,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;
- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
@@ -6752,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct perf_event *group_leader,
struct perf_event *parent_event,
perf_overflow_handler_t overflow_handler,
- void *context)
+ void *context, int cgroup_fd)
{
struct pmu *pmu;
struct perf_event *event;
@@ -6807,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) {
event->attach_state = PERF_ATTACH_TASK;
-
- if (attr->type == PERF_TYPE_TRACEPOINT)
- event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
- * hw_breakpoint is a bit difficult here..
+ * XXX pmu::event_init needs to know what task to account to
+ * and we cannot use the ctx information because we need the
+ * pmu before we get a ctx.
*/
- else if (attr->type == PERF_TYPE_BREAKPOINT)
- event->hw.bp_target = task;
-#endif
+ event->hw.target = task;
}
+ event->clock = &local_clock;
+ if (parent_event)
+ event->clock = parent_event->clock;
+
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
@@ -6845,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto err_ns;
+ if (!has_branch_stack(event))
+ event->attr.branch_sample_type = 0;
+
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_ns;
+ }
+
pmu = perf_init_event(event);
if (!pmu)
goto err_ns;
@@ -6853,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ err = exclusive_event_init(event);
+ if (err)
+ goto err_pmu;
+
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
if (err)
- goto err_pmu;
+ goto err_per_task;
}
}
return event;
+err_per_task:
+ exclusive_event_destroy(event);
+
err_pmu:
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
kfree(event);
@@ -6994,6 +7718,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = -EINVAL;
}
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ ret = perf_reg_validate(attr->sample_regs_intr);
out:
return ret;
@@ -7028,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
+ /*
+ * Mixing clocks in the same buffer is trouble you don't need.
+ */
+ if (output_event->clock != event->clock)
+ goto out;
+
+ /*
+ * If both events generate aux data, they must be on the same PMU
+ */
+ if (has_aux(event) && has_aux(output_event) &&
+ event->pmu != output_event->pmu)
+ goto out;
+
set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
@@ -7051,6 +7790,52 @@ out:
return ret;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
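The address ordering is what makes the double lock deadlock-free: any two tasks locking the same pair always acquire in the same order, so an ABBA cycle cannot form. A standalone sketch of the same discipline, using pthreads purely for illustration:

	#include <stdint.h>
	#include <pthread.h>

	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if ((uintptr_t)b < (uintptr_t)a) {	/* order by address */
			pthread_mutex_t *tmp = a;
			a = b;
			b = tmp;
		}
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);	/* nested acquisition, like SINGLE_DEPTH_NESTING */
	}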
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+ bool nmi_safe = false;
+
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ event->clock = &ktime_get_mono_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_MONOTONIC_RAW:
+ event->clock = &ktime_get_raw_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_REALTIME:
+ event->clock = &ktime_get_real_ns;
+ break;
+
+ case CLOCK_BOOTTIME:
+ event->clock = &ktime_get_boot_ns;
+ break;
+
+ case CLOCK_TAI:
+ event->clock = &ktime_get_tai_ns;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+ return -EINVAL;
+
+ return 0;
+}
+
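On the userspace side the clock is requested through the new attr fields; a minimal sketch, assuming a software event (request_monotonic() is a hypothetical helper):

	/*
	 * Clocks that are not NMI-safe (CLOCK_REALTIME, CLOCK_BOOTTIME,
	 * CLOCK_TAI) are only accepted for PMUs advertising
	 * PERF_PMU_CAP_NO_NMI, per the check above.
	 */
	#include <string.h>
	#include <time.h>
	#include <linux/perf_event.h>

	static void request_monotonic(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->size = sizeof(*attr);
		attr->type = PERF_TYPE_SOFTWARE;
		attr->config = PERF_COUNT_SW_CPU_CLOCK;
		attr->use_clockid = 1;
		attr->clockid = CLOCK_MONOTONIC;
	}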
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7066,7 +7851,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx, *uninitialized_var(gctx);
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -7075,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
int move_group = 0;
int err;
int f_flags = O_RDWR;
+ int cgroup_fd = -1;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
@@ -7140,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (flags & PERF_FLAG_PID_CGROUP)
+ cgroup_fd = pid;
+
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
- NULL, NULL);
+ NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err_cpus;
}
- if (flags & PERF_FLAG_PID_CGROUP) {
- err = perf_cgroup_connect(pid, event, &attr, group_leader);
- if (err) {
- __free_event(event);
- goto err_cpus;
- }
- }
-
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
err = -ENOTSUPP;
@@ -7170,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = event->pmu;
+ if (attr.use_clockid) {
+ err = perf_event_set_clock(event, attr.clockid);
+ if (err)
+ goto err_alloc;
+ }
+
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@@ -7196,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event->cpu);
+ ctx = find_get_context(pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
}
+ if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
+ err = -EBUSY;
+ goto err_context;
+ }
+
if (task) {
put_task_struct(task);
task = NULL;
@@ -7219,12 +8011,29 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (group_leader->group_leader != group_leader)
goto err_context;
+
+ /* All events in a group should have the same clock */
+ if (group_leader->clock != event->clock)
+ goto err_context;
+
/*
* Do not allow to attach to a group in a different
* task or CPU context:
*/
if (move_group) {
- if (group_leader->ctx->type != ctx->type)
+ /*
+ * Make sure we're both on the same task, or both
+ * per-cpu events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken, since
+ * you can never concurrently schedule them anyhow.
+ */
+ if (group_leader->cpu != event->cpu)
goto err_context;
} else {
if (group_leader->ctx != ctx)
@@ -7252,43 +8061,75 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- struct perf_event_context *gctx = group_leader->ctx;
-
- mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader, false);
+ gctx = group_leader->ctx;
/*
- * Removing from the context ends up with disabled
- * event. What we want here is event in the initial
- * startup state, ready to be add into new context.
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
*/
- perf_event__state_init(group_leader);
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ perf_remove_from_context(group_leader, false);
+
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling, false);
- perf_event__state_init(sibling);
put_ctx(gctx);
}
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
+ } else {
+ mutex_lock(&ctx->mutex);
}
WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
if (move_group) {
+ /*
+ * Wait for everybody to stop referencing the events through
+ * the old lists before installing them on the new lists.
+ */
synchronize_rcu();
- perf_install_in_context(ctx, group_leader, event->cpu);
- get_ctx(ctx);
+
+ /*
+ * Install the group siblings before the group leader.
+ *
+ * Because a group leader will try to install the entire group
+ * (through the sibling list, which is still intact), we can
+ * end up with siblings installed in the wrong context.
+ *
+ * By installing siblings first we NO-OP, because they're not
+ * reachable through the group lists.
+ */
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, event->cpu);
+ perf_event__state_init(sibling);
+ perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
+
+ /*
+ * Removing an event from a context leaves it disabled.
+ * What we want here is an event in its initial startup
+ * state, ready to be added into the new context.
+ */
+ perf_event__state_init(group_leader);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
+ get_ctx(ctx);
+ }
+
+ if (!exclusive_event_installable(event, ctx)) {
+ err = -EBUSY;
+ mutex_unlock(&ctx->mutex);
+ fput(event_file);
+ goto err_context;
}
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
+
+ if (move_group) {
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ }
mutex_unlock(&ctx->mutex);
put_online_cpus();
@@ -7354,15 +8195,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
*/
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
- overflow_handler, context);
+ overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err;
}
+ /* Mark the owner so we can distinguish it from user events. */
+ event->owner = EVENT_OWNER_KERNEL;
+
account_event(event);
- ctx = find_get_context(event->pmu, task, cpu);
+ ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_free;
@@ -7370,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (!exclusive_event_installable(event, ctx)) {
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+ err = -EBUSY;
+ goto err_free;
+ }
+
perf_install_in_context(ctx, event, cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -7393,7 +8245,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
- mutex_lock(&src_ctx->mutex);
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
perf_remove_from_context(event, false);
@@ -7401,11 +8257,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
}
- mutex_unlock(&src_ctx->mutex);
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
synchronize_rcu();
- mutex_lock(&dst_ctx->mutex);
+ /*
+ * Re-instate the events in two passes.
+ *
+ * Skip over group leaders and only install siblings on this first
+ * pass; siblings will not get enabled without a leader, but a
+ * leader will enable its siblings, even if those are still on the
+ * old context.
+ */
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ if (event->group_leader == event)
+ continue;
+
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+
+ /*
+ * Once all the siblings are set up properly, install the group
+ * leaders to make it go.
+ */
list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7415,6 +8296,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
get_ctx(dst_ctx);
}
mutex_unlock(&dst_ctx->mutex);
+ mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
@@ -7447,6 +8329,12 @@ static void sync_child_event(struct perf_event *child_event,
mutex_unlock(&parent_event->child_mutex);
/*
+ * Make sure the user/parent gets notified that we just
+ * lost one event.
+ */
+ perf_event_wakeup(parent_event);
+
+ /*
* Release the parent event, if this was the last
* reference to it.
*/
@@ -7480,13 +8368,16 @@ __perf_event_exit_task(struct perf_event *child_event,
if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
+ } else {
+ child_event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(child_event);
}
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event, *next;
- struct perf_event_context *child_ctx, *parent_ctx;
+ struct perf_event_context *child_ctx, *clone_ctx = NULL;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7513,28 +8404,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
child->perf_event_ctxp[ctxn] = NULL;
/*
- * In order to avoid freeing: child_ctx->parent_ctx->task
- * under perf_event_context::lock, grab another reference.
- */
- parent_ctx = child_ctx->parent_ctx;
- if (parent_ctx)
- get_ctx(parent_ctx);
-
- /*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
* the events from it.
*/
- unclone_ctx(child_ctx);
+ clone_ctx = unclone_ctx(child_ctx);
update_context_time(child_ctx);
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
- /*
- * Now that we no longer hold perf_event_context::lock, drop
- * our extra child_ctx->parent_ctx reference.
- */
- if (parent_ctx)
- put_ctx(parent_ctx);
+ if (clone_ctx)
+ put_ctx(clone_ctx);
/*
* Report the task dead after unscheduling the events so that we
@@ -7604,14 +8483,19 @@ static void perf_free_event(struct perf_event *event,
put_event(parent);
+ raw_spin_lock_irq(&ctx->lock);
perf_group_detach(event);
list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
free_event(event);
}
/*
- * free an unexposed, unused context as created by inheritance by
+ * Free an unexposed, unused context as created by inheritance by
* perf_event_init_task below, used by fork() in case of fail.
+ *
+ * Not all locks are strictly required, but take them anyway to be nice and
+ * help out with the lockdep assertions.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -7663,6 +8547,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
+ enum perf_event_active_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -7679,11 +8564,12 @@ inherit_event(struct perf_event *parent_event,
parent_event->cpu,
child,
group_leader, parent_event,
- NULL, NULL);
+ NULL, NULL, -1);
if (IS_ERR(child_event))
return child_event;
- if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ if (is_orphaned_event(parent_event) ||
+ !atomic_long_inc_not_zero(&parent_event->refcount)) {
free_event(child_event);
return NULL;
}
@@ -7695,7 +8581,7 @@ inherit_event(struct perf_event *parent_event,
* not its attr.disabled bit. We hold the parent's mutex,
* so we won't race with perf_event_{en, dis}able_family.
*/
- if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+ if (parent_state >= PERF_EVENT_STATE_INACTIVE)
child_event->state = PERF_EVENT_STATE_INACTIVE;
else
child_event->state = PERF_EVENT_STATE_OFF;
@@ -7804,7 +8690,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -7911,8 +8797,10 @@ int perf_event_init_task(struct task_struct *child)
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
- if (ret)
+ if (ret) {
+ perf_event_free_task(child);
return ret;
+ }
}
return 0;
@@ -7926,7 +8814,7 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+ INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
}
}
@@ -7947,22 +8835,11 @@ static void perf_event_init_cpu(int cpu)
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- WARN_ON(!irqs_disabled());
-
- list_del_init(&cpuctx->rotation_list);
-}
-
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = false };
+ struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
- perf_pmu_rotate_stop(ctx->pmu);
-
rcu_read_lock();
list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
__perf_remove_from_context(&re);
@@ -8073,6 +8950,18 @@ void __init perf_event_init(void)
!= 1024);
}
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s\n", pmu_attr->event_str);
+
+ return 0;
+}
+
static int __init perf_event_sysfs_init(void)
{
struct pmu *pmu;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.bp_target;
+ struct task_struct *tsk = bp->hw.target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.bp_target == tsk &&
+ if (iter->hw.target == tsk &&
find_slot_idx(iter) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
int nr;
nr = info->cpu_pinned;
- if (!bp->hw.bp_target)
+ if (!bp->hw.target)
nr += max_task_bp_pinned(cpu, type);
else
nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
weight = -weight;
/* Pinned counter cpu profiling */
- if (!bp->hw.bp_target) {
+ if (!bp->hw.target) {
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}
-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..9f6ce9ba4a04 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -27,6 +27,7 @@ struct ring_buffer {
local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
+ long aux_watermark;
/* poll crap */
spinlock_t event_lock;
struct list_head event_list;
@@ -35,6 +36,20 @@ struct ring_buffer {
unsigned long mmap_locked;
struct user_struct *mmap_user;
+ /* AUX area */
+ local_t aux_head;
+ local_t aux_nest;
+ local_t aux_wakeup;
+ unsigned long aux_pgoff;
+ int aux_nr_pages;
+ int aux_overwrite;
+ atomic_t aux_mmap_count;
+ unsigned long aux_mmap_locked;
+ void (*free_aux)(void *);
+ atomic_t aux_refcount;
+ void **aux_pages;
+ void *aux_priv;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb);
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+ return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags);
extern void
perf_event_header__init_id(struct perf_event_header *header,
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+ return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
static inline unsigned long \
func_name(struct perf_output_handle *handle, \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..232f00f273cb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
+#include <linux/poll.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, POLL_IN);
+ atomic_set(&handle->rb->poll, POLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
@@ -242,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
spin_lock_init(&rb->event_lock);
}
+/*
+ * This is called before hardware starts writing to the AUX area to
+ * obtain an output handle and make sure there's room in the buffer.
+ * When the capture completes, call perf_aux_output_end() to commit
+ * the recorded data to the buffer.
+ *
+ * The ordering is similar to that of perf_output_{begin,end}, with
+ * the exception of (B), which should be taken care of by the pmu
+ * driver, since ordering rules will differ depending on hardware.
+ */
+void *perf_aux_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event)
+{
+ struct perf_event *output_event = event;
+ unsigned long aux_head, aux_tail;
+ struct ring_buffer *rb;
+
+ if (output_event->parent)
+ output_event = output_event->parent;
+
+ /*
+ * Since this will typically be open across pmu::add/pmu::del, we
+ * grab ring_buffer's refcount instead of holding rcu read lock
+ * to make sure it doesn't disappear under us.
+ */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
+ return NULL;
+
+ if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ goto err;
+
+ /*
+ * Nesting is not supported for the AUX area; make sure nested
+ * writers are caught early.
+ */
+ if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ goto err_put;
+
+ aux_head = local_read(&rb->aux_head);
+
+ handle->rb = rb;
+ handle->event = event;
+ handle->head = aux_head;
+ handle->size = 0;
+
+ /*
+ * In overwrite mode, AUX data stores do not depend on aux_tail,
+ * therefore the (A) control dependency barrier does not exist. The
+ * (B) <-> (C) ordering is still observed by the pmu driver.
+ */
+ if (!rb->aux_overwrite) {
+ aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ if (aux_head - aux_tail < perf_aux_size(rb))
+ handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
+
+ /*
+ * handle->size computation depends on aux_tail load; this forms a
+ * control dependency barrier separating aux_tail load from aux data
+ * store that will be enabled on successful return
+ */
+ if (!handle->size) { /* A, matches D */
+ event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ local_set(&rb->aux_nest, 0);
+ goto err_put;
+ }
+ }
+
+ return handle->rb->aux_priv;
+
+err_put:
+ rb_free_aux(rb);
+
+err:
+ ring_buffer_put(rb);
+ handle->event = NULL;
+
+ return NULL;
+}
+
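The handle->size computation above applies CIRC_SPACE() to free-running byte counters. A standalone worked example (the macro body is copied from linux/circ_buf.h; one byte is always kept free to distinguish full from empty):

	#include <stdio.h>

	#define CIRC_SPACE(head, tail, size) \
		(((tail) - ((head) + 1)) & ((size) - 1))

	int main(void)
	{
		/* 64 KiB AUX buffer; 58000 bytes in flight */
		unsigned long head = 70000, tail = 12000, size = 65536;

		printf("space = %lu\n", CIRC_SPACE(head, tail, size));	/* 7535 */
		return 0;
	}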
+/*
+ * Commit the data written by hardware into the ring buffer by adjusting
+ * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
+ * pmu driver's responsibility to observe ordering rules of the hardware,
+ * so that all the data is externally visible before this is called.
+ */
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+ bool truncated)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+ u64 flags = 0;
+
+ if (truncated)
+ flags |= PERF_AUX_FLAG_TRUNCATED;
+
+ /* in overwrite mode, driver provides aux_head via handle */
+ if (rb->aux_overwrite) {
+ flags |= PERF_AUX_FLAG_OVERWRITE;
+
+ aux_head = handle->head;
+ local_set(&rb->aux_head, aux_head);
+ } else {
+ aux_head = local_read(&rb->aux_head);
+ local_add(size, &rb->aux_head);
+ }
+
+ if (size || flags) {
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ */
+
+ perf_event_aux_event(handle->event, aux_head, size, flags);
+ }
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ }
+ handle->event = NULL;
+
+ local_set(&rb->aux_nest, 0);
+ rb_free_aux(rb);
+ ring_buffer_put(rb);
+}
+
+/*
+ * Skip over a given number of bytes in the AUX buffer, for example due
+ * to the hardware's alignment constraints.
+ */
+int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+
+ if (size > handle->size)
+ return -ENOSPC;
+
+ local_add(size, &rb->aux_head);
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ handle->wakeup = local_read(&rb->aux_wakeup) +
+ rb->aux_watermark;
+ }
+
+ handle->head = aux_head;
+ handle->size -= size;
+
+ return 0;
+}
+
+void *perf_get_aux(struct perf_output_handle *handle)
+{
+ /* this is only valid between perf_aux_output_begin and *_end */
+ if (!handle->event)
+ return NULL;
+
+ return handle->rb->aux_priv;
+}
+
+#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
+
+static struct page *rb_alloc_aux_page(int node, int order)
+{
+ struct page *page;
+
+ if (order > MAX_ORDER)
+ order = MAX_ORDER;
+
+ do {
+ page = alloc_pages_node(node, PERF_AUX_GFP, order);
+ } while (!page && order--);
+
+ if (page && order) {
+ /*
+ * Communicate the allocation size to the driver
+ */
+ split_page(page, order);
+ SetPagePrivate(page);
+ set_page_private(page, order);
+ }
+
+ return page;
+}
+
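The order stashed via set_page_private() is how a PMU driver later recovers the chunk geometry when walking rb->aux_pages; a sketch of that consumer side (aux_chunk_size() is a hypothetical helper, not in the patch):

	static unsigned long aux_chunk_size(struct page *page)
	{
		/* a non-zero order was recorded in rb_alloc_aux_page() above */
		if (PagePrivate(page))
			return PAGE_SIZE << page_private(page);

		return PAGE_SIZE;	/* plain order-0 page */
	}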
+static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+{
+ struct page *page = virt_to_page(rb->aux_pages[idx]);
+
+ ClearPagePrivate(page);
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags)
+{
+ bool overwrite = !(flags & RING_BUFFER_WRITABLE);
+ int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+ int ret = -ENOMEM, max_order = 0;
+
+ if (!has_aux(event))
+ return -ENOTSUPP;
+
+ if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
+
+ /*
+ * The PMU requests more than one contiguous chunk of memory
+ * for SW double buffering.
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
+ !overwrite) {
+ if (!max_order)
+ return -EINVAL;
+
+ max_order--;
+ }
+ }
+
+ rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ if (!rb->aux_pages)
+ return -ENOMEM;
+
+ rb->free_aux = event->pmu->free_aux;
+ for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
+ struct page *page;
+ int last, order;
+
+ order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
+ page = rb_alloc_aux_page(node, order);
+ if (!page)
+ goto out;
+
+ for (last = rb->aux_nr_pages + (1 << page_private(page));
+ last > rb->aux_nr_pages; rb->aux_nr_pages++)
+ rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
+ }
+
+ rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ overwrite);
+ if (!rb->aux_priv)
+ goto out;
+
+ ret = 0;
+
+ /*
+ * aux_pages (and the pmu driver's private data, aux_priv) will be
+ * referenced in both the producer's and the consumer's contexts;
+ * thus we keep a refcount here to make sure either of the two can
+ * reference them safely.
+ */
+ atomic_set(&rb->aux_refcount, 1);
+
+ rb->aux_overwrite = overwrite;
+ rb->aux_watermark = watermark;
+
+ if (!rb->aux_watermark && !rb->aux_overwrite)
+ rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
+
+out:
+ if (!ret)
+ rb->aux_pgoff = pgoff;
+ else
+ rb_free_aux(rb);
+
+ return ret;
+}
+
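The fallback watermark works out to half the AUX buffer, since shifting by one bit less halves the size. A standalone sketch of the arithmetic, assuming 4 KiB pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_shift = 12, nr_pages = 64;
		unsigned long buf_size  = nr_pages << page_shift;	/* 262144 */
		unsigned long watermark = nr_pages << (page_shift - 1);	/* 131072 */

		printf("buffer=%lu watermark=%lu\n", buf_size, watermark);
		return 0;
	}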
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+}
+
+void rb_free_aux(struct ring_buffer *rb)
+{
+ if (atomic_dec_and_test(&rb->aux_refcount))
+ __rb_free_aux(rb);
+}
+
#ifndef CONFIG_PERF_USE_VMALLOC
/*
* Back perf_mmap() with regular GFP_KERNEL-0 pages.
*/
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
@@ -339,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb)
return rb->nr_pages << page_order(rb);
}
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
@@ -415,3 +719,19 @@ fail:
}
#endif
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (rb->aux_nr_pages) {
+ /* above AUX space */
+ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
+ return NULL;
+
+ /* AUX space */
+ if (pgoff >= rb->aux_pgoff)
+ return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ }
+
+ return __perf_mmap_to_page(rb, pgoff);
+}
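The resulting page-offset layout, summarized (illustrative, derived from the lookup above; not part of the patch):

	/*
	 *   pgoff 0                                  user page
	 *   pgoff 1 .. rb->nr_pages                  data pages
	 *   pgoff rb->aux_pgoff ..
	 *         rb->aux_pgoff + rb->aux_nr_pages   AUX pages
	 */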
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mem_cgroup *memcg;
+
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ if (err)
+ return err;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
+ mem_cgroup_commit_charge(kpage, memcg, false);
+ lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
dec_mm_counter(mm, MM_FILEPAGES);
@@ -186,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -315,18 +323,11 @@ retry:
if (!new_page)
goto put_old;
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
- goto put_new;
-
__SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page);
- if (ret)
- mem_cgroup_uncharge_page(new_page);
-
-put_new:
page_cache_release(new_page);
put_old:
put_page(old_page);
@@ -723,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
int more = 0;
again:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
if (!prev && !more) {
/*
- * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
@@ -754,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
info->mm = vma->vm_mm;
info->vaddr = offset_to_vaddr(vma, offset);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (!more)
goto out;
@@ -1639,7 +1640,6 @@ bool uprobe_deny_signal(void)
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
set_tsk_thread_flag(t, TIF_UPROBE);
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
}
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 83d4382f5699..6873bb3e6b7e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -20,145 +20,10 @@
#include <linux/types.h>
#include <linux/fs_struct.h>
-
-static void default_handler(int, struct pt_regs *);
-
-static struct exec_domain *exec_domains = &default_exec_domain;
-static DEFINE_RWLOCK(exec_domains_lock);
-
-
-static unsigned long ident_map[32] = {
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31
-};
-
-struct exec_domain default_exec_domain = {
- .name = "Linux", /* name */
- .handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
- .pers_high = 0, /* PER_LINUX personality. */
- .signal_map = ident_map, /* Identity map signals. */
- .signal_invmap = ident_map, /* - both ways. */
-};
-
-
-static void
-default_handler(int segment, struct pt_regs *regp)
-{
- set_personality(0);
-
- if (current_thread_info()->exec_domain->handler != default_handler)
- current_thread_info()->exec_domain->handler(segment, regp);
- else
- send_sig(SIGSEGV, current, 1);
-}
-
-static struct exec_domain *
-lookup_exec_domain(unsigned int personality)
-{
- unsigned int pers = personality(personality);
- struct exec_domain *ep;
-
- read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep; ep = ep->next) {
- if (pers >= ep->pers_low && pers <= ep->pers_high)
- if (try_module_get(ep->module))
- goto out;
- }
-
-#ifdef CONFIG_MODULES
- read_unlock(&exec_domains_lock);
- request_module("personality-%d", pers);
- read_lock(&exec_domains_lock);
-
- for (ep = exec_domains; ep; ep = ep->next) {
- if (pers >= ep->pers_low && pers <= ep->pers_high)
- if (try_module_get(ep->module))
- goto out;
- }
-#endif
-
- ep = &default_exec_domain;
-out:
- read_unlock(&exec_domains_lock);
- return ep;
-}
-
-int
-register_exec_domain(struct exec_domain *ep)
-{
- struct exec_domain *tmp;
- int err = -EBUSY;
-
- if (ep == NULL)
- return -EINVAL;
-
- if (ep->next != NULL)
- return -EBUSY;
-
- write_lock(&exec_domains_lock);
- for (tmp = exec_domains; tmp; tmp = tmp->next) {
- if (tmp == ep)
- goto out;
- }
-
- ep->next = exec_domains;
- exec_domains = ep;
- err = 0;
-
-out:
- write_unlock(&exec_domains_lock);
- return err;
-}
-EXPORT_SYMBOL(register_exec_domain);
-
-int
-unregister_exec_domain(struct exec_domain *ep)
-{
- struct exec_domain **epp;
-
- epp = &exec_domains;
- write_lock(&exec_domains_lock);
- for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
- if (ep == *epp)
- goto unregister;
- }
- write_unlock(&exec_domains_lock);
- return -EINVAL;
-
-unregister:
- *epp = ep->next;
- ep->next = NULL;
- write_unlock(&exec_domains_lock);
- return 0;
-}
-EXPORT_SYMBOL(unregister_exec_domain);
-
-int __set_personality(unsigned int personality)
-{
- struct exec_domain *oep = current_thread_info()->exec_domain;
-
- current_thread_info()->exec_domain = lookup_exec_domain(personality);
- current->personality = personality;
- module_put(oep->module);
-
- return 0;
-}
-EXPORT_SYMBOL(__set_personality);
-
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
{
- struct exec_domain *ep;
-
- read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep; ep = ep->next)
- seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
- ep->pers_low, ep->pers_high, ep->name,
- module_name(ep->module));
- read_unlock(&exec_domains_lock);
+ seq_puts(m, "0-0\tLinux \t[kernel]\n");
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..22fcc05dec40 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct * tsk);
+static void exit_mm(struct task_struct *tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
@@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk)
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
- /*
- * Accumulate here the counters for all threads but the
- * group leader as they die, so they can be added into
- * the process-wide totals when those are taken.
- * The group leader stays around as a zombie as long
- * as there are other threads. When it gets reaped,
- * the exit.c code will add its counts into these totals.
- * We won't ever get here for the group leader, since it
- * will have been the last reference on the signal_struct.
- */
- task_cputime(tsk, &utime, &stime);
- sig->utime += utime;
- sig->stime += stime;
- sig->gtime += task_gtime(tsk);
- sig->min_flt += tsk->min_flt;
- sig->maj_flt += tsk->maj_flt;
- sig->nvcsw += tsk->nvcsw;
- sig->nivcsw += tsk->nivcsw;
- sig->inblock += task_io_get_inblock(tsk);
- sig->oublock += task_io_get_oublock(tsk);
- task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
}
+ /*
+ * Accumulate here the counters for all threads as they die. We could
+ * skip the group leader because it is the last user of signal_struct,
+ * but we want to avoid the race with thread_group_cputime() which can
+ * see the empty ->thread_head list.
+ */
+ task_cputime(tsk, &utime, &stime);
+ write_seqlock(&sig->stats_lock);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->inblock += task_io_get_inblock(tsk);
+ sig->oublock += task_io_get_oublock(tsk);
+ task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
+ write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
@@ -151,7 +149,7 @@ static void __exit_signal(struct task_struct *tsk)
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
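
The write_seqlock()/write_sequnlock() pair above lets the accumulated statistics be read without tasklist_lock: a reader simply loops until it gets a snapshot no writer interleaved with. A minimal reader sketch (the two fields shown are illustrative; real users such as thread_group_cputime() read more):

	static void read_sig_times(struct signal_struct *sig,
				   cputime_t *utime, cputime_t *stime)
	{
		unsigned int seq;

		do {
			seq = read_seqbegin(&sig->stats_lock);
			*utime = sig->utime;
			*stime = sig->stime;
		} while (read_seqretry(&sig->stats_lock, seq));
	}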
@@ -168,7 +166,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
}
-void release_task(struct task_struct * p)
+void release_task(struct task_struct *p)
{
struct task_struct *leader;
int zap_leader;
@@ -192,7 +190,8 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader)
+ && leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -213,27 +212,6 @@ repeat:
}
/*
- * This checks not only the pgrp, but falls back on the pid if no
- * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
- * without this...
- *
- * The caller must hold rcu lock or the tasklist lock.
- */
-struct pid *session_of_pgrp(struct pid *pgrp)
-{
- struct task_struct *p;
- struct pid *sid = NULL;
-
- p = pid_task(pgrp, PIDTYPE_PGID);
- if (p == NULL)
- p = pid_task(pgrp, PIDTYPE_PID);
- if (p != NULL)
- sid = task_session(p);
-
- return sid;
-}
-
-/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -241,7 +219,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+static int will_become_orphaned_pgrp(struct pid *pgrp,
+ struct task_struct *ignored_task)
{
struct task_struct *p;
@@ -294,9 +273,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
struct task_struct *ignored_task = tsk;
if (!parent)
- /* exit: our father is in a different pgrp than
- * we are and we were the only connection outside.
- */
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
@@ -405,7 +384,7 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct * tsk)
+static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
@@ -425,6 +404,7 @@ static void exit_mm(struct task_struct * tsk)
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
+
up_read(&mm->mmap_sem);
self.task = tsk;
@@ -455,6 +435,46 @@ static void exit_mm(struct task_struct * tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
+ if (test_thread_flag(TIF_MEMDIE))
+ unmark_oom_victim();
+}
+
+static struct task_struct *find_alive_thread(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ if (!(t->flags & PF_EXITING))
+ return t;
+ }
+ return NULL;
+}
+
+static struct task_struct *find_child_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *reaper = pid_ns->child_reaper;
+
+ if (likely(reaper != father))
+ return reaper;
+
+ reaper = find_alive_thread(father);
+ if (reaper) {
+ pid_ns->child_reaper = reaper;
+ return reaper;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?: father->exit_code);
+ }
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+
+ return father;
}
/*
@@ -464,58 +484,36 @@ static void exit_mm(struct task_struct * tsk)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+static struct task_struct *find_new_reaper(struct task_struct *father,
+ struct task_struct *child_reaper)
{
- struct pid_namespace *pid_ns = task_active_pid_ns(father);
- struct task_struct *thread;
+ struct task_struct *thread, *reaper;
- thread = father;
- while_each_thread(father, thread) {
- if (thread->flags & PF_EXITING)
- continue;
- if (unlikely(pid_ns->child_reaper == father))
- pid_ns->child_reaper = thread;
+ thread = find_alive_thread(father);
+ if (thread)
return thread;
- }
-
- if (unlikely(pid_ns->child_reaper == father)) {
- write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?:
- father->exit_code);
- }
-
- zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
- } else if (father->signal->has_child_subreaper) {
- struct task_struct *reaper;
+ if (father->signal->has_child_subreaper) {
/*
- * Find the first ancestor marked as child_subreaper.
- * Note that the code below checks same_thread_group(reaper,
- * pid_ns->child_reaper). This is what we need to DTRT in a
- * PID namespace. However we still need the check above, see
- * http://marc.info/?l=linux-kernel&m=131385460420380
+ * Find the first ->is_child_subreaper ancestor in our pid_ns.
+ * We start from father to ensure we can not look into another
+ * namespace, this is safe because all its threads are dead.
*/
- for (reaper = father->real_parent;
- reaper != &init_task;
+ for (reaper = father;
+ !same_thread_group(reaper, child_reaper);
reaper = reaper->real_parent) {
- if (same_thread_group(reaper, pid_ns->child_reaper))
+ /* call_usermodehelper() descendants need this check */
+ if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
- thread = reaper;
- do {
- if (!(thread->flags & PF_EXITING))
- return reaper;
- } while_each_thread(reaper, thread);
+ thread = find_alive_thread(reaper);
+ if (thread)
+ return thread;
}
}
- return pid_ns->child_reaper;
+ return child_reaper;
}
/*
@@ -524,15 +522,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- if (p->exit_state == EXIT_DEAD)
- return;
- /*
- * If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
+ if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
@@ -543,48 +533,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
- list_move_tail(&p->sibling, dead);
+ list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
-static void forget_original_parent(struct task_struct *father)
+/*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void forget_original_parent(struct task_struct *father,
+ struct list_head *dead)
{
- struct task_struct *p, *n, *reaper;
- LIST_HEAD(dead_children);
+ struct task_struct *p, *t, *reaper;
- write_lock_irq(&tasklist_lock);
- /*
- * Note that exit_ptrace() and find_new_reaper() might
- * drop tasklist_lock and reacquire it.
- */
- exit_ptrace(father);
- reaper = find_new_reaper(father);
+ if (unlikely(!list_empty(&father->ptraced)))
+ exit_ptrace(father, dead);
- list_for_each_entry_safe(p, n, &father->children, sibling) {
- struct task_struct *t = p;
- do {
+ /* Can drop and reacquire tasklist_lock */
+ reaper = find_child_reaper(father);
+ if (list_empty(&father->children))
+ return;
+
+ reaper = find_new_reaper(father, reaper);
+ list_for_each_entry(p, &father->children, sibling) {
+ for_each_thread(p, t) {
t->real_parent = reaper;
- if (t->parent == father) {
- BUG_ON(t->ptrace);
+ BUG_ON((!t->ptrace) != (t->parent == father));
+ if (likely(!t->ptrace))
t->parent = t->real_parent;
- }
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t);
- } while_each_thread(p, t);
- reparent_leader(father, p, &dead_children);
- }
- write_unlock_irq(&tasklist_lock);
-
- BUG_ON(!list_empty(&father->children));
-
- list_for_each_entry_safe(p, n, &dead_children, sibling) {
- list_del_init(&p->sibling);
- release_task(p);
+ }
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (!same_thread_group(reaper, father))
+ reparent_leader(father, p, dead);
}
+ list_splice_tail_init(&father->children, &reaper->children);
}
/*
@@ -594,18 +589,12 @@ static void forget_original_parent(struct task_struct *father)
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
-
- /*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- */
- forget_original_parent(tsk);
+ struct task_struct *p, *n;
+ LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
+
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -623,15 +612,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+ if (tsk->exit_state == EXIT_DEAD)
+ list_add(&tsk->ptrace_entry, &dead);
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- /* If the process is dead, release it - nobody will wait for it */
- if (autoreap)
- release_task(tsk);
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
}
#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -648,9 +640,8 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s (%d) used greatest stack depth: "
- "%lu bytes left\n",
- current->comm, task_pid_nr(current), free);
+ pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -663,6 +654,7 @@ void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
+ TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
@@ -691,8 +683,7 @@ void do_exit(long code)
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
- printk(KERN_ALERT
- "Fixing recursive fault but reboot is needed!\n");
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
@@ -716,9 +707,9 @@ void do_exit(long code)
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
- printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
@@ -765,13 +756,12 @@ void do_exit(long code)
cgroup_exit(tsk);
- module_put(task_thread_info(tsk)->exec_domain->module);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
+ TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
@@ -811,6 +801,7 @@ void do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+ TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
/*
* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
@@ -836,7 +827,6 @@ void do_exit(long code)
for (;;)
cpu_relax(); /* For when BUG is null */
}
-
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +836,6 @@ void complete_and_exit(struct completion *comp, long code)
do_exit(code);
}
-
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +858,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
+
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
@@ -976,8 +966,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- unsigned long state;
- int retval, status, traced;
+ int state, retval, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
@@ -991,6 +980,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
if ((exit_code & 0x7f) == 0) {
why = CLD_EXITED;
status = exit_code >> 8;
@@ -1000,21 +991,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
}
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
-
- traced = ptrace_reparented(p);
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ state = (ptrace_reparented(p) && thread_group_leader(p)) ?
+ EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
- * It can be ptraced but not reparented, check
- * thread_group_leader() to filter out sub-threads.
+ * We own this thread, nobody else can reap it.
*/
- if (likely(!traced) && thread_group_leader(p)) {
- struct signal_struct *psig;
- struct signal_struct *sig;
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ /*
+ * Check thread_group_leader() to exclude the traced sub-threads.
+ */
+ if (state == EXIT_DEAD && thread_group_leader(p)) {
+ struct signal_struct *sig = p->signal;
+ struct signal_struct *psig = current->signal;
unsigned long maxrss;
cputime_t tgutime, tgstime;
@@ -1026,21 +1021,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
- * p->signal fields, because they are only touched by
- * __exit_signal, which runs with tasklist_lock
- * write-locked anyway, and so is excluded here. We do
- * need to protect the access to parent->signal fields,
- * as other threads in the parent group can be right
- * here reaping other children at the same time.
+ * p->signal fields because the whole thread group is dead
+ * and nobody can change them.
+ *
+ * psig->stats_lock also protects us from our sub-threads
+ * which can reap other children at the same time. Until
+ * we change k_getrusage()-like users to rely on this lock
+ * we have to take ->siglock as well.
*
- * We use thread_group_cputime_adjusted() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
+ * We use thread_group_cputime_adjusted() to get times for
+ * the thread group, which consolidates times for all threads
+ * in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&p->real_parent->sighand->siglock);
- psig = p->real_parent->signal;
- sig = p->signal;
+ spin_lock_irq(&current->sighand->siglock);
+ write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1063,15 +1058,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- spin_unlock_irq(&p->real_parent->sighand->siglock);
+ write_sequnlock(&psig->stats_lock);
+ spin_unlock_irq(&current->sighand->siglock);
}
- /*
- * Now we are sure this task is interesting, and no other
- * thread can reap it because we its state == DEAD/TRACE.
- */
- read_unlock(&tasklist_lock);
-
retval = wo->wo_rusage
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1202,6 +1192,7 @@ unlock_sig:
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1264,6 +1255,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (!wo->wo_info) {
retval = wo->wo_rusage
@@ -1294,9 +1286,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
+ /*
+ * We can race with wait_task_zombie() from another thread.
+ * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+ * can't confuse the checks below.
+ */
+ int exit_state = ACCESS_ONCE(p->exit_state);
int ret;
- if (unlikely(p->exit_state == EXIT_DEAD))
+ if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, p);
@@ -1317,7 +1315,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (unlikely(p->exit_state == EXIT_TRACE)) {
+ if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
@@ -1344,7 +1342,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
}
/* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
@@ -1417,6 +1415,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
+
if (ret)
return ret;
}
@@ -1430,6 +1429,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
+
if (ret)
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
+#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
+ if (is_ftrace_trampoline(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return is_module_text_address(addr);
+ if (is_module_text_address(addr))
+ return 1;
+ return is_ftrace_trampoline(addr);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..03c1eaaa6ef5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
+#include <linux/sysctl.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -88,6 +89,16 @@
#include <trace/events/task.h>
/*
+ * Minimum number of threads to boot the kernel
+ */
+#define MIN_THREADS 20
+
+/*
+ * Maximum number of threads
+ */
+#define MAX_THREADS FUTEX_TID_MASK
+
+/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
@@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
void __init __weak arch_task_cache_init(void) { }
-void __init fork_init(unsigned long mempages)
+/*
+ * set_max_threads - compute max_threads from available memory, clamped to [MIN_THREADS, MAX_THREADS]
+ */
+static void set_max_threads(unsigned int max_threads_suggested)
+{
+ u64 threads;
+
+ /*
+ * The number of threads shall be limited such that the thread
+ * structures may only consume a small part of the available memory.
+ */
+ if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ threads = MAX_THREADS;
+ else
+ threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ (u64) THREAD_SIZE * 8UL);
+
+ if (threads > max_threads_suggested)
+ threads = max_threads_suggested;
+
+ max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
+}
+
+void __init fork_init(void)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
@@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages)
/* do the arch specific task caches init */
arch_task_cache_init();
- /*
- * The default maximum number of threads is set to a safe
- * value: the thread structures can take up at most half
- * of memory.
- */
- max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
-
- /*
- * we need to allow at least 20 threads to boot a system
- */
- if (max_threads < 20)
- max_threads = 20;
+ set_max_threads(MAX_THREADS);
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
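
The new limit allows thread structures to consume at most an eighth of memory. For example, with 4 GiB of RAM, 4 KiB pages (totalram_pages = 2^20) and 16 KiB kernel stacks, the default comes out as 2^20 * 2^12 / (2^14 * 8) = 32768 threads. A standalone re-run of the arithmetic (the constants are assumptions for one example machine, not kernel values):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t totalram_pages = 1048576;	/* 4 GiB / 4 KiB */
		uint64_t page_size = 4096;
		uint64_t thread_size = 16384;	/* e.g. x86-64 THREAD_SIZE */
		uint64_t threads;

		/* same formula as set_max_threads() above */
		threads = totalram_pages * page_size / (thread_size * 8);
		printf("default max_threads = %llu\n",
		       (unsigned long long)threads);	/* prints 32768 */
		return 0;
	}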
@@ -294,11 +317,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
return 0;
}
+void set_task_stack_end_magic(struct task_struct *tsk)
+{
+ unsigned long *stackend;
+
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
+}
+
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
- unsigned long *stackend;
int node = tsk_fork_get_node(orig);
int err;
@@ -315,12 +345,20 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
- stackend = end_of_stack(tsk);
- *stackend = STACK_END_MAGIC; /* for overflow detection */
+ set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
@@ -365,12 +403,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->vmacache_seqnum = 0;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ /* No ordering required: file already has been exposed. */
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@ -419,19 +459,15 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- if (unlikely(tmp->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(tmp,
- &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
@@ -495,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-#define dup_mmap(mm, oldmm) (0)
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ down_write(&oldmm->mmap_sem);
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ up_write(&oldmm->mmap_sem);
+ return 0;
+}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
@@ -527,19 +569,38 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm_nr_pmds_init(mm);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -549,11 +610,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->def_flags = 0;
}
- if (likely(!mm_alloc_pgd(mm))) {
- mmu_notifier_mm_init(mm);
- return mm;
- }
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
free_mm(mm);
return NULL;
}
@@ -570,8 +637,15 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
+ if (atomic_long_read(&mm->nr_ptes))
+ pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
+ atomic_long_read(&mm->nr_ptes));
+ if (mm_nr_pmds(mm))
+ pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
+ mm_nr_pmds(mm));
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON(mm->pmd_huge_pte);
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -587,7 +661,6 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@ -633,34 +706,53 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/**
+ * set_mm_exe_file - change a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to avoid the need for any locks.
+ */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
+ struct file *old_exe_file;
+
+ /*
+ * It is safe to dereference the exe_file without RCU as
+ * this function is only called if nobody else can access
+ * this mm -- see comment above for justification.
+ */
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+
if (new_exe_file)
get_file(new_exe_file);
- if (mm->exe_file)
- fput(mm->exe_file);
- mm->exe_file = new_exe_file;
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ if (old_exe_file)
+ fput(old_exe_file);
}
+/**
+ * get_mm_exe_file - acquire a reference to the mm's executable file
+ *
+ * Returns %NULL if mm has no associated executable file.
+ * User must release file via fput().
+ */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of exe_file */
- down_read(&mm->mmap_sem);
- exe_file = mm->exe_file;
- if (exe_file)
- get_file(exe_file);
- up_read(&mm->mmap_sem);
+ rcu_read_lock();
+ exe_file = rcu_dereference(mm->exe_file);
+ if (exe_file && !get_file_rcu(exe_file))
+ exe_file = NULL;
+ rcu_read_unlock();
return exe_file;
}
-
-static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
- /* It's safe to write the exe_file pointer without exe_file_lock because
- * this is called during fork when the task is not yet in /proc */
- newmm->exe_file = get_mm_exe_file(oldmm);
-}
+EXPORT_SYMBOL(get_mm_exe_file);
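
With the RCU scheme above, callers need no mmap_sem; the only contract is the fput() mentioned in the kerneldoc. A typical in-kernel caller might look like this (a sketch; print_exe() is hypothetical):

	static void print_exe(struct mm_struct *mm)
	{
		struct file *exe_file = get_mm_exe_file(mm);

		if (!exe_file)
			return;		/* mm has no executable file */
		pr_info("exe: %pD\n", exe_file);
		fput(exe_file);		/* drop the get_file_rcu() reference */
	}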
/**
* get_task_mm - acquire a reference to the task's mm
@@ -819,19 +911,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- mm->pmd_huge_pte = NULL;
-#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
- dup_mm_exe_file(oldmm, mm);
-
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
@@ -851,15 +934,6 @@ free_pt:
fail_nomem:
return NULL;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1002,11 +1076,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
{
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
+ /*
+ * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * without an RCU grace period, see __lock_task_sighand().
+ */
kmem_cache_free(sighand_cachep, sighand);
}
}
-
/*
* Initialize POSIX timer handling for a thread group.
*/
@@ -1053,6 +1130,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
+ seqlock_init(&sig->stats_lock);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
@@ -1081,6 +1159,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
+
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags, but before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1095,17 +1206,9 @@ static void rt_mutex_init_task(struct task_struct *p)
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
- p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MEMCG
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
- mm->owner = p;
-}
-#endif /* CONFIG_MEMCG */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1196,7 +1299,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1226,9 +1328,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
- if (!try_module_get(task_thread_info(p)->exec_domain->module))
- goto bad_fork_cleanup_count;
-
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
p->flags |= PF_FORKNOEXEC;
@@ -1262,9 +1361,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time);
- p->real_start_time = p->start_time;
- monotonic_to_bootbased(&p->real_start_time);
+ p->start_time = ktime_get_ns();
+ p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
@@ -1307,10 +1405,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
-#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -1326,8 +1420,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_cleanup_perf;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1357,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
- if (!pid)
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
+ }
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1437,6 +1533,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_lock(&current->sighand->siglock);
/*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
@@ -1525,8 +1627,9 @@ bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
-bad_fork_cleanup_policy:
+bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
@@ -1534,7 +1637,6 @@ bad_fork_cleanup_threadgroup_lock:
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
delayacct_tsk_free(p);
- module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
@@ -1873,6 +1975,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
@@ -1943,3 +2050,26 @@ int unshare_files(struct files_struct **displaced)
task_unlock(task);
return 0;
}
+
+int sysctl_max_threads(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int ret;
+ int threads = max_threads;
+ int min = MIN_THREADS;
+ int max = MAX_THREADS;
+
+ t = *table;
+ t.data = &threads;
+ t.extra1 = &min;
+ t.extra2 = &max;
+
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ set_max_threads(threads);
+
+ return 0;
+}
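
sysctl_max_threads() routes writes through set_max_threads(), so run-time updates are clamped the same way as the boot-time default. Assuming the handler is wired up as kernel.threads-max (which the proc_dointvec_minmax() plumbing suggests), it can be driven from user space like this; the value is arbitrary:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/kernel/threads-max", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* the kernel clamps this to [MIN_THREADS, MAX_THREADS] */
		if (write(fd, "100000", 6) != 6)
			perror("write");
		close(fd);
		return 0;
	}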
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
+ if (test_thread_flag(TIF_MEMDIE))
+ return false;
+
if (pm_nosig_freezing || cgroup_freezing(p))
return true;
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
- * be visible to @p as waking up implies wmb. Waking up inside
- * freezer_lock also prevents wakeups from leaking outside
- * refrigerator.
- */
spin_lock_irqsave(&freezer_lock, flags);
if (frozen(p))
wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index b632b5f3f094..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
*
* Where (A) orders the waiters increment and the futex value read through
* atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read -- this is done by the barriers in
- * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
- * futex type.
+ * to futex and the waiters read -- this is done by the barriers for both
+ * shared and private futexes in get_futex_key_refs().
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key)
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies MB (B) */
break;
+ default:
+ /*
+ * Private futexes do not hold reference on an inode or
+ * mm, therefore the only purpose of calling get_futex_key_refs()
+ * is the barrier we need for the lockless waiter check.
+ */
+ smp_mb(); /* explicit MB (B) */
}
}
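
The explicit smp_mb() keeps private futexes in the same (A)/(B) ordering scheme the file documents for shared ones: either the waker sees a non-zero waiter count, or the waiter sees the updated futex value, never neither. A user-space C11 model of that pairing (names and the seq_cst fence are illustrative, not the kernel code paths):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int futex_val;
	static atomic_int waiters;

	static void *waiter(void *arg)
	{
		atomic_fetch_add(&waiters, 1);		/* MB (A) */
		if (atomic_load(&futex_val) == 0)
			puts("waiter: would sleep");
		atomic_fetch_sub(&waiters, 1);
		return NULL;
	}

	static void *waker(void *arg)
	{
		atomic_store(&futex_val, 1);
		atomic_thread_fence(memory_order_seq_cst);	/* MB (B) */
		if (atomic_load(&waiters))
			puts("waker: would call futex_wake()");
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, waiter, NULL);
		pthread_create(&b, NULL, waker, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}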
/*
* Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
+ * The hash bucket spinlock must not be held. This is
+ * a no-op for private futexes, see comment in the get
+ * counterpart.
*/
static void drop_futex_key_refs(union futex_key *key)
{
@@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+/*
+ * Must be called with the hb lock held.
+ */
static void free_pi_state(struct futex_pi_state *pi_state)
{
+ if (!pi_state)
+ return;
+
if (!atomic_dec_and_test(&pi_state->refcount))
return;
@@ -792,94 +806,91 @@ void exit_pi_state_list(struct task_struct *curr)
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
*/
-static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
{
- struct futex_pi_state *pi_state = NULL;
- struct futex_q *this, *next;
- struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex(&this->key, key)) {
- /*
- * Sanity check the waiter before increasing
- * the refcount and attaching to it.
- */
- pi_state = this->pi_state;
- /*
- * Userspace might have messed up non-PI and
- * PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
- WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(!atomic_read(&pi_state->refcount));
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
/*
- * Handle the owner died case:
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
*/
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and
- * wakes the topmost waiter. The task which
- * acquires the pi_state->rt_mutex will fixup
- * owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user
- * space TID is not 0. Inconsistent
- * state. [5]
- */
- if (pid)
- return -EINVAL;
- /*
- * Take a ref on the state and
- * return. [4]
- */
- goto out_state;
- }
-
- /*
- * If TID is 0, then either the dying owner
- * has not yet executed exit_pi_state_list()
- * or some waiter acquired the rtmutex in the
- * pi state, but did not yet fixup the TID in
- * user space.
- *
- * Take a ref on the state and return. [6]
- */
- if (!pid)
- goto out_state;
- } else {
- /*
- * If the owner died bit is not set,
- * then the pi_state must have an
- * owner. [7]
- */
- if (!pi_state->owner)
- return -EINVAL;
- }
-
+ if (pid)
+ return -EINVAL;
/*
- * Bail out if user space manipulated the
- * futex value. If pi state exists then the
- * owner TID must be the same as the user
- * space TID. [9/10]
+ * Take a ref on the state and return success. [4]
*/
- if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-
- out_state:
- atomic_inc(&pi_state->refcount);
- *ps = pi_state;
- return 0;
+ goto out_state;
}
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ return -EINVAL;
}
/*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+out_state:
+ atomic_inc(&pi_state->refcount);
+ *ps = pi_state;
+ return 0;
+}
+
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 uval, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct futex_pi_state *pi_state;
+ struct task_struct *p;
+
+ /*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
*/
@@ -889,7 +900,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
- if (!p->mm) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
return -EPERM;
}
@@ -920,7 +931,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
pi_state = alloc_pi_state();
/*
- * Initialize the pi_mutex in locked state and make 'p'
+ * Initialize the pi_mutex in locked state and make @p
* the owner of it:
*/
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -940,6 +951,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return 0;
}
+static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+{
+ struct futex_q *match = futex_top_waiter(hb, key);
+
+ /*
+ * If there is a waiter on that futex, validate it and
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+ * @uval and attach to it.
+ */
+ return attach_to_pi_owner(uval, key, ps);
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ u32 uninitialized_var(curval);
+
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
/**
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
* @uaddr: the pi futex user address
@@ -963,113 +1004,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, force_take = 0;
- u32 uval, newval, curval, vpid = task_pid_vnr(task);
-
-retry:
- ret = lock_taken = 0;
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *match;
+ int ret;
/*
- * To avoid races, we attempt to take the lock here again
- * (by doing a 0 -> TID atomic cmpxchg), while holding all
- * the locks. It will most likely not succeed.
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
*/
- newval = vpid;
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
+ if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
- * Surprise - we got the lock, but we do not trust user space at all.
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
*/
- if (unlikely(!curval)) {
- /*
- * We verify whether there is kernel state for this
- * futex. If not, we can safely assume, that the 0 ->
- * TID transition is correct. If state exists, we do
- * not bother to fixup the user space state as it was
- * corrupted already.
- */
- return futex_top_waiter(hb, key) ? -EINVAL : 1;
- }
-
- uval = curval;
+ match = futex_top_waiter(hb, key);
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
/*
- * Set the FUTEX_WAITERS flag, so the owner will know it has someone
- * to wake at the next unlock.
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set, we were called from
+ * requeue_cmp_pi, or something else entered the syscall.
*/
- newval = curval | FUTEX_WAITERS;
-
- /*
- * Should we force take the futex? See below.
- */
- if (unlikely(force_take)) {
+ if (!(uval & FUTEX_TID_MASK)) {
/*
- * Keep the OWNER_DIED and the WAITERS bit and set the
- * new TID value.
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
*/
- newval = (curval & ~FUTEX_TID_MASK) | vpid;
- force_take = 0;
- lock_taken = 1;
- }
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
- if (unlikely(curval != uval))
- goto retry;
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ /* If the take over worked, return 1 */
+ return ret < 0 ? ret : 1;
+ }
/*
- * We took the lock due to forced take over.
+ * First waiter. Set the waiters bit before attaching ourselves to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
*/
- if (unlikely(lock_taken))
- return 1;
-
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
/*
- * We dont have the lock. Look up the PI state (or create it if
- * we are the first waiter):
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
*/
- ret = lookup_pi_state(uval, hb, key, ps);
-
- if (unlikely(ret)) {
- switch (ret) {
- case -ESRCH:
- /*
- * We failed to find an owner for this
- * futex. So we have no pi_state to block
- * on. This can happen in two cases:
- *
- * 1) The owner died
- * 2) A stale FUTEX_WAITERS bit
- *
- * Re-read the futex value.
- */
- if (get_futex_value_locked(&curval, uaddr))
- return -EFAULT;
-
- /*
- * If the owner died or we have a stale
- * WAITERS bit the owner TID in the user space
- * futex is 0.
- */
- if (!(curval & FUTEX_TID_MASK)) {
- force_take = 1;
- goto retry;
- }
- default:
- break;
- }
- }
-
- return ret;
+ return attach_to_pi_owner(uval, key, ps);
}
/**
@@ -1186,22 +1183,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
return 0;
}
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
- u32 uninitialized_var(oldval);
-
- /*
- * There is no waiter, so we unlock the futex. The owner died
- * bit has not to be preserved here. We are the owner:
- */
- if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
- return -EFAULT;
- if (oldval != uval)
- return -EAGAIN;
-
- return 0;
-}
-
/*
* Express the locking dependencies for lockdep:
*/
@@ -1552,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}
retry:
- if (pi_state != NULL) {
- /*
- * We will have to lookup the pi_state again, so free this one
- * to keep the accounting correct.
- */
- free_pi_state(pi_state);
- pi_state = NULL;
- }
-
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -1650,6 +1622,8 @@ retry_private:
case 0:
break;
case -EFAULT:
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1659,7 +1633,14 @@ retry_private:
goto retry;
goto out;
case -EAGAIN:
- /* The owner was exiting, try again. */
+ /*
+ * Two reasons for this:
+ * - Owner is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
+ */
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1718,7 +1699,7 @@ retry_private:
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
- this->task, 1);
+ this->task);
if (ret == 1) {
/* We got the lock. */
requeue_pi_wake_futex(this, &key2, hb2);
@@ -1736,6 +1717,7 @@ retry_private:
}
out_unlock:
+ free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -1753,8 +1735,6 @@ out_put_keys:
out_put_key1:
put_futex_key(&key1);
out:
- if (pi_state != NULL)
- free_pi_state(pi_state);
return ret ? ret : task_count;
}
@@ -2237,7 +2217,7 @@ retry:
if (!abs_time)
goto out;
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
@@ -2278,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
@@ -2316,8 +2296,10 @@ retry_private:
goto uaddr_faulted;
case -EAGAIN:
/*
- * Task is exiting and we just wait for the
- * exit to complete.
+ * Two reasons for this:
+ * - Task is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
@@ -2337,9 +2319,9 @@ retry_private:
/*
* Block on the PI mutex:
*/
- if (!trylock)
- ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
- else {
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
@@ -2401,10 +2383,10 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- u32 uval, vpid = task_pid_vnr(current);
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
int ret;
retry:
@@ -2417,57 +2399,47 @@ retry:
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
+ if (ret)
+ return ret;
hb = hash_futex(&key);
spin_lock(&hb->lock);
/*
- * To avoid races, try to do the TID -> 0 atomic transition
- * again. If it succeeds then we can return without waking
- * anyone else up. We only try this if neither the waiters nor
- * the owner died bit are set.
- */
- if (!(uval & ~FUTEX_TID_MASK) &&
- cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
- goto pi_faulted;
- /*
- * Rare case: we managed to release the lock atomically,
- * no need to wake anyone else up:
- */
- if (unlikely(uval == vpid))
- goto out_unlock;
-
- /*
- * Ok, other tasks may need to be woken up - check waiters
- * and do the wakeup if necessary:
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
*/
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (!match_futex (&this->key, &key))
- continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
/*
- * The atomic access to the futex value
- * generated a pagefault, so retry the
- * user-access and the wakeup:
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
goto out_unlock;
}
+
/*
- * No waiters - kernel unlocks the futex:
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
+ * owner.
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
goto pi_faulted;
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
-
-out:
return ret;
pi_faulted:
@@ -2628,6 +2600,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* shared futexes. We need to compare the keys:
*/
if (match_futex(&q.key, &key2)) {
+ queue_unlock(hb);
ret = -EINVAL;
goto out_put_keys;
}
@@ -2669,7 +2642,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
debug_rt_mutex_free_waiter(&rt_waiter);
spin_lock(q.lock_ptr);
@@ -2980,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- return futex_lock_pi(uaddr, flags, val, timeout, 0);
+ return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+ return futex_lock_pi(uaddr, flags, NULL, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
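The PI futex contract both rewritten paths rely on: the futex word holds the owner's TID, and the kernel sets FUTEX_WAITERS on contention. A minimal user space sketch of that contract, assuming direct futex(2) syscalls; error handling elided, not the glibc implementation:

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static void pi_lock(uint32_t *uaddr)
{
	uint32_t zero = 0;
	uint32_t tid = syscall(SYS_gettid);

	/* Uncontended fast path: 0 -> TID, entirely in user space. */
	if (__atomic_compare_exchange_n(uaddr, &zero, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	/* Contended: the kernel sets FUTEX_WAITERS and boosts the owner. */
	syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *uaddr)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path only if the word is exactly our TID (no waiters). */
	if (__atomic_compare_exchange_n(uaddr, &tid, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}

The (curval == uval) check in futex_unlock_pi() above is the kernel-side cross-check of this word against user space fiddling.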
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
Note that the debugfs filesystem has to be mounted to access
profiling data.
+config ARCH_HAS_GCOV_PROFILE_ALL
+ def_bool n
+
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
+ depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
---help---
	  This option activates profiling for the entire kernel.
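With the architecture whitelist replaced by ARCH_HAS_GCOV_PROFILE_ALL, an architecture now opts in from its own Kconfig instead of patching this dependency line; roughly (illustrative):

config ARM
	...
	select ARCH_HAS_GCOV_PROFILE_ALL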
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
- cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
- cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
- cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
- obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
- obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
+ gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index b358a802fd18..a744098e4eb7 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#include "gcov.h"
static int gcov_events_enabled;
@@ -107,8 +108,10 @@ void gcov_enable_events(void)
gcov_events_enabled = 1;
/* Perform event callback for previously registered entries. */
- while ((info = gcov_info_next(info)))
+ while ((info = gcov_info_next(info))) {
gcov_event(GCOV_ADD, info);
+ cond_resched();
+ }
mutex_unlock(&gcov_lock);
}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
err_remove:
pr_err("init failed\n");
- if (root_node.dentry)
- debugfs_remove(root_node.dentry);
+ debugfs_remove(root_node.dentry);
return rc;
}
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..74d431d25251 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,11 +6,9 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *group_info;
@@ -213,6 +211,14 @@ out:
return i;
}
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
+}
+
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
@@ -223,7 +229,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
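may_setgroups() now gates setgroups(2) on userns_may_setgroups() in addition to CAP_SETGID. A rough user space illustration of the resulting semantics, assuming the /proc/[pid]/setgroups interface introduced by the same series; error handling elided:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <unistd.h>

int main(void)
{
	int fd;

	unshare(CLONE_NEWUSER);

	/* Disable setgroups() in the new user namespace ... */
	fd = open("/proc/self/setgroups", O_WRONLY);
	write(fd, "deny", 4);
	close(fd);

	/* ... which is what allows an unprivileged gid_map write. */
	fd = open("/proc/self/gid_map", O_WRONLY);
	write(fd, "0 1000 1", 8);	/* illustrative mapping */
	close(fd);

	/* From here, setgroups(2) in this namespace fails with EPERM. */
	return 0;
}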
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06db12434d72..e0f90c2b57aa 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
return;
rcu_read_lock();
- do_each_thread(g, t) {
+ for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
if (!--batch_count) {
@@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
- } while_each_thread(g, t);
+ }
unlock:
rcu_read_unlock();
}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for hierarchical irq domains
+config IRQ_DOMAIN_HIERARCHY
+ bool
+ select IRQ_DOMAIN
+
+# Generic MSI interrupt support
+config GENERIC_MSI_IRQ
+ bool
+
+# Generic MSI hierarchical interrupt domain support
+config GENERIC_MSI_IRQ_DOMAIN
+ bool
+ select IRQ_DOMAIN_HIERARCHY
+ select GENERIC_MSI_IRQ
+
+config HANDLE_DOMAIN_IRQ
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_PM_SLEEP) += pm.o
+obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2fd7b1..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/irqdomain.h>
#include <trace/events/irq.h>
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_state_clr_disabled(desc);
desc->depth = 0;
+ irq_domain_activate_irq(&desc->irq_data);
if (desc->irq_data.chip->irq_startup) {
ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
desc->irq_data.chip->irq_disable(&desc->irq_data);
else
desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_state_set_masked(desc);
}
@@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc)
return irq_wait_for_poll(desc);
}
+static bool irq_may_run(struct irq_desc *desc)
+{
+ unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+
+ /*
+ * If the interrupt is not in progress and is not an armed
+ * wakeup interrupt, proceed.
+ */
+ if (!irqd_has_set(&desc->irq_data, mask))
+ return true;
+
+ /*
+ * If the interrupt is an armed wakeup source, mark it pending
+ * and suspended, disable it and notify the pm core about the
+ * event.
+ */
+ if (irq_pm_check_wakeup(desc))
+ return false;
+
+ /*
+ * Handle a potential concurrent poll on a different core.
+ */
+ return irq_check_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out;
+ if (!irq_may_run(desc))
+ goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -517,6 +542,7 @@ out:
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
@@ -540,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+	 * If it's disabled or no action is available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
@@ -601,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+	 * If it's disabled or no action is available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
do {
@@ -669,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
- void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
+ void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
irqreturn_t res;
kstat_incr_irqs_this_cpu(irq, desc);
@@ -698,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
if (!handle) {
handle = handle_bad_irq;
} else {
- if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+ struct irq_data *irq_data = &desc->irq_data;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /*
+ * With hierarchical domains we might run into a
+ * situation where the outermost chip is not yet set
+ * up, but the inner chips are there. Instead of
+ * bailing we install the handler, but obviously we
+ * cannot enable/startup the interrupt at this point.
+ */
+ while (irq_data) {
+ if (irq_data->chip != &no_irq_chip)
+ break;
+ /*
+ * Bail out if the outer chip is not set up
+			 * and the interrupt is supposed to be started
+ * right away.
+ */
+ if (WARN_ON(is_chained))
+ goto out;
+ /* Try the parent */
+ irq_data = irq_data->parent_data;
+ }
+#endif
+ if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
goto out;
}
@@ -817,3 +873,121 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_chip_ack_parent - Acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_ack(data);
+}
+
+/**
+ * irq_chip_mask_parent - Mask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask(data);
+}
+
+/**
+ * irq_chip_unmask_parent - Unmask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_unmask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_eoi_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_eoi(data);
+}
+
+/**
+ * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_affinity_parent(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_affinity)
+ return data->chip->irq_set_affinity(data, dest, force);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
+ * @data: Pointer to interrupt specific data
+ *
+ * Iterate through the domain hierarchy of the interrupt and check
+ * whether a hw retrigger function exists. If yes, invoke it.
+ */
+int irq_chip_retrigger_hierarchy(struct irq_data *data)
+{
+ for (data = data->parent_data; data; data = data->parent_data)
+ if (data->chip && data->chip->irq_retrigger)
+ return data->chip->irq_retrigger(data);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @on: Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_wake)
+ return data->chip->irq_set_wake(data, on);
+
+ return -ENOSYS;
+}
+#endif
+
+/**
+ * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip
+ * @data: Pointer to interrupt specific data
+ * @msg: Pointer to the MSI message
+ *
+ * For hierarchical domains we find the first chip in the hierarchy
+ * which implements the irq_compose_msi_msg callback. For non-
+ * hierarchical domains we use the top level chip.
+ */
+int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct irq_data *pos = NULL;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ for (; data; data = data->parent_data)
+#endif
+ if (data->chip && data->chip->irq_compose_msi_msg)
+ pos = data;
+ if (!pos)
+ return -ENOSYS;
+
+ pos->chip->irq_compose_msi_msg(pos, msg);
+
+ return 0;
+}
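The *_parent helpers above let a leaf chip in a hierarchical domain forward operations it does not implement itself. A hypothetical chip wired up that way (names illustrative):

static struct irq_chip example_msi_chip = {
	.name			= "example-msi",
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
};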
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef0606797c9..d5d0f7345c54 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
- * request_irq(). IRQs requested with this function will be
+ * request_threaded_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
+ irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache |= mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache &= ~mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
+ irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
u32 mask = ~d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
+ irq_reg_writel(gc, mask, ct->regs.eoi);
irq_gc_unlock(gc);
}
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
return 0;
}
+static u32 irq_readl_be(void __iomem *addr)
+{
+ return ioread32be(addr);
+}
+
+static void irq_writel_be(u32 val, void __iomem *addr)
+{
+ iowrite32be(val, addr);
+}
+
static void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
ct[i].mask_cache = mskptr;
if (flags & IRQ_GC_INIT_MASK_CACHE)
- *mskptr = irq_reg_readl(gc->reg_base + mskreg);
+ *mskptr = irq_reg_readl(gc, mskreg);
}
}
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
dgc->gc[i] = gc = tmp;
irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
NULL, handler);
+
gc->domain = d;
+ if (gcflags & IRQ_GC_BE_IO) {
+ gc->reg_readl = &irq_readl_be;
+ gc->reg_writel = &irq_writel_be;
+ }
+
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
@@ -341,8 +357,8 @@ static struct lock_class_key irq_nested_lock_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
*/
-static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
- irq_hw_number_t hw_irq)
+int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw_irq)
{
struct irq_data *data = irq_get_irq_data(virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +410,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_map_generic_chip);
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
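A controller with big-endian mapped registers would pass IRQ_GC_BE_IO so the mask-cache helpers above go through the ioread32be()/iowrite32be() accessors installed by this patch. A sketch, with the domain setup and geometry purely illustrative:

static int example_intc_init(struct irq_domain *domain)
{
	/* 32 irqs per chip, one chip type, big-endian register access. */
	return irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
					      handle_level_irq, 0, 0,
					      IRQ_GC_INIT_MASK_CACHE |
					      IRQ_GC_BE_IO);
}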
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+
+#ifdef CONFIG_PM_SLEEP
+bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
+#else
+static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void
+irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
+static inline void
+irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
+#include <linux/irqdomain.h>
#include "internals.h"
@@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
static inline void free_masks(struct irq_desc *desc) { }
#endif
+void irq_lock_sparse(void)
+{
+ mutex_lock(&sparse_irq_lock);
+}
+
+void irq_unlock_sparse(void)
+{
+ mutex_unlock(&sparse_irq_lock);
+}
+
static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
{
struct irq_desc *desc;
@@ -167,6 +178,12 @@ static void free_desc(unsigned int irq)
unregister_irq_proc(irq, desc);
+ /*
+	 * sparse_irq_lock also protects show_interrupts() and
+	 * kstat_irqs_usr(). Once we have deleted the descriptor from the
+	 * sparse tree we can free it. Lookups from /proc will then fail
+	 * to find the descriptor.
+ */
mutex_lock(&sparse_irq_lock);
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
@@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @lookup: Whether to perform the domain lookup or not
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
+ bool lookup, struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq = hwirq;
+ int ret = 0;
+
+ irq_enter();
+
+#ifdef CONFIG_IRQ_DOMAIN
+ if (lookup)
+ irq = irq_find_mapping(domain, hwirq);
+#endif
+
+ /*
+ * Some hardware gives randomly wrong interrupts. Rather
+ * than crashing, do something sensible.
+ */
+ if (unlikely(!irq || irq >= nr_irqs)) {
+ ack_bad_irq(irq);
+ ret = -EINVAL;
+ } else {
+ generic_handle_irq(irq);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
+
/* Dynamic interrupt handling */
/**
@@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
+/**
+ * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
+ * @irq: The interrupt number
+ * @cpu: The cpu number
+ *
+ * Returns the sum of interrupt counts on @cpu since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+/**
+ * kstat_irqs - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
+
+/**
+ * kstat_irqs_usr - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. Contrary to kstat_irqs() this can be called from any
+ * preemptible context. It's protected against concurrent removal of
+ * an interrupt descriptor when sparse irqs are enabled.
+ */
+unsigned int kstat_irqs_usr(unsigned int irq)
+{
+ int sum;
+
+ irq_lock_sparse();
+ sum = kstat_irqs(irq);
+ irq_unlock_sparse();
+ return sum;
+}
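__handle_domain_irq() is meant to be called from an architecture's low-level interrupt entry, typically through the handle_domain_irq() wrapper (lookup == true). A sketch of a root irqchip handler, with the pending-register read and domain pointer hypothetical:

static void example_handle_irq(struct pt_regs *regs)
{
	u32 hwirq = example_read_pending();	/* hypothetical MMIO read */

	handle_domain_irq(example_domain, hwirq, regs);
}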
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+ irq_hw_number_t hwirq, int node);
+static void irq_domain_check_hierarchy(struct irq_domain *domain);
+
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
* direct mapping
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
 * Allocates and initializes an irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
domain->hwirq_max = hwirq_max;
domain->revmap_size = size;
domain->revmap_direct_max_irq = direct_max;
+ irq_domain_check_hierarchy(domain);
mutex_lock(&irq_domain_mutex);
list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
* pre-map all of the irqs in the domain to virqs starting at first_irq.
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
* Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
domain = __irq_domain_add(of_node, first_hwirq + size,
first_hwirq + size, 0, ops, host_data);
- if (!domain)
- return NULL;
-
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+ if (domain)
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
@@ -231,7 +234,7 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
-static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- unsigned int hint;
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- hint = hwirq % nr_irqs;
- if (hint == 0)
- hint++;
- virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
- if (virq <= 0)
- virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq,
+ of_node_to_nid(domain->of_node));
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
- unsigned int virq;
+ int virq;
domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
return 0;
}
- /* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
- if (!virq)
- return virq;
+ if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq)
+ return virq;
+
+ virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
+ if (virq <= 0)
+ return 0;
+ } else {
+ /* Create mapping */
+ virq = irq_create_mapping(domain, hwirq);
+ if (!virq)
+ return virq;
+ }
/* Set type if specified and different than the current one */
if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
return 0;
if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_get_irq_data(hwirq);
- if (data && (data->domain == domain) && (data->hwirq == hwirq))
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
return hwirq;
}
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+
+static int irq_domain_alloc_descs(int virq, unsigned int cnt,
+ irq_hw_number_t hwirq, int node)
+{
+ unsigned int hint;
+
+ if (virq >= 0) {
+ virq = irq_alloc_descs(virq, virq, cnt, node);
+ } else {
+ hint = hwirq % nr_irqs;
+ if (hint == 0)
+ hint++;
+ virq = irq_alloc_descs_from(hint, cnt, node);
+ if (virq <= 0 && hint > 1)
+ virq = irq_alloc_descs_from(1, cnt, node);
+ }
+
+ return virq;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_domain_add_hierarchy - Add an irq domain into the hierarchy
+ * @parent: Parent irq domain to associate with the new domain
+ * @flags: Irq domain flags associated to the domain
+ * @size: Size of the domain. See below
+ * @node: Optional device-tree node of the interrupt controller
+ * @ops: Pointer to the interrupt domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * If @size is 0 a tree domain is created, otherwise a linear domain.
+ *
+ * If successful the parent is associated with the new domain and the
+ * domain flags are set.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
+ unsigned int flags,
+ unsigned int size,
+ struct device_node *node,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ if (size)
+ domain = irq_domain_add_linear(node, size, ops, host_data);
+ else
+ domain = irq_domain_add_tree(node, ops, host_data);
+ if (domain) {
+ domain->parent = parent;
+ domain->flags |= flags;
+ }
+
+ return domain;
+}
+
+static void irq_domain_insert_irq(int virq)
+{
+ struct irq_data *data;
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && data->chip)
+ domain->name = data->chip->name;
+ }
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+}
+
+static void irq_domain_remove_irq(int virq)
+{
+ struct irq_data *data;
+
+ irq_set_status_flags(virq, IRQ_NOREQUEST);
+ irq_set_chip_and_handler(virq, NULL, NULL);
+ synchronize_irq(virq);
+ smp_mb();
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ }
+}
+
+static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
+ struct irq_data *child)
+{
+ struct irq_data *irq_data;
+
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ if (irq_data) {
+ child->parent_data = irq_data;
+ irq_data->irq = child->irq;
+ irq_data->node = child->node;
+ irq_data->domain = domain;
+ }
+
+ return irq_data;
+}
+
+static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data, *tmp;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ tmp = irq_data->parent_data;
+ irq_data->parent_data = NULL;
+ irq_data->domain = NULL;
+
+ while (tmp) {
+ irq_data = tmp;
+ tmp = tmp->parent_data;
+ kfree(irq_data);
+ }
+ }
+}
+
+static int irq_domain_alloc_irq_data(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct irq_domain *parent;
+ int i;
+
+ /* The outermost irq_data is embedded in struct irq_desc */
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ irq_data->domain = domain;
+
+ for (parent = domain->parent; parent; parent = parent->parent) {
+ irq_data = irq_domain_insert_irq_data(parent, irq_data);
+ if (!irq_data) {
+ irq_domain_free_irq_data(virq, i + 1);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data;
+
+ for (irq_data = irq_get_irq_data(virq); irq_data;
+ irq_data = irq_data->parent_data)
+ if (irq_data->domain == domain)
+ return irq_data;
+
+ return NULL;
+}
+
+/**
+ * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hwirq number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated chip data
+ */
+int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ if (!irq_data)
+ return -ENOENT;
+
+ irq_data->hwirq = hwirq;
+ irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip_data = chip_data;
+
+ return 0;
+}
+
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
+ __irq_set_handler(virq, handler, 0, handler_name);
+ irq_set_handler_data(virq, handler_data);
+}
+
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+
+/**
+ * irq_domain_free_irqs_common - Clear irq_data and free the parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data)
+ irq_domain_reset_irq_data(irq_data);
+ }
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+/**
+ * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_handler_data(virq + i, NULL);
+ irq_set_handler(virq + i, NULL);
+ }
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
+}
+
+static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs)
+{
+ domain->ops->free(domain, irq_base, nr_irqs);
+ if (irq_domain_is_auto_recursive(domain)) {
+ BUG_ON(!domain->parent);
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+ }
+}
+
+static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret = 0;
+ struct irq_domain *parent = domain->parent;
+ bool recursive = irq_domain_is_auto_recursive(domain);
+
+ BUG_ON(recursive && !parent);
+ if (recursive)
+ ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
+ nr_irqs, arg);
+ if (ret >= 0)
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0 && recursive)
+ irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
+
+ return ret;
+}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base:	allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ *
+ * Allocate IRQ numbers and initialize all data structures to support
+ * hierarchical IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns an error code or the allocated IRQ number.
+ *
+ * The whole process of setting up an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), allocates the IRQ
+ * descriptors and the required hardware resources. The second step,
+ * irq_domain_activate_irq(), programs the hardware with the preallocated
+ * resources. This makes it easier to roll back when resource
+ * allocation fails.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc)
+{
+ int i, ret, virq;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
+ if (realloc && irq_base >= 0) {
+ virq = irq_base;
+ } else {
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ if (virq < 0) {
+ pr_debug("cannot allocate IRQ(base %d, count %d)\n",
+ irq_base, nr_irqs);
+ return virq;
+ }
+ }
+
+ if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
+ pr_debug("cannot allocate memory for IRQ%d\n", virq);
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ mutex_lock(&irq_domain_mutex);
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
+ if (ret < 0) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_insert_irq(virq + i);
+ mutex_unlock(&irq_domain_mutex);
+
+ return virq;
+
+out_free_irq_data:
+ irq_domain_free_irq_data(virq, nr_irqs);
+out_free_desc:
+ irq_free_descs(virq, nr_irqs);
+ return ret;
+}
+
+/**
+ * irq_domain_free_irqs - Free IRQ number and associated data structures
+ * @virq: base IRQ number
+ * @nr_irqs: number of IRQs to free
+ */
+void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ int i;
+
+ if (WARN(!data || !data->domain || !data->domain->ops->free,
+ "NULL pointer, cannot free irq\n"))
+ return;
+
+ mutex_lock(&irq_domain_mutex);
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_remove_irq(virq + i);
+ irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_domain_free_irq_data(virq, nr_irqs);
+ irq_free_descs(virq, nr_irqs);
+}
+
+/**
+ * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to allocate
+ * @arg: Allocation data (arch/domain specific)
+ *
+ * Check whether the domain has been set up as auto-recursive. If not,
+ * allocate through the parent domain.
+ */
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs,
+ void *arg)
+{
+ /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
+ if (irq_domain_is_auto_recursive(domain))
+ return 0;
+
+ domain = domain->parent;
+ if (domain)
+ return irq_domain_alloc_irqs_recursive(domain, irq_base,
+ nr_irqs, arg);
+ return -ENOSYS;
+}
+
+/**
+ * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to free
+ *
+ * Check whether the domain has been set up as auto-recursive. If not,
+ * free through the parent domain.
+ */
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs)
+{
+ /* irq_domain_free_irqs_recursive() will call parent's free */
+ if (!irq_domain_is_auto_recursive(domain) && domain->parent)
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+}
+
+/**
+ * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
+ * interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * This is the second step to call domain_ops->activate to program interrupt
+ * controllers, so the interrupt could actually get delivered.
+ */
+void irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+/**
+ * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
+ * deactivate interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * It calls domain_ops->deactivate to program interrupt controllers to disable
+ * interrupt delivery.
+ */
+void irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+ /* Hierarchy irq_domains must implement callback alloc() */
+ if (domain->ops->alloc)
+ domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
+}
+#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+
+ return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+}
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
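Putting the hierarchy API together: a stacked domain's .alloc() typically allocates from its parent first, then fills in its own level. A sketch under those assumptions (the example_* names are hypothetical):

static int example_domain_alloc(struct irq_domain *domain, unsigned int virq,
				unsigned int nr_irqs, void *arg)
{
	irq_hw_number_t hwirq = example_get_hwirq(arg);	/* hypothetical */
	unsigned int i;
	int ret;

	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
					      &example_chip, NULL);
	return 0;
}

static const struct irq_domain_ops example_domain_ops = {
	.alloc	= example_domain_alloc,
	.free	= irq_domain_free_irqs_common,
};

The domain itself would then come from irq_domain_add_hierarchy(parent, 0, 0, node, &example_domain_ops, NULL), which sets domain->parent and thus makes the *_parent calls above work.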
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Do not use this for shutdown scenarios where you must be sure
* that all parts (hardirq and threaded handler) have completed.
*
+ * Returns: false if a threaded handler is active.
+ *
* This function may be called - with care - from IRQ context.
*/
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc)
+ if (desc) {
__synchronize_hardirq(desc);
+ return !atomic_read(&desc->threads_active);
+ }
+
+ return true;
}
EXPORT_SYMBOL(synchronize_hardirq);
@@ -183,6 +189,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
cpumask_copy(data->affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
@@ -242,6 +249,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
+ /* set the initial affinity to prevent every interrupt being on CPU0 */
+ if (m)
+ __irq_set_affinity(irq, m, false);
return 0;
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
@@ -382,14 +392,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+void __disable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
- return;
- desc->istate |= IRQS_SUSPENDED;
- }
-
if (!desc->depth++)
irq_disable(desc);
}
@@ -401,7 +405,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq, false);
+ __disable_irq(desc, irq);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -442,20 +446,34 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
+/**
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this function while
+ *	holding a resource the hard IRQ handler may need, you will deadlock.
+ *
+ * When used to optimistically disable an interrupt from atomic context
+ * the return value must be checked.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
{
- if (resume) {
- if (!(desc->istate & IRQS_SUSPENDED)) {
- if (!desc->action)
- return;
- if (!(desc->action->flags & IRQF_FORCE_RESUME))
- return;
- /* Pretend that it got disabled ! */
- desc->depth++;
- }
- desc->istate &= ~IRQS_SUSPENDED;
- }
+ if (!__disable_irq_nosync(irq))
+ return synchronize_hardirq(irq);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+void __enable_irq(struct irq_desc *desc, unsigned int irq)
+{
switch (desc->depth) {
case 0:
err_out:
@@ -497,7 +515,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -618,6 +636,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
@@ -1218,6 +1237,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
*old_ptr = new;
+ irq_pm_install_action(desc, new);
+
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -1228,7 +1249,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1357,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
+ irq_pm_remove_action(desc, action);
+
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_shutdown(desc);
@@ -1483,8 +1506,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
+ *
+ * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
+ * it cannot be set along with IRQF_NO_SUSPEND.
*/
- if ((irqflags & IRQF_SHARED) && !dev_id)
+ if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
+ ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
desc = irq_to_desc(irq);
@@ -1770,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+
+/**
+ *	irq_get_irqchip_state - returns the irqchip state of an interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ *	@state: a pointer to a boolean where the state is to be stored
+ *
+ * This call snapshots the internal irqchip state of an
+ * interrupt, returning into @state the bit corresponding to
+ *	state @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt,
+ * depending on the value of @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool val)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_set_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_set_irqchip_state(data, which, val);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
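disable_hardirq() exists for optimistic disables from atomic context (netpoll-style polling being the motivating case); the return value decides whether the device may be touched. A sketch, with the poll routine hypothetical:

	if (disable_hardirq(irq)) {
		/* No hard or threaded handler is running: safe to poll. */
		example_poll_device();
	}
	enable_irq(irq);

The line is disabled in either case, so the call must always be balanced with enable_irq().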
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..474de5cb394d
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,337 @@
+/*
+ * linux/kernel/irq/msi.c
+ *
+ * Copyright (C) 2014 Intel Corp.
+ * Author: Jiang Liu <jiang.liu@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file contains common code to support Message Signaled Interrupts
+ * (MSI) for PCI-compatible and non-PCI-compatible devices.
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+/* Temporary solution for building; will be removed later */
+#include <linux/pci.h>
+
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+ *msg = entry->msg;
+}
+
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_get_msi_desc(irq);
+
+ __get_cached_msi_msg(entry, msg);
+}
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static inline void irq_chip_write_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ data->chip->irq_write_msi_msg(data, msg);
+}
+
+/**
+ * msi_domain_set_affinity - Generic affinity setter function for MSI domains
+ * @irq_data: The irq data associated to the interrupt
+ * @mask: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Intended to be used by MSI interrupt controllers which are
+ * implemented with hierarchical domains.
+ */
+int msi_domain_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct msi_msg msg;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+ }
+
+ return ret;
+}
+
+static void msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static void msi_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ memset(&msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
+ int i, ret;
+
+ if (irq_find_mapping(domain, hwirq) > 0)
+ return -EEXIST;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
+ if (ret < 0) {
+ if (ops->msi_free) {
+ for (i--; i >= 0; i--)
+ ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct msi_domain_info *info = domain->host_data;
+ int i;
+
+ if (info->ops->msi_free) {
+ for (i = 0; i < nr_irqs; i++)
+ info->ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+ .alloc = msi_domain_alloc,
+ .free = msi_domain_free,
+ .activate = msi_domain_activate,
+ .deactivate = msi_domain_deactivate,
+};
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hwirq;
+}
+
+static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
+ struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+#else
+#define msi_domain_ops_get_hwirq NULL
+#define msi_domain_ops_prepare NULL
+#define msi_domain_ops_set_desc NULL
+#endif /* !GENERIC_MSI_DOMAIN_OPS */
+
+static int msi_domain_ops_init(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
+ info->chip_data);
+ if (info->handler && info->handler_name) {
+ __irq_set_handler(virq, info->handler, 0, info->handler_name);
+ if (info->handler_data)
+ irq_set_handler_data(virq, info->handler_data);
+ }
+ return 0;
+}
+
+static int msi_domain_ops_check(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ return 0;
+}
+
+static struct msi_domain_ops msi_domain_ops_default = {
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+};
+
+static void msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+ struct msi_domain_ops *ops = info->ops;
+
+ if (ops == NULL) {
+ info->ops = &msi_domain_ops_default;
+ return;
+ }
+
+ if (ops->get_hwirq == NULL)
+ ops->get_hwirq = msi_domain_ops_default.get_hwirq;
+ if (ops->msi_init == NULL)
+ ops->msi_init = msi_domain_ops_default.msi_init;
+ if (ops->msi_check == NULL)
+ ops->msi_check = msi_domain_ops_default.msi_check;
+ if (ops->msi_prepare == NULL)
+ ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->set_desc == NULL)
+ ops->set_desc = msi_domain_ops_default.set_desc;
+}
+
+static void msi_domain_update_chip_ops(struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ BUG_ON(!chip);
+ if (!chip->irq_mask)
+ chip->irq_mask = pci_msi_mask_irq;
+ if (!chip->irq_unmask)
+ chip->irq_unmask = pci_msi_unmask_irq;
+ if (!chip->irq_set_affinity)
+ chip->irq_set_affinity = msi_domain_set_affinity;
+}
+
+/**
+ * msi_create_irq_domain - Create a MSI interrupt domain
+ * @node: Optional device-tree node of the interrupt controller
+ * @info: MSI domain info
+ * @parent: Parent irq domain
+ */
+struct irq_domain *msi_create_irq_domain(struct device_node *node,
+ struct msi_domain_info *info,
+ struct irq_domain *parent)
+{
+ if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
+ info);
+}
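+
+/*
+ * Illustrative sketch of a caller (all names are assumptions): an MSI
+ * controller driver can rely on the defaults filled in above by setting
+ * the MSI_FLAG_USE_DEF_* flags:
+ *
+ *	static struct irq_chip example_msi_chip = {
+ *		.name			= "example-MSI",
+ *		.irq_write_msi_msg	= example_write_msi_msg,
+ *	};
+ *
+ *	static struct msi_domain_info example_msi_info = {
+ *		.flags	= MSI_FLAG_USE_DEF_DOM_OPS |
+ *			  MSI_FLAG_USE_DEF_CHIP_OPS,
+ *		.chip	= &example_msi_chip,
+ *	};
+ *
+ *	domain = msi_create_irq_domain(node, &example_msi_info, parent);
+ */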
+
+/**
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
+ */
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ msi_alloc_info_t arg;
+ struct msi_desc *desc;
+ int i, ret, virq = -1;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ if (ret)
+ return ret;
+
+ for_each_msi_entry(desc, dev) {
+ ops->set_desc(&arg, desc);
+ if (info->flags & MSI_FLAG_IDENTITY_MAP)
+ virq = (int)ops->get_hwirq(info, &arg);
+ else
+ virq = -1;
+
+ virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ dev_to_node(dev), &arg, false);
+ if (virq < 0) {
+ ret = -ENOSPC;
+ if (ops->handle_error)
+ ret = ops->handle_error(domain, desc, ret);
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, ret);
+ return ret;
+ }
+
+ for (i = 0; i < desc->nvec_used; i++)
+ irq_set_msi_desc_off(virq, i, desc);
+ }
+
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, 0);
+
+ for_each_msi_entry(desc, dev) {
+ if (desc->nvec_used == 1)
+ dev_dbg(dev, "irq %d for MSI\n", virq);
+ else
+ dev_dbg(dev, "irq [%d-%d] for MSI\n",
+ virq, virq + desc->nvec_used - 1);
+ }
+
+ return 0;
+}
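+
+/*
+ * Illustrative pairing with msi_domain_free_irqs() below (a sketch;
+ * 'dev' and 'nvec' are assumptions, and the msi_desc entries must have
+ * been set up on the device beforehand):
+ *
+ *	ret = msi_domain_alloc_irqs(domain, dev, nvec);
+ *	if (ret)
+ *		return ret;
+ *	...
+ *	msi_domain_free_irqs(domain, dev);
+ */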
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_desc *desc;
+
+ for_each_msi_entry(desc, dev) {
+ /*
+ * We might have failed to allocate an MSI early
+ * enough that there is no IRQ associated with this
+ * entry. If that's the case, don't do anything.
+ */
+ if (desc->irq) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
+ }
+}
+
+/**
+ * msi_get_domain_info - Get the MSI interrupt domain info for @domain
+ * @domain: The interrupt domain to retrieve data from
+ *
+ * Returns the pointer to the msi_domain_info stored in
+ * @domain->host_data.
+ */
+struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
+{
+ return (struct msi_domain_info *)domain->host_data;
+}
+
+#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..5204a6d1b985 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,110 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include "internals.h"
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+ if (irqd_is_wakeup_armed(&desc->irq_data)) {
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_wakeup();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions++;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth++;
+
+ WARN_ON_ONCE(desc->force_resume_depth &&
+ desc->force_resume_depth != desc->nr_actions);
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth++;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth++;
+
+ WARN_ON_ONCE(desc->no_suspend_depth &&
+ (desc->no_suspend_depth +
+ desc->cond_suspend_depth) != desc->nr_actions);
+}
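+
+/*
+ * Worked example of the invariant above (an illustration): with three
+ * shared actions, two requested with IRQF_NO_SUSPEND and one with
+ * IRQF_COND_SUSPEND, no_suspend_depth + cond_suspend_depth equals
+ * nr_actions and the line stays enabled during suspend; mixing
+ * IRQF_NO_SUSPEND with a plain action triggers the WARN_ON_ONCE().
+ */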
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions--;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth--;
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth--;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth--;
+}
+
+static bool suspend_device_irq(struct irq_desc *desc, int irq)
+{
+ if (!desc->action || desc->no_suspend_depth)
+ return false;
+
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ /*
+ * We return true here to force the caller to issue
+ * synchronize_irq(). We need to make sure that the
+ * IRQD_WAKEUP_ARMED is visible before we return from
+ * suspend_device_irqs().
+ */
+ return true;
+ }
+
+ desc->istate |= IRQS_SUSPENDED;
+ __disable_irq(desc, irq);
+
+ /*
+ * Hardware which has no wakeup source configuration facility
+ * requires that the non wakeup interrupts are masked at the
+ * chip level. The chip implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ return true;
+}
+
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
- * During system-wide suspend or hibernation device drivers need to be prevented
- * from receiving interrupts and this function is provided for this purpose.
- * It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQS_SUSPENDED flag for each of them.
+ * During system-wide suspend or hibernation device drivers need to be
+ * prevented from receiving interrupts and this function is provided
+ * for this purpose.
+ *
+ * So we disable all interrupts and mark them IRQS_SUSPENDED except
+ * for those which are unused, those which are marked as not
+ * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
+ * set, and those which are marked as active wakeup sources.
+ *
+ * The active wakeup sources are handled by the flow handler entry
+ * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
+ * interrupt and notifies the pm core about the wakeup.
*/
void suspend_device_irqs(void)
{
@@ -28,18 +121,36 @@ void suspend_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
+ bool sync;
raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, true);
+ sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
- for_each_irq_desc(irq, desc)
- if (desc->istate & IRQS_SUSPENDED)
+ if (sync)
synchronize_irq(irq);
+ }
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
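/*
 * Illustrative sketch of a driver-side setup that interacts with the
 * logic above (handler and names are assumptions):
 *
 *	ret = request_irq(irq, example_handler,
 *			  IRQF_SHARED | IRQF_COND_SUSPEND, "example", dev);
 *	if (!ret)
 *		enable_irq_wake(irq);
 *
 * With the wakeup armed, suspend_device_irq() leaves the line enabled
 * and marks it IRQD_WAKEUP_ARMED instead of disabling it.
 */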
+static void resume_irq(struct irq_desc *desc, int irq)
+{
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+
+ if (desc->istate & IRQS_SUSPENDED)
+ goto resume;
+
+ /* Force resume the interrupt? */
+ if (!desc->force_resume_depth)
+ return;
+
+ /* Pretend that it got disabled! */
+ desc->depth++;
+resume:
+ desc->istate &= ~IRQS_SUSPENDED;
+ __enable_irq(desc, irq);
+}
+
static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
@@ -54,7 +165,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- __enable_irq(desc, irq, true);
+ resume_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -93,38 +204,3 @@ void resume_device_irqs(void)
resume_irqs(false);
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
-
-/**
- * check_wakeup_irqs - check if any wake-up interrupts are pending
- */
-int check_wakeup_irqs(void)
-{
- struct irq_desc *desc;
- int irq;
-
- for_each_irq_desc(irq, desc) {
- /*
- * Only interrupts which are marked as wakeup source
- * and have not been disabled before the suspend check
- * can abort suspend.
- */
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->depth == 1 && desc->istate & IRQS_PENDING)
- return -EBUSY;
- continue;
- }
- /*
- * Check the non wakeup interrupts whether they need
- * to be masked before finally going into suspend
- * state. That's for hardware which has no wakeup
- * source configuration facility. The chip
- * implementation indicates that with
- * IRQCHIP_MASK_ON_SUSPEND.
- */
- if (desc->istate & IRQS_SUSPENDED &&
- irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
- mask_irq(desc);
- }
-
- return 0;
-}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..df2f4642d1e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
#include "internals.h"
+/*
+ * Access rules:
+ *
+ * procfs protects read/write of /proc/irq/N/ files against a
+ * concurrent free of the interrupt descriptor. remove_proc_entry()
+ * immediately prevents new reads/writes from happening and waits for
+ * already running read/write functions to complete.
+ *
+ * We remove the proc entries first and then delete the interrupt
+ * descriptor from the radix tree and free it. So it is guaranteed
+ * that irq_to_desc(N) is valid as long as the read/writes are
+ * permitted by procfs.
+ *
+ * The read from /proc/interrupts is a different problem because there
+ * is no protection. So the lookup and the access to irqdesc
+ * information must be protected by sparse_irq_lock.
+ */
static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
@@ -29,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v)
mask = desc->pending_mask;
#endif
if (type)
- seq_cpumask_list(m, mask);
+ seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
else
- seq_cpumask(m, mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
return 0;
}
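/*
 * For reference (an illustration): with CPUs 0-3 set, "%*pb" prints the
 * mask as a hex bitmap ("f") while "%*pbl" prints a range list ("0-3");
 * cpumask_pr_args() expands to the nr_cpu_ids width plus the mask bits.
 */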
@@ -50,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
cpumask_copy(mask, desc->affinity_hint);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- seq_cpumask(m, mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
return 0;
@@ -169,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = {
static int default_affinity_show(struct seq_file *m, void *v)
{
- seq_cpumask(m, irq_default_affinity);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
return 0;
}
@@ -437,9 +451,10 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
+ irq_lock_sparse();
desc = irq_to_desc(i);
if (!desc)
- return 0;
+ goto outsparse;
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_online_cpu(j)
@@ -479,6 +494,8 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
out:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+outsparse:
+ irq_unlock_sparse();
return 0;
}
#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/smp.h>
#include <asm/processor.h>
-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
/*
* Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
*/
}
+#ifdef CONFIG_SMP
/*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
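+
+/*
+ * Illustrative sketch (callback and 'cpu' are assumptions): deferring a
+ * callback to hard interrupt context on a specific CPU.
+ *
+ *	static void example_func(struct irq_work *work)
+ *	{
+ *		... runs from the IPI on the target CPU ...
+ *	}
+ *
+ *	static struct irq_work example_work;
+ *
+ *	init_irq_work(&example_work, example_func);
+ *	irq_work_queue_on(&example_work, cpu);
+ */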
+
+/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
- llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
- /*
- * If the work is not "lazy" or the tick is stopped, raise the irq
- * work interrupt (if supported by the arch), otherwise, just wait
- * for the next tick.
- */
- if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
- if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
@@ -90,11 +111,14 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
bool irq_work_needs_cpu(void)
{
- struct llist_head *this_list;
+ struct llist_head *raised, *lazy;
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
- return false;
+ raised = this_cpu_ptr(&raised_list);
+ lazy = this_cpu_ptr(&lazy_list);
+
+ if (llist_empty(raised) || arch_irq_work_has_interrupt())
+ if (llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -102,28 +126,18 @@ bool irq_work_needs_cpu(void)
return true;
}
-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
{
unsigned long flags;
struct irq_work *work;
- struct llist_head *this_list;
struct llist_node *llnode;
+ BUG_ON(!irqs_disabled());
- /*
- * Reset the "raised" state right before we check the list because
- * an NMI may enqueue after we find the list empty from the runner.
- */
- __this_cpu_write(irq_work_raised, 0);
- barrier();
-
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
+ if (llist_empty(list))
return;
- BUG_ON(!irqs_disabled());
-
- llnode = llist_del_all(this_list);
+ llnode = llist_del_all(list);
while (llnode != NULL) {
work = llist_entry(llnode, struct irq_work, llnode);
@@ -149,16 +163,25 @@ static void __irq_work_run(void)
}
/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
+ * hotplug calls this through:
+ * hotplug_cfd() -> flush_smp_call_function_queue()
*/
void irq_work_run(void)
{
- BUG_ON(!in_irq());
- __irq_work_run();
+ irq_work_run_list(this_cpu_ptr(&raised_list));
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
+void irq_work_tick(void)
+{
+ struct llist_head *raised = this_cpu_ptr(&raised_list);
+
+ if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
+ irq_work_run_list(raised);
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
@@ -171,35 +194,3 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int irq_work_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_DYING:
- /* Called from stop_machine */
- if (WARN_ON_ONCE(cpu != smp_processor_id()))
- break;
- __irq_work_run();
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_notify;
-
-static __init int irq_work_init_cpu_notifier(void)
-{
- cpu_notify.notifier_call = irq_work_cpu_notify;
- cpu_notify.priority = 0;
- register_cpu_notifier(&cpu_notify);
- return 0;
-}
-device_initcall(irq_work_init_cpu_notifier);
-
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..5c5987f10819 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
- return sprintf(buffer, "0x%lx", address);
+ return sprintf(buffer, "0x%lx", address - symbol_offset);
if (name != buffer)
strcpy(buffer, name);
@@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file)
* using get_symbol_offset for every symbol.
*/
struct kallsym_iter *iter;
- int ret;
-
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
if (!iter)
return -ENOMEM;
reset_iter(iter, 0);
- ret = seq_open(file, &kallsyms_op);
- if (ret == 0)
- ((struct seq_file *)file->private_data)->private = iter;
- else
- kfree(iter);
- return ret;
+ return 0;
}
#ifdef CONFIG_KGDB_KDB
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
*/
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
{
- long ret;
+ long t1, t2;
- ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+ t1 = kptr_obfuscate((long)v1, type);
+ t2 = kptr_obfuscate((long)v2, type);
- return (ret < 0) | ((ret > 0) << 1);
+ return (t1 < t2) | ((t1 > t2) << 1);
}
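/*
 * The encoding above maps the ordering onto {0, 1, 2}: equal obfuscated
 * values yield 0, t1 < t2 yields 1 and t1 > t2 yields 2, matching the
 * documented kcmp() return convention without the signed overflow the
 * old subtraction could hit.
 */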
/* The caller must have pinned the task */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -40,6 +42,9 @@
#include <asm/io.h>
#include <asm/sections.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+/*
+ * Declare these symbols weak so that if the architecture provides a
+ * purgatory, these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+#ifdef CONFIG_KEXEC_FILE
+static int kexec_calculate_store_digests(struct kimage *image);
+#endif
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
unsigned long dest);
-static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_user_segment_list(struct kimage *image,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
+ int ret;
size_t segment_bytes;
- struct kimage *image;
- unsigned long i;
- int result;
-
- /* Allocate a controlling structure */
- result = -ENOMEM;
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- goto out;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->start = entry;
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
image->nr_segments = nr_segments;
segment_bytes = nr_segments * sizeof(*segments);
- result = copy_from_user(image->segment, segments, segment_bytes);
- if (result) {
- result = -EFAULT;
- goto out;
- }
+ ret = copy_from_user(image->segment, segments, segment_bytes);
+ if (ret)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
/*
* Verify we have good destination addresses. The caller is
@@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- goto out;
+ return result;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- goto out;
+ return result;
}
/* Verify our destination addresses do not overlap.
@@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- goto out;
+ return result;
}
}
@@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- goto out;
+ return result;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
- return result;
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+static struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
}
static void kimage_free_page_list(struct list_head *list);
-static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
{
- int result;
+ int ret;
struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+
+ if (kexec_on_panic) {
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ return -EADDRNOTAVAIL;
+ }
/* Allocate and initialize a controlling structure */
- image = NULL;
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
- goto out;
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->start = entry;
+
+ ret = copy_user_segment_list(image, nr_segments, segments);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
+ /* Enable the special crash kernel control page allocation policy. */
+ if (kexec_on_panic) {
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
/*
 * Find a location for the control code buffer, and add it to
 * the vector of segments so that its pages will also be
* counted as destination pages.
*/
- result = -ENOMEM;
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_image;
}
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free;
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
kimage_free_page_list(&image->control_pages);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
-static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+#ifdef CONFIG_KEXEC_FILE
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
- int result;
- struct kimage *image;
- unsigned long i;
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
- image = NULL;
- /* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
- result = -EADDRNOTAVAIL;
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
goto out;
}
- /* Allocate and initialize a controlling structure */
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
goto out;
+ }
- /* Enable the special crash kernel control page
- * allocation policy.
- */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out_free;
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
}
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by the kernel, initrd, and command line. These are
+ * temporary memory allocations which are no longer needed after the
+ * buffers have been loaded into separate segments and copied elsewhere.
+ */
+static void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
/*
- * Find a location for the control code buffer, and add
- * the vector of segments so that it's pages will also be
- * counted as destination pages.
+ * The above call should have asked the image loader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be OK now to free it up.
*/
- result = -ENOMEM;
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode, the list of segments is prepared by the kernel. Copy the
+ * relevant data from user space, do error checking and prepare the
+ * segment list.
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
static int kimage_is_destination_range(struct kimage *image,
unsigned long start,
@@ -583,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
destination &= PAGE_MASK;
result = kimage_add_entry(image, destination | IND_DESTINATION);
- if (result == 0)
- image->destination = destination;
return result;
}
@@ -596,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
page &= PAGE_MASK;
result = kimage_add_entry(image, page | IND_SOURCE);
- if (result == 0)
- image->destination += PAGE_SIZE;
return result;
}
@@ -609,7 +878,7 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->dest_pages);
/* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unuseable_pages);
+ kimage_free_page_list(&image->unusable_pages);
}
static void kimage_terminate(struct kimage *image)
@@ -663,6 +932,14 @@ static void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This path is hit if
+ * an error occurred long after the buffers were allocated.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
kfree(image);
}
@@ -732,7 +1009,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page cannot be used file it away */
if (page_to_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unuseable_pages);
+ list_add(&page->lru, &image->unusable_pages);
continue;
}
addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1068,14 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -826,7 +1107,11 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- result = copy_from_user(ptr, buf, uchunk);
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
result = -EFAULT;
@@ -834,7 +1119,10 @@ static int kimage_load_normal_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -851,10 +1139,14 @@ static int kimage_load_crash_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -877,7 +1169,12 @@ static int kimage_load_crash_segment(struct kimage *image,
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- result = copy_from_user(ptr, buf, uchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
if (result) {
@@ -886,7 +1183,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -984,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > 0) {
unsigned long i;
- /* Loading another kernel to reboot into */
- if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_normal_alloc(&image, entry,
- nr_segments, segments);
- /* Loading another kernel to switch to if this one crashes */
- else if (flags & KEXEC_ON_CRASH) {
- /* Free any current crash dump kernel before
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
* we corrupt it.
*/
+
kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_crash_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
crash_map_reserved_pages();
+ } else {
+ /* Loading another kernel to reboot into. */
+
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
}
if (result)
goto out;
@@ -1077,6 +1380,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
}
#endif
+#ifdef CONFIG_KEXEC_FILE
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, the new kernel gets loaded in the reserved
+ * region. It is the same memory where the old crash kernel might be
+ * loaded. Free any current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after the image has been loaded.
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
+#endif /* CONFIG_KEXEC_FILE */
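+
+/*
+ * Illustrative userspace sketch (fds, paths and the command line are
+ * assumptions): loading a kernel through the new syscall. Note that
+ * cmdline_len must count the terminating NUL, as checked above.
+ *
+ *	kernel_fd = open("/boot/vmlinuz", O_RDONLY);
+ *	initrd_fd = open("/boot/initrd.img", O_RDONLY);
+ *	ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
+ *		      strlen(cmdline) + 1, cmdline, 0);
+ */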
+
void crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1376,7 +1758,6 @@ static __initdata char *suffix_tbl[] = {
*/
static int __init parse_crashkernel_suffix(char *cmdline,
unsigned long long *crash_size,
- unsigned long long *crash_base,
const char *suffix)
{
char *cur = cmdline;
@@ -1465,7 +1846,7 @@ static int __init __parse_crashkernel(char *cmdline,
if (suffix)
return parse_crashkernel_suffix(ck_cmdline, crash_size,
- crash_base, suffix);
+ suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1632,6 +2013,672 @@ static int __init crash_save_vmcoreinfo_init(void)
subsys_initcall(crash_save_vmcoreinfo_init);
+#ifdef CONFIG_KEXEC_FILE
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will move on to the next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top-down within the RAM range if requested,
+ * otherwise allocate bottom-up.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently, adding a segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add a buffer after allocating
+ * control pages. All segments need to be placed before any
+ * control pages are allocated, because the control page
+ * allocation logic goes through the list of segments to make
+ * sure there are no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
+ *load_addr = ksegment->mem;
+ return 0;
+}
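+
+/*
+ * Illustrative sketch of a caller (an arch image loader; buffer, sizes
+ * and limits are assumptions):
+ *
+ *	unsigned long kernel_load_addr;
+ *
+ *	ret = kexec_add_buffer(image, kernel_buf, kernel_len,
+ *			       ALIGN(kernel_len, PAGE_SIZE), PAGE_SIZE,
+ *			       0, ULONG_MAX, true, &kernel_load_addr);
+ */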
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions) {
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ crypto_free_shash(tfm);
+out:
+ return ret;
+}
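+
+/*
+ * The shash pattern used above, reduced to a single buffer for reference
+ * (a sketch; 'buf', 'len' and 'digest' are assumptions and error checks
+ * are elided):
+ *
+ *	tfm = crypto_alloc_shash("sha256", 0, 0);
+ *	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
+ *		       GFP_KERNEL);
+ *	desc->tfm = tfm;
+ *	desc->flags = 0;
+ *	crypto_shash_init(desc);
+ *	crypto_shash_update(desc, buf, len);
+ *	crypto_shash_final(desc, digest);
+ *	kfree(desc);
+ *	crypto_free_shash(tfm);
+ */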
+
+/* Actually load purgatory. A lot of code is taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to the section headers in purgatory, which are
+ * read-only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We cannot modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We have multiple copies of sections. The first copy is the one
+ * embedded in the kernel in a read-only section. Some of these
+ * sections will be copied to a temporary buffer and relocated.
+ * These sections will finally be copied to their final destination
+ * at segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modified ->sh_offset to keep the source address */
+ src = (unsigned char *)sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store the load address and source address of the section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
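[Editor's note] The sizing pass above is easier to follow with concrete numbers. Below is a minimal stand-alone sketch of the same arithmetic (user-space C; the section sizes and alignments are made up, and ALIGN is redefined locally since this is not kernel code):

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	/* Pretend purgatory has .text (0x1200 bytes, 4K align), .data
	 * (0x44 bytes, 8 align) and .bss (0x30 bytes, 16 align). */
	unsigned long sizes[] = { 0x1200, 0x44 }, aligns[] = { 0x1000, 8 };
	unsigned long buf_sz = 0, buf_align = 1, bss_sz, bss_align = 16;
	unsigned long bss_pad = 0;
	int i;

	for (i = 0; i < 2; i++) {		/* SHF_ALLOC, not SHT_NOBITS */
		if (buf_align < aligns[i])
			buf_align = aligns[i];
		buf_sz = ALIGN(buf_sz, aligns[i]) + sizes[i];
	}
	bss_sz = ALIGN(0, bss_align) + 0x30;	/* the SHT_NOBITS pass */

	if (buf_sz & (bss_align - 1))
		bss_pad = bss_align - (buf_sz & (bss_align - 1));

	/* Prints buf_sz=0x1244 bss_pad=0xc memsz=0x1280. */
	printf("buf_sz=%#lx bss_pad=%#lx memsz=%#lx\n",
	       buf_sz, bss_pad, buf_sz + bss_pad + bss_sz);
	return 0;
}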
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For sections of type SHT_RELA/SHT_REL, ->sh_link contains the
+ * section header index of the associated symbol table, and
+ * ->sh_info contains the section header index of the section to
+ * which the relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contains the section header index of the
+ * associated string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * The respective architecture must provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
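[Editor's note] The arch hooks above walk one relocation section at a time and patch the bytes already copied into the temporary buffer. As a rough illustration only (not the real arch_kexec_apply_relocations_add(); the helper name, its parameters, and the single handled relocation type are all invented here), a 64-bit absolute RELA entry would be resolved along these lines, relying on the ->sh_offset/->sh_addr convention set up by __kexec_load_purgatory():

/* Hypothetical sketch of resolving one Elf64_Rela entry. */
static int apply_one_rela_sketch(Elf64_Shdr *sechdrs, int target,
				 Elf64_Rela *rel, Elf64_Sym *symtab)
{
	Elf64_Sym *sym = &symtab[ELF64_R_SYM(rel->r_info)];
	/* Patch the temporary copy of the target section (->sh_offset)... */
	void *loc = (void *)(sechdrs[target].sh_offset + rel->r_offset);
	/* ...with the value the symbol will have at run time (->sh_addr). */
	unsigned long val = sechdrs[sym->st_shndx].sh_addr +
			    sym->st_value + rel->r_addend;

	switch (ELF64_R_TYPE(rel->r_info)) {
	case R_X86_64_64:			/* 64-bit absolute */
		*(unsigned long *)loc = val;
		return 0;
	default:
		return -ENOEXEC;		/* unhandled type */
	}
}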
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
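[Editor's note] An arch-specific image loader would then call this roughly as follows; the address range and variable names are illustrative only:

	unsigned long purgatory_load_addr;
	int ret;

	/* Place purgatory anywhere in an illustrative range, top down. */
	ret = kexec_load_purgatory(image, 0x3000, -1UL, 1,
				   &purgatory_load_addr);
	if (ret)
		return ret;
	/* purgatory_load_addr now feeds into the boot/entry setup. */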
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Return the address where the symbol will finally be loaded after
+ * kexec_load_segment().
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set the value of a symbol. If "get_value" is true, the symbol
+ * value is returned in buf; otherwise the symbol value is set based on
+ * the value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (char *)sechdrs[sym->st_shndx].sh_offset + sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
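[Editor's note] A typical caller pushes parameters into purgatory with this before the segments are loaded. A hedged sketch (the symbol name "my_digest" is hypothetical; real callers use whatever global the purgatory code actually declares):

	u8 digest[32];
	int ret;

	/* ... compute digest over the loaded segments ... */
	ret = kexec_purgatory_get_set_symbol(image, "my_digest", digest,
					     sizeof(digest), false /* set */);
	if (ret)
		return ret;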
+#endif /* CONFIG_KEXEC_FILE */
+
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
-/*
- * kmod_thread_locker is used for deadlock avoidance. There is no explicit
- * locking to protect this global - it is private to the singleton khelper
- * thread and should only ever be modified by that thread.
- */
-static const struct task_struct *kmod_thread_locker;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...)
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
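[Editor's note] The xchg() above implements an ownership hand-off: whichever side NULLs sub_info->complete first wins, and the loser's cleanup falls to the winner. The same pattern in isolation (a generic sketch, not the kmod.c code; struct job and both helper names are invented):

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct job {
	struct completion *complete;	/* NULL once claimed */
};

/* Helper side, mirroring umh_complete(): */
static void helper_done(struct job *j)
{
	struct completion *comp = xchg(&j->complete, NULL);

	if (comp)
		complete(comp);		/* a waiter exists: wake it */
	else
		kfree(j);		/* waiter gone or never waited: we free */
}

/* Waiter side, aborting on a signal (cf. call_usermodehelper_exec()): */
static void waiter_abort(struct job *j, struct completion *done)
{
	if (xchg(&j->complete, NULL))
		return;			/* claimed: helper_done() frees j */
	/* helper_done() beat us to it and will complete(); wait, then free. */
	wait_for_completion(done);
	kfree(j);
}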
+
/*
* This is the task which runs the usermode application
*/
@@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data)
retval = -ENOMEM;
new = prepare_kernel_cred(current);
if (!new)
- goto fail;
+ goto out;
spin_lock(&umh_sysctl_lock);
new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data)
retval = sub_info->init(sub_info, new);
if (retval) {
abort_creds(new);
- goto fail;
+ goto out;
}
}
@@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data)
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
if (!retval)
return 0;
-
- /* Exec failed? */
-fail:
- sub_info->retval = retval;
do_exit(0);
}
-static int call_helper(void *data)
-{
- /* Worker thread started blocking khelper thread. */
- kmod_thread_locker = current;
- return ____call_usermodehelper(data);
-}
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
- /* CLONE_VFORK: wait until the usermode helper has execve'd
- * successfully We need the data structures to stay around
- * until that is done. */
- if (wait == UMH_WAIT_PROC)
+ if (sub_info->wait & UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else {
- pid = kernel_thread(call_helper, sub_info,
- CLONE_VFORK | SIGCHLD);
- /* Worker thread stopped blocking khelper thread. */
- kmod_thread_locker = NULL;
- }
-
- switch (wait) {
- case UMH_NO_WAIT:
- call_usermodehelper_freeinfo(sub_info);
- break;
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ SIGCHLD);
- case UMH_WAIT_PROC:
- if (pid > 0)
- break;
- /* FALLTHROUGH */
- case UMH_WAIT_EXEC:
- if (pid < 0)
- sub_info->retval = pid;
+ if (pid < 0) {
+ sub_info->retval = pid;
umh_complete(sub_info);
}
}
@@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
goto out;
}
/*
- * Worker thread must not wait for khelper thread at below
- * wait_for_completion() if the thread was created with CLONE_VFORK
- * flag, for khelper thread is already waiting for the thread at
- * wait_for_completion() in do_fork().
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
*/
- if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
- retval = -EBUSY;
- goto out;
- }
-
- sub_info->complete = &done;
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
queue_work(khelper_wq, &sub_info->work);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 734e9a7d280b..c90e417bb963 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
static void free_insn_page(void *page)
{
- module_free(NULL, page);
+ module_memfree(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- arch_prepare_optimized_kprobe(op);
+ arch_prepare_optimized_kprobe(op, p);
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
- arch_prepare_optimized_kprobe(op);
+ arch_prepare_optimized_kprobe(op, p);
return &op->kp;
}
@@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
- unoptimize_kprobe(p, false); /* Try to unoptimize */
+ /* Try to unoptimize */
+ unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
@@ -915,7 +916,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
- .flags = FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;
@@ -1410,16 +1411,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+int __weak arch_check_ftrace_location(struct kprobe *p)
{
- int ret = 0;
unsigned long ftrace_addr;
- /*
- * If the address is located on a ftrace nop, set the
- * breakpoint to the following instruction.
- */
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1426,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
return -EINVAL;
#endif
}
+ return 0;
+}
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
+{
+ int ret;
+
+ ret = arch_check_ftrace_location(p);
+ if (ret)
+ return ret;
jump_label_lock();
preempt_disable();
@@ -1567,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- disarm_kprobe(orig_p, true);
+ /*
+ * If kprobes_all_disarmed is set, orig_p
+ * should have already been disarmed, so
+ * skip the unneeded disarming process.
+ */
+ if (!kprobes_all_disarmed)
+ disarm_kprobe(orig_p, true);
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
@@ -1778,7 +1789,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
- /*TODO: consider to only swap the RA after the last pre_handler fired */
+ /*
+ * To avoid deadlocks, prohibit return probing in NMI contexts,
+ * just skip the probe and increase the (inexact) 'nmissed'
+ * statistical counter, so that the user is informed that
+ * something happened:
+ */
+ if (unlikely(in_nmi())) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ /* TODO: consider swapping the RA only after the last pre_handler has fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
@@ -2305,6 +2327,12 @@ static void arm_all_kprobes(void)
if (!kprobes_all_disarmed)
goto already_enabled;
+ /*
+ * optimize_kprobe() called by arm_kprobe() checks
+ * kprobes_all_disarmed, so set kprobes_all_disarmed before
+ * calling arm_kprobe().
+ */
+ kprobes_all_disarmed = false;
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
@@ -2313,7 +2341,6 @@ static void arm_all_kprobes(void)
arm_kprobe(p);
}
- kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
already_enabled:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
{
struct task_struct *p;
- p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
list_add_tail(&work->node, pos);
work->worker = worker;
- if (likely(worker->task))
+ if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
+config HAVE_LIVEPATCH
+ bool
+ help
+ Arch supports kernel live patching
+
+config LIVEPATCH
+ bool "Kernel Live Patching"
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on MODULES
+ depends on SYSFS
+ depends on KALLSYMS_ALL
+ depends on HAVE_LIVEPATCH
+ help
+ Say Y here if you want to support kernel live patching.
+ This option has no runtime impact until a kernel "patch"
+ module uses the interface provided by this option to register
+ a patch, causing calls to patched functions to be redirected
+ to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+
+livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..284e2691e380
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1003 @@
+/*
+ * core.c - Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/ftrace.h>
+#include <linux/list.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+/*
+ * The klp_mutex protects the global lists and state transitions of any
+ * structure reachable from them. References to any structure must be obtained
+ * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
+ * ensure it gets consistent data).
+ */
+static DEFINE_MUTEX(klp_mutex);
+
+static LIST_HEAD(klp_patches);
+static LIST_HEAD(klp_ops);
+
+static struct kobject *klp_root_kobj;
+
+static struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ list_for_each_entry(ops, &klp_ops, node) {
+ func = list_first_entry(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (func->old_addr == old_addr)
+ return ops;
+ }
+
+ return NULL;
+}
+
+static bool klp_is_module(struct klp_object *obj)
+{
+ return obj->name;
+}
+
+static bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+/* sets obj->mod if object is not vmlinux and module is found */
+static void klp_find_object_module(struct klp_object *obj)
+{
+ struct module *mod;
+
+ if (!klp_is_module(obj))
+ return;
+
+ mutex_lock(&module_mutex);
+ /*
+ * We do not want to block the removal of patched modules, so we do
+ * not take a reference here. The patches are removed by the module
+ * going notifier instead.
+ */
+ mod = find_module(obj->name);
+ /*
+ * Do not interfere with the work of the module coming and going
+ * notifiers. Note that the patch might still be needed before the
+ * going handler is called. Module functions can be called even in
+ * the GOING state until mod->exit() finishes. This is especially
+ * important for patches that modify the semantics of functions.
+ */
+ if (mod && mod->klp_alive)
+ obj->mod = mod;
+
+ mutex_unlock(&module_mutex);
+}
+
+/* klp_mutex must be held by caller */
+static bool klp_is_patch_registered(struct klp_patch *patch)
+{
+ struct klp_patch *mypatch;
+
+ list_for_each_entry(mypatch, &klp_patches, list)
+ if (mypatch == patch)
+ return true;
+
+ return false;
+}
+
+static bool klp_initialized(void)
+{
+ return klp_root_kobj;
+}
+
+struct klp_find_arg {
+ const char *objname;
+ const char *name;
+ unsigned long addr;
+ /*
+ * If count == 0, the symbol was not found. If count == 1, a unique
+ * match was found and addr is set. If count > 1, there is
+ * unresolvable ambiguity among "count" number of symbols with the same
+ * name in the same object.
+ */
+ unsigned long count;
+};
+
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if ((mod && !args->objname) || (!mod && args->objname))
+ return 0;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ if (args->objname && strcmp(args->objname, mod->name))
+ return 0;
+
+ /*
+ * args->addr might be overwritten if another match is found,
+ * but klp_find_object_symbol() handles this and only returns the
+ * addr if count == 1.
+ */
+ args->addr = addr;
+ args->count++;
+
+ return 0;
+}
+
+static int klp_find_object_symbol(const char *objname, const char *name,
+ unsigned long *addr)
+{
+ struct klp_find_arg args = {
+ .objname = objname,
+ .name = name,
+ .addr = 0,
+ .count = 0
+ };
+
+ kallsyms_on_each_symbol(klp_find_callback, &args);
+
+ if (args.count == 0)
+ pr_err("symbol '%s' not found in symbol table\n", name);
+ else if (args.count > 1)
+ pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
+ args.count, name, objname);
+ else {
+ *addr = args.addr;
+ return 0;
+ }
+
+ *addr = 0;
+ return -EINVAL;
+}
+
+struct klp_verify_args {
+ const char *name;
+ const unsigned long addr;
+};
+
+static int klp_verify_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_verify_args *args = data;
+
+ if (!mod &&
+ !strcmp(args->name, name) &&
+ args->addr == addr)
+ return 1;
+
+ return 0;
+}
+
+static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
+{
+ struct klp_verify_args args = {
+ .name = name,
+ .addr = addr,
+ };
+
+ if (kallsyms_on_each_symbol(klp_verify_callback, &args))
+ return 0;
+
+ pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
+ name, addr);
+ return -EINVAL;
+}
+
+static int klp_find_verify_func_addr(struct klp_object *obj,
+ struct klp_func *func)
+{
+ int ret;
+
+#if defined(CONFIG_RANDOMIZE_BASE)
+ /* KASLR is enabled, disregard old_addr from user */
+ func->old_addr = 0;
+#endif
+
+ if (!func->old_addr || klp_is_module(obj))
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ &func->old_addr);
+ else
+ ret = klp_verify_vmlinux_symbol(func->old_name,
+ func->old_addr);
+
+ return ret;
+}
+
+/*
+ * external symbols are located outside the parent object (where the parent
+ * object is either vmlinux or the kmod being patched).
+ */
+static int klp_find_external_symbol(struct module *pmod, const char *name,
+ unsigned long *addr)
+{
+ const struct kernel_symbol *sym;
+
+ /* first, check if it's an exported symbol */
+ preempt_disable();
+ sym = find_symbol(name, NULL, NULL, true, true);
+ if (sym) {
+ *addr = sym->value;
+ preempt_enable();
+ return 0;
+ }
+ preempt_enable();
+
+ /* otherwise check if it's in another .o within the patch module */
+ return klp_find_object_symbol(pmod->name, name, addr);
+}
+
+static int klp_write_object_relocations(struct module *pmod,
+ struct klp_object *obj)
+{
+ int ret;
+ struct klp_reloc *reloc;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ if (WARN_ON(!obj->relocs))
+ return -EINVAL;
+
+ for (reloc = obj->relocs; reloc->name; reloc++) {
+ if (!klp_is_module(obj)) {
+ ret = klp_verify_vmlinux_symbol(reloc->name,
+ reloc->val);
+ if (ret)
+ return ret;
+ } else {
+ /* module, reloc->val needs to be discovered */
+ if (reloc->external)
+ ret = klp_find_external_symbol(pmod,
+ reloc->name,
+ &reloc->val);
+ else
+ ret = klp_find_object_symbol(obj->mod->name,
+ reloc->name,
+ &reloc->val);
+ if (ret)
+ return ret;
+ }
+ ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
+ reloc->val + reloc->addend);
+ if (ret) {
+ pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
+ reloc->name, reloc->val, ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct pt_regs *regs)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ ops = container_of(fops, struct klp_ops, fops);
+
+ rcu_read_lock();
+ func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (WARN_ON_ONCE(!func))
+ goto unlock;
+
+ klp_arch_set_pc(regs, (unsigned long)func->new_func);
+unlock:
+ rcu_read_unlock();
+}
+
+static void klp_disable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+
+ WARN_ON(func->state != KLP_ENABLED);
+ WARN_ON(!func->old_addr);
+
+ ops = klp_find_ops(func->old_addr);
+ if (WARN_ON(!ops))
+ return;
+
+ if (list_is_singular(&ops->func_stack)) {
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ } else {
+ list_del_rcu(&func->stack_node);
+ }
+
+ func->state = KLP_DISABLED;
+}
+
+static int klp_enable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(!func->old_addr))
+ return -EINVAL;
+
+ if (WARN_ON(func->state != KLP_DISABLED))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_addr);
+ if (!ops) {
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ ops->fops.func = klp_ftrace_handler;
+ ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_IPMODIFY;
+
+ list_add(&ops->node, &klp_ops);
+
+ INIT_LIST_HEAD(&ops->func_stack);
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+
+ ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+ if (ret) {
+ pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+ func->old_name, ret);
+ goto err;
+ }
+
+ ret = register_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+ goto err;
+ }
+
+ } else {
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+ }
+
+ func->state = KLP_ENABLED;
+
+ return 0;
+
+err:
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ return ret;
+}
+
+static void klp_disable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ for (func = obj->funcs; func->old_name; func++)
+ if (func->state == KLP_ENABLED)
+ klp_disable_func(func);
+
+ obj->state = KLP_DISABLED;
+}
+
+static int klp_enable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (WARN_ON(obj->state != KLP_DISABLED))
+ return -EINVAL;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_enable_func(func);
+ if (ret) {
+ klp_disable_object(obj);
+ return ret;
+ }
+ }
+ obj->state = KLP_ENABLED;
+
+ return 0;
+}
+
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+
+ /* enforce stacking: only the last enabled patch can be disabled */
+ if (!list_is_last(&patch->list, &klp_patches) &&
+ list_next_entry(patch, list)->state == KLP_ENABLED)
+ return -EBUSY;
+
+ pr_notice("disabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (obj->state == KLP_ENABLED)
+ klp_disable_object(obj);
+ }
+
+ patch->state = KLP_DISABLED;
+
+ return 0;
+}
+
+/**
+ * klp_disable_patch() - disables a registered patch
+ * @patch: The registered, enabled patch to be disabled
+ *
+ * Unregisters the patched functions from ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_disable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (patch->state == KLP_DISABLED) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_disable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_disable_patch);
+
+static int __klp_enable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (WARN_ON(patch->state != KLP_DISABLED))
+ return -EINVAL;
+
+ /* enforce stacking: only the first disabled patch can be enabled */
+ if (patch->list.prev != &klp_patches &&
+ list_prev_entry(patch, list)->state == KLP_DISABLED)
+ return -EBUSY;
+
+ pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
+ add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_enable_object(obj);
+ if (ret)
+ goto unregister;
+ }
+
+ patch->state = KLP_ENABLED;
+
+ return 0;
+
+unregister:
+ WARN_ON(__klp_disable_patch(patch));
+ return ret;
+}
+
+/**
+ * klp_enable_patch() - enables a registered patch
+ * @patch: The registered, disabled patch to be enabled
+ *
+ * Performs the needed symbol lookups and code relocations,
+ * then registers the patched functions with ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_enable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_enable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
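[Editor's note] Taken together, klp_register_patch()/klp_enable_patch() are meant to be driven from a patch module's init. A minimal sketch of such a module follows (modeled on the obvious usage of the structures above; the patched symbol cmdline_proc_show and the replacement body are purely illustrative):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* Replacement function; takes over calls to cmdline_proc_show(). */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "patched cmdline");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* name == NULL means the object is vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int livepatch_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;
	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}
	return 0;
}

static void livepatch_exit(void)
{
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");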
+
+/*
+ * Sysfs Interface
+ *
+ * /sys/kernel/livepatch
+ * /sys/kernel/livepatch/<patch>
+ * /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/<func>
+ */
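[Editor's note] The enabled attribute is an ordinary 0/1 file, so from user space a patch can be toggled with a plain write; a small sketch (the patch name "livepatch_sample" is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/livepatch/livepatch_sample/enabled",
		      O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1)	/* "0" disables, "1" enables */
		perror("write");
	close(fd);
	return 0;
}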
+
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return -EINVAL;
+
+ if (val != KLP_DISABLED && val != KLP_ENABLED)
+ return -EINVAL;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+
+ mutex_lock(&klp_mutex);
+
+ if (val == patch->state) {
+ /* already in requested state */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (val == KLP_ENABLED) {
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+ } else {
+ ret = __klp_disable_patch(patch);
+ if (ret)
+ goto err;
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return count;
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+}
+
+static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct attribute *klp_patch_attrs[] = {
+ &enabled_kobj_attr.attr,
+ NULL
+};
+
+static void klp_kobj_release_patch(struct kobject *kobj)
+{
+ /*
+ * Once we have a consistency model we'll need to module_put() the
+ * patch module here. See klp_register_patch() for more details.
+ */
+}
+
+static struct kobj_type klp_ktype_patch = {
+ .release = klp_kobj_release_patch,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = klp_patch_attrs,
+};
+
+static void klp_kobj_release_func(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_func = {
+ .release = klp_kobj_release_func,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
+ * Free all functions' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_funcs_limited(struct klp_object *obj,
+ struct klp_func *limit)
+{
+ struct klp_func *func;
+
+ for (func = obj->funcs; func->old_name && func != limit; func++)
+ kobject_put(&func->kobj);
+}
+
+/* Clean up when a patched object is unloaded */
+static void klp_free_object_loaded(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ obj->mod = NULL;
+
+ for (func = obj->funcs; func->old_name; func++)
+ func->old_addr = 0;
+}
+
+/*
+ * Free all objects' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_objects_limited(struct klp_patch *patch,
+ struct klp_object *limit)
+{
+ struct klp_object *obj;
+
+ for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
+ klp_free_funcs_limited(obj, NULL);
+ kobject_put(obj->kobj);
+ }
+}
+
+static void klp_free_patch(struct klp_patch *patch)
+{
+ klp_free_objects_limited(patch, NULL);
+ if (!list_empty(&patch->list))
+ list_del(&patch->list);
+ kobject_put(&patch->kobj);
+}
+
+static int klp_init_func(struct klp_object *obj, struct klp_func *func)
+{
+ INIT_LIST_HEAD(&func->stack_node);
+ func->state = KLP_DISABLED;
+
+ return kobject_init_and_add(&func->kobj, &klp_ktype_func,
+ obj->kobj, "%s", func->old_name);
+}
+
+/* parts of the initialization that are done only when the object is loaded */
+static int klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (obj->relocs) {
+ ret = klp_write_object_relocations(patch->mod, obj);
+ if (ret)
+ return ret;
+ }
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_find_verify_func_addr(obj, func);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+ const char *name;
+
+ if (!obj->funcs)
+ return -EINVAL;
+
+ obj->state = KLP_DISABLED;
+ obj->mod = NULL;
+
+ klp_find_object_module(obj);
+
+ name = klp_is_module(obj) ? obj->name : "vmlinux";
+ obj->kobj = kobject_create_and_add(name, &patch->kobj);
+ if (!obj->kobj)
+ return -ENOMEM;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_init_func(obj, func);
+ if (ret)
+ goto free;
+ }
+
+ if (klp_is_object_loaded(obj)) {
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ return 0;
+
+free:
+ klp_free_funcs_limited(obj, func);
+ kobject_put(obj->kobj);
+ return ret;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (!patch->objs)
+ return -EINVAL;
+
+ mutex_lock(&klp_mutex);
+
+ patch->state = KLP_DISABLED;
+
+ ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
+ klp_root_kobj, "%s", patch->mod->name);
+ if (ret)
+ goto unlock;
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ ret = klp_init_object(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ list_add_tail(&patch->list, &klp_patches);
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+free:
+ klp_free_objects_limited(patch, obj);
+ kobject_put(&patch->kobj);
+unlock:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+/**
+ * klp_unregister_patch() - unregisters a patch
+ * @patch: Disabled patch to be unregistered
+ *
+ * Frees the data structures and removes the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_unregister_patch(struct klp_patch *patch)
+{
+ int ret = 0;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (patch->state == KLP_ENABLED) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ klp_free_patch(patch);
+
+out:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_unregister_patch);
+
+/**
+ * klp_register_patch() - registers a patch
+ * @patch: Patch to be registered
+ *
+ * Initializes the data structure associated with the patch and
+ * creates the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_register_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ if (!klp_initialized())
+ return -ENODEV;
+
+ if (!patch || !patch->mod)
+ return -EINVAL;
+
+ /*
+ * A reference is taken on the patch module to prevent it from being
+ * unloaded. Right now, we don't allow patch modules to unload since
+ * there is currently no method to determine if a thread is still
+ * running in the patched code contained in the patch module once
+ * the ftrace registration is successful.
+ */
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ ret = klp_init_patch(patch);
+ if (ret)
+ module_put(patch->mod);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_register_patch);
+
+static void klp_module_notify_coming(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+ int ret;
+
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto err;
+
+ if (patch->state == KLP_DISABLED)
+ return;
+
+ pr_notice("applying patch '%s' to loading module '%s'\n",
+ pmod->name, mod->name);
+
+ ret = klp_enable_object(obj);
+ if (!ret)
+ return;
+
+err:
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+}
+
+static void klp_module_notify_going(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+
+ if (patch->state == KLP_DISABLED)
+ goto disabled;
+
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ pmod->name, mod->name);
+
+ klp_disable_object(obj);
+
+disabled:
+ klp_free_object_loaded(obj);
+}
+
+static int klp_module_notify(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct module *mod = data;
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
+ return 0;
+
+ mutex_lock(&klp_mutex);
+
+ /*
+ * Each module has to remember whether the notifier has been called
+ * for it. We never know which module will get patched by a new patch.
+ */
+ if (action == MODULE_STATE_COMING)
+ mod->klp_alive = true;
+ else /* MODULE_STATE_GOING */
+ mod->klp_alive = false;
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ if (action == MODULE_STATE_COMING) {
+ obj->mod = mod;
+ klp_module_notify_coming(patch, obj);
+ } else /* MODULE_STATE_GOING */
+ klp_module_notify_going(patch, obj);
+
+ break;
+ }
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+}
+
+static struct notifier_block klp_module_nb = {
+ .notifier_call = klp_module_notify,
+ .priority = INT_MIN+1, /* called late but before ftrace notifier */
+};
+
+static int klp_init(void)
+{
+ int ret;
+
+ ret = klp_check_compiler_support();
+ if (ret) {
+ pr_info("Your compiler is too old; turning off.\n");
+ return -EINVAL;
+ }
+
+ ret = register_module_notifier(&klp_module_nb);
+ if (ret)
+ return ret;
+
+ klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
+ if (!klp_root_kobj) {
+ ret = -ENOMEM;
+ goto unregister;
+ }
+
+ return 0;
+
+unregister:
+ unregister_module_notifier(&klp_module_nb);
+ return ret;
+}
+
+module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
-obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d24e4339b46d..a0831e1b99f4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+#ifdef CONFIG_LOCK_STAT
printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+#endif
}
static int save_trace(struct stack_trace *trace)
@@ -549,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock)
static void print_lock(struct held_lock *hlock)
{
- print_lock_name(hlock_class(hlock));
+ /*
+ * We can be called locklessly through debug_show_all_locks() so be
+ * extra careful, the hlock might have been released and cleared.
+ */
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
+ barrier();
+
+ if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ printk("<RELEASED>\n");
+ return;
+ }
+
+ print_lock_name(lock_classes + class_idx - 1);
printk(", at: ");
print_ip_sym(hlock->acquire_ip);
}
@@ -631,7 +647,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -698,10 +714,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
hash_head = classhashentry(key);
/*
- * We can walk the hash lockfree, because the hash only
- * grows, and we are careful when adding entries to the end:
+ * We do an RCU walk of the hash, see lockdep_free_key_range().
*/
- list_for_each_entry(class, hash_head, hash_entry) {
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -726,7 +744,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct list_head *hash_head;
struct lock_class *class;
- unsigned long flags;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
if (likely(class))
@@ -748,28 +767,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
key = lock->key->subkeys + subclass;
hash_head = classhashentry(key);
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
/*
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry(class, hash_head, hash_entry)
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
+ }
+
/*
* Allocate a new key from the static array, and add it to
* the hash:
*/
if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
if (!debug_locks_off_graph_unlock()) {
- raw_local_irq_restore(flags);
return NULL;
}
- raw_local_irq_restore(flags);
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
@@ -796,7 +813,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
- raw_local_irq_restore(flags);
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
@@ -804,15 +820,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("\n");
dump_stack();
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
}
out_unlock_set:
graph_unlock();
- raw_local_irq_restore(flags);
out_set_class_cache:
if (!subclass || force)
@@ -868,11 +881,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
entry->distance = distance;
entry->trace = *trace;
/*
- * Since we never remove from the dependency list, the list can
- * be walked lockless by other CPUs, it's only allocation
- * that must be protected by the spinlock. But this also means
- * we must make new entries visible only once writes to the
- * entry become visible - hence the RCU op:
+ * Both allocation and removal are done under the graph lock; but
+ * iteration is under RCU-sched; see look_up_lock_class() and
+ * lockdep_free_key_range().
*/
list_add_tail_rcu(&entry->entry, head);
@@ -1023,7 +1034,9 @@ static int __bfs(struct lock_list *source_entry,
else
head = &lock->class->locks_before;
- list_for_each_entry(entry, head, entry) {
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ list_for_each_entry_rcu(entry, head, entry) {
if (!lock_accessed(entry)) {
unsigned int cq_depth;
mark_lock_accessed(entry, lock);
@@ -2020,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ list_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2994,8 +3007,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (unlikely(!debug_locks))
return;
- if (subclass)
+ if (subclass) {
+ unsigned long flags;
+
+ if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
register_lock_class(lock, subclass, 1);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+ }
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -3885,9 +3908,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed, and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
void lockdep_free_key_range(void *start, unsigned long size)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i;
@@ -3903,7 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3914,11 +3945,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
if (locked)
graph_unlock();
raw_local_irq_restore(flags);
+
+ /*
+ * Wait for any possible iterators from look_up_lock_class() to pass
+ * before continuing to free the memory they refer to.
+ *
+ * synchronize_sched() is sufficient because the read-side has IRQs disabled.
+ */
+ synchronize_sched();
+
+ /*
+ * XXX at this point we could return the resources to the pool;
+ * instead we leak them. We would need to change to bitmap allocators
+ * instead of the linear allocators we have now.
+ */
}
void lockdep_reset_lock(struct lockdep_map *lock)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i, j;
@@ -3946,7 +3991,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0955b885d0dc..ec8cce259779 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -20,30 +20,20 @@
* Author: Paul E. McKenney <paulmck@us.ibm.com>
* Based on kernel/rcu/torture.c.
*/
-#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/module.h>
#include <linux/kthread.h>
-#include <linux/err.h>
#include <linux/spinlock.h>
+#include <linux/rwlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/reboot.h>
-#include <linux/freezer.h>
-#include <linux/cpu.h>
#include <linux/delay.h>
-#include <linux/stat.h>
#include <linux/slab.h>
-#include <linux/trace_clock.h>
-#include <asm/byteorder.h>
#include <linux/torture.h>
MODULE_LICENSE("GPL");
@@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
torture_param(int, nwriters_stress, -1,
"Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1,
+ "Number of read-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (s), 0=disable");
@@ -66,30 +58,28 @@ torture_param(bool, verbose, true,
static char *torture_type = "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
- "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
-
-static atomic_t n_lock_torture_errors;
+ "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
-static int nrealwriters_stress;
static bool lock_is_write_held;
+static bool lock_is_read_held;
-struct lock_writer_stress_stats {
- long n_write_lock_fail;
- long n_write_lock_acquired;
+struct lock_stress_stats {
+ long n_lock_fail;
+ long n_lock_acquired;
};
-static struct lock_writer_stress_stats *lwsa;
#if defined(MODULE)
#define LOCKTORTURE_RUNNABLE_INIT 1
#else
#define LOCKTORTURE_RUNNABLE_INIT 0
#endif
-int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
-module_param(locktorture_runnable, int, 0444);
-MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
+int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -102,12 +92,25 @@ struct lock_torture_ops {
int (*writelock)(void);
void (*write_delay)(struct torture_random_state *trsp);
void (*writeunlock)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *trsp);
+ void (*readunlock)(void);
unsigned long flags;
const char *name;
};
-static struct lock_torture_ops *cur_ops;
-
+struct lock_torture_cxt {
+ int nrealwriters_stress;
+ int nrealreaders_stress;
+ bool debug_lock;
+ atomic_t n_lock_torture_errors;
+ struct lock_torture_ops *cur_ops;
+ struct lock_stress_stats *lwsa; /* writer statistics */
+ struct lock_stress_stats *lrsa; /* reader statistics */
+};
+static struct lock_torture_cxt cxt = { 0, 0, false,
+ ATOMIC_INIT(0),
+ NULL, NULL };
/*
* Definitions for lock torture testing.
*/
@@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2000 * longdelay_us)))
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
mdelay(longdelay_us);
#ifdef CONFIG_PREEMPT
- if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
#endif
}
@@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
.writeunlock = torture_lock_busted_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "lock_busted"
};
@@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2000 * longdelay_us)))
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
mdelay(longdelay_us);
if (!(torture_random(trsp) %
- (nrealwriters_stress * 2 * shortdelay_us)))
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
- if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
#endif
}
@@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
.writeunlock = torture_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "spin_lock"
};
static int torture_spin_lock_write_lock_irq(void)
-__acquires(torture_spinlock_irq)
+__acquires(torture_spinlock)
{
unsigned long flags;
spin_lock_irqsave(&torture_spinlock, flags);
- cur_ops->flags = flags;
+ cxt.cur_ops->flags = flags;
return 0;
}
static void torture_lock_spin_write_unlock_irq(void)
__releases(torture_spinlock)
{
- spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
+ spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
.writeunlock = torture_lock_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
.name = "spin_lock_irq"
};
+static DEFINE_RWLOCK(torture_rwlock);
+
+static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+{
+ write_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /*
+ * We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+{
+ write_unlock(&torture_rwlock);
+}
+
+static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+{
+ read_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 10;
+ const unsigned long longdelay_ms = 100;
+
+ /*
+ * We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+{
+ read_unlock(&torture_rwlock);
+}
+
+static struct lock_torture_ops rw_lock_ops = {
+ .writelock = torture_rwlock_write_lock,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock,
+ .readlock = torture_rwlock_read_lock,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock,
+ .name = "rw_lock"
+};
+
+static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_write_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_read_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops rw_lock_irq_ops = {
+ .writelock = torture_rwlock_write_lock_irq,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock_irq,
+ .readlock = torture_rwlock_read_lock_irq,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock_irq,
+ .name = "rw_lock_irq"
+};
+
+static DEFINE_MUTEX(torture_mutex);
+
+static int torture_mutex_lock(void) __acquires(torture_mutex)
+{
+ mutex_lock(&torture_mutex);
+ return 0;
+}
+
+static void torture_mutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 5);
+ else
+ mdelay(longdelay_ms / 5);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_mutex_unlock(void) __releases(torture_mutex)
+{
+ mutex_unlock(&torture_mutex);
+}
+
+static struct lock_torture_ops mutex_lock_ops = {
+ .writelock = torture_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .writeunlock = torture_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "mutex_lock"
+};
+
+static DECLARE_RWSEM(torture_rwsem);
+static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+{
+ down_write(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 10);
+ else
+ mdelay(longdelay_ms / 10);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+{
+ up_write(&torture_rwsem);
+}
+
+static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+{
+ down_read(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 2);
+ else
+ mdelay(longdelay_ms / 2);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+{
+ up_read(&torture_rwsem);
+}
+
+static struct lock_torture_ops rwsem_lock_ops = {
+ .writelock = torture_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .writeunlock = torture_rwsem_up_write,
+ .readlock = torture_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_rwsem_up_read,
+ .name = "rwsem_lock"
+};
+
/*
* Lock torture writer kthread. Repeatedly acquires and releases
* the lock, checking for duplicate acquisitions.
*/
static int lock_torture_writer(void *arg)
{
- struct lock_writer_stress_stats *lwsp = arg;
+ struct lock_stress_stats *lwsp = arg;
static DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
@@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg)
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cur_ops->writelock();
+
+ cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_write_lock_fail++;
+ lwsp->n_lock_fail++;
lock_is_write_held = 1;
- lwsp->n_write_lock_acquired++;
- cur_ops->write_delay(&rand);
+ if (WARN_ON_ONCE(lock_is_read_held))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ cxt.cur_ops->write_delay(&rand);
lock_is_write_held = 0;
- cur_ops->writeunlock();
+ cxt.cur_ops->writeunlock();
+
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
torture_kthread_stopping("lock_torture_writer");
@@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg)
}
/*
+ * Lock torture reader kthread. Repeatedly acquires and releases
+ * the reader lock.
+ */
+static int lock_torture_reader(void *arg)
+{
+ struct lock_stress_stats *lrsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->readlock();
+ lock_is_read_held = 1;
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lrsp->n_lock_fail++; /* rare, but... */
+
+ lrsp->n_lock_acquired++;
+ cxt.cur_ops->read_delay(&rand);
+ lock_is_read_held = 0;
+ cxt.cur_ops->readunlock();
+
+ stutter_wait("lock_torture_reader");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_reader");
+ return 0;
+}
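
The reader loop mirrors the writer loop: take the lock, flag it held, check the opposing flag, delay, clear the flag, release. A user-space sketch of the same overlap check under a pthread rwlock; with a single reader the flags are exact, while the kernel version tolerates the raciness of many readers updating lock_is_read_held:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int write_held, read_held, failures;

static void *writer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		pthread_rwlock_wrlock(&lk);
		if (atomic_load(&write_held) || atomic_load(&read_held))
			atomic_fetch_add(&failures, 1);	/* like n_lock_fail++ */
		atomic_store(&write_held, 1);
		atomic_store(&write_held, 0);
		pthread_rwlock_unlock(&lk);
	}
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		pthread_rwlock_rdlock(&lk);
		atomic_store(&read_held, 1);
		if (atomic_load(&write_held))
			atomic_fetch_add(&failures, 1);
		atomic_store(&read_held, 0);
		pthread_rwlock_unlock(&lk);
	}
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	printf("failures: %d (expect 0)\n", atomic_load(&failures));
	return 0;
}
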
+
+/*
* Create a lock-torture-statistics message in the specified buffer.
*/
-static void lock_torture_printk(char *page)
+static void __torture_print_stats(char *page,
+ struct lock_stress_stats *statp, bool write)
{
bool fail = 0;
- int i;
+ int i, n_stress;
long max = 0;
- long min = lwsa[0].n_write_lock_acquired;
+ long min = statp[0].n_lock_acquired;
long long sum = 0;
- for (i = 0; i < nrealwriters_stress; i++) {
- if (lwsa[i].n_write_lock_fail)
+ n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
+ for (i = 0; i < n_stress; i++) {
+ if (statp[i].n_lock_fail)
fail = true;
- sum += lwsa[i].n_write_lock_acquired;
- if (max < lwsa[i].n_write_lock_fail)
- max = lwsa[i].n_write_lock_fail;
- if (min > lwsa[i].n_write_lock_fail)
- min = lwsa[i].n_write_lock_fail;
+ sum += statp[i].n_lock_acquired;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
- page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
page += sprintf(page,
- "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ write ? "Writes" : "Reads ",
sum, max, min, max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
- atomic_inc(&n_lock_torture_errors);
+ atomic_inc(&cxt.n_lock_torture_errors);
}
/*
@@ -274,18 +533,35 @@ static void lock_torture_printk(char *page)
*/
static void lock_torture_stats_print(void)
{
- int size = nrealwriters_stress * 200 + 8192;
+ int size = cxt.nrealwriters_stress * 200 + 8192;
char *buf;
+ if (cxt.cur_ops->readlock)
+ size += cxt.nrealreaders_stress * 200 + 8192;
+
buf = kmalloc(size, GFP_KERNEL);
if (!buf) {
pr_err("lock_torture_stats_print: Out of memory, need: %d",
size);
return;
}
- lock_torture_printk(buf);
+
+ __torture_print_stats(buf, cxt.lwsa, true);
pr_alert("%s", buf);
kfree(buf);
+
+ if (cxt.cur_ops->readlock) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lrsa, false);
+ pr_alert("%s", buf);
+ kfree(buf);
+ }
}
/*
@@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
- torture_type, tag, nrealwriters_stress, stat_interval, verbose,
- shuffle_interval, stutter, shutdown_secs,
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, cxt.debug_lock ? " [debug]": "",
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
+ verbose, shuffle_interval, stutter, shutdown_secs,
onoff_interval, onoff_holdoff);
}
@@ -322,46 +599,59 @@ static void lock_torture_cleanup(void)
{
int i;
- if (torture_cleanup())
+ if (torture_cleanup_begin())
return;
if (writer_tasks) {
- for (i = 0; i < nrealwriters_stress; i++)
+ for (i = 0; i < cxt.nrealwriters_stress; i++)
torture_stop_kthread(lock_torture_writer,
writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
+ if (reader_tasks) {
+ for (i = 0; i < cxt.nrealreaders_stress; i++)
+ torture_stop_kthread(lock_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+
torture_stop_kthread(lock_torture_stats, stats_task);
lock_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (atomic_read(&n_lock_torture_errors))
- lock_torture_print_module_parms(cur_ops,
+ if (atomic_read(&cxt.n_lock_torture_errors))
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: FAILURE");
else if (torture_onoff_failures())
- lock_torture_print_module_parms(cur_ops,
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: LOCK_HOTPLUG");
else
- lock_torture_print_module_parms(cur_ops,
+ lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+ torture_cleanup_end();
}
static int __init lock_torture_init(void)
{
- int i;
+ int i, j;
int firsterr = 0;
static struct lock_torture_ops *torture_ops[] = {
- &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
+ &lock_busted_ops,
+ &spin_lock_ops, &spin_lock_irq_ops,
+ &rw_lock_ops, &rw_lock_irq_ops,
+ &mutex_lock_ops,
+ &rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
- cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0)
+ cxt.cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cxt.cur_ops->name) == 0)
break;
}
if (i == ARRAY_SIZE(torture_ops)) {
@@ -374,31 +664,69 @@ static int __init lock_torture_init(void)
torture_init_end();
return -EINVAL;
}
- if (cur_ops->init)
- cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+ if (cxt.cur_ops->init)
+ cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nwriters_stress >= 0)
- nrealwriters_stress = nwriters_stress;
+ cxt.nrealwriters_stress = nwriters_stress;
else
- nrealwriters_stress = 2 * num_online_cpus();
- lock_torture_print_module_parms(cur_ops, "Start of test");
+ cxt.nrealwriters_stress = 2 * num_online_cpus();
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ if (strncmp(torture_type, "mutex", 5) == 0)
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if ((strncmp(torture_type, "spin", 4) == 0) ||
+ (strncmp(torture_type, "rw_lock", 7) == 0))
+ cxt.debug_lock = true;
+#endif
/* Initialize the statistics so that each run gets its own numbers. */
lock_is_write_held = 0;
- lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
- if (lwsa == NULL) {
- VERBOSE_TOROUT_STRING("lwsa: Out of memory");
+ cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- for (i = 0; i < nrealwriters_stress; i++) {
- lwsa[i].n_write_lock_fail = 0;
- lwsa[i].n_write_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
}
- /* Start up the kthreads. */
+ if (cxt.cur_ops->readlock) {
+ if (nreaders_stress >= 0)
+ cxt.nrealreaders_stress = nreaders_stress;
+ else {
+ /*
+ * By default, distribute the threads evenly between
+ * readers and writers. We still run the same total
+ * number of threads as the writer-only default.
+ */
+ if (nwriters_stress < 0) /* user doesn't care */
+ cxt.nrealwriters_stress = num_online_cpus();
+ cxt.nrealreaders_stress = cxt.nrealwriters_stress;
+ }
+
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
+ }
+ lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
+ /* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ);
@@ -422,18 +750,51 @@ static int __init lock_torture_init(void)
goto unwind;
}
- writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
+ writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- for (i = 0; i < nrealwriters_stress; i++) {
- firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
+
+ if (cxt.cur_ops->readlock) {
+ reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ }
+
+ /*
+ * Create the kthreads and start torturing (oh, those poor little locks).
+ *
+ * TODO: Note that we interleave writers with readers, giving writers a
+ * slight advantage, by creating their kthreads first. This can be
+ * modified for very specific needs, or even to let the user choose
+ * the policy, if ever wanted.
+ */
+ for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
+ j < cxt.nrealreaders_stress; i++, j++) {
+ if (i >= cxt.nrealwriters_stress)
+ goto create_reader;
+
+ /* Create writer. */
+ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
writer_tasks[i]);
if (firsterr)
goto unwind;
+
+ create_reader:
+ if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
+ continue;
+ /* Create reader. */
+ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
+ reader_tasks[j]);
+ if (firsterr)
+ goto unwind;
}
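
The interleaving loop above alternates one writer and one reader per iteration until both counts are exhausted, which is what gives writers the slight head start the TODO mentions. Its control flow, reduced to a stand-alone sketch with hypothetical thread counts and the kthread creation replaced by printf:

#include <stdio.h>

int main(void)
{
	const int nwriters = 4, nreaders = 2;	/* hypothetical counts */

	for (int i = 0, j = 0; i < nwriters || j < nreaders; i++, j++) {
		if (i < nwriters)
			printf("create writer %d\n", i);
		if (j < nreaders)
			printf("create reader %d\n", j);
	}
	return 0;
}
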
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 74356dc0ce29..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -27,7 +27,7 @@ struct mcs_spinlock {
#define arch_mcs_spin_lock_contended(l) \
do { \
while (!(smp_load_acquire(l))) \
- arch_mutex_cpu_relax(); \
+ cpu_relax_lowlatency(); \
} while (0)
#endif
@@ -56,9 +56,6 @@ do { \
* If the lock has already been acquired, then this will proceed to spin
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
static inline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
@@ -81,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
*/
return;
}
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
arch_mcs_spin_lock_contended(&node->locked);
@@ -94,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+ struct mcs_spinlock *next = READ_ONCE(node->next);
if (likely(!next)) {
/*
@@ -103,28 +100,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax_lowlatency();
}
/* Pass lock to next waiter. */
arch_mcs_spin_unlock_contended(&next->locked);
}
-/*
- * Cancellable version of the MCS lock above.
- *
- * Intended for adaptive spinning of sleeping locks:
- * mutex_lock()/rwsem_down_{read,write}() etc.
- */
-
-struct optimistic_spin_node {
- struct optimistic_spin_node *next, *prev;
- int locked; /* 1 if lock acquired */
- int cpu; /* encoded CPU # value */
-};
-
-extern bool osq_lock(struct optimistic_spin_queue *lock);
-extern void osq_unlock(struct optimistic_spin_queue *lock);
-
#endif /* __LINUX_MCS_SPINLOCK_H */
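
For orientation, the queueing scheme this header implements can be rendered in user space with C11 atomics: each locker spins only on its own node, and unlock hands the lock to the successor. This is a sketch of the technique, not the kernel code; there is no cancel/unqueue path, and bare busy-wait loops stand in for arch_mcs_spin_lock_contended():

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;
};

static _Atomic(struct mcs_node *) tail;	/* NULL when the lock is free */
static long counter;			/* protected by the MCS lock */

static void mcs_lock(struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&node->locked, false, memory_order_relaxed);

	prev = atomic_exchange(&tail, node);	/* join the queue */
	if (!prev)
		return;				/* queue was empty: we own it */

	atomic_store_explicit(&prev->next, node, memory_order_release);
	while (!atomic_load_explicit(&node->locked, memory_order_acquire))
		;	/* spin on our own node, never on a shared word */
}

static void mcs_unlock(struct mcs_node *node)
{
	struct mcs_node *next =
		atomic_load_explicit(&node->next, memory_order_acquire);

	if (!next) {
		struct mcs_node *expect = node;

		/* No successor visible: try to swing tail back to empty. */
		if (atomic_compare_exchange_strong(&tail, &expect, NULL))
			return;
		/* Someone is enqueueing; wait for its next pointer. */
		while (!(next = atomic_load_explicit(&node->next,
						     memory_order_acquire)))
			;
	}
	atomic_store_explicit(&next->locked, true, memory_order_release);
}

static void *worker(void *arg)
{
	struct mcs_node node;

	(void)arg;
	for (int i = 0; i < 100000; i++) {
		mcs_lock(&node);
		counter++;
		mcs_unlock(&node);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("counter = %ld (expect 400000)\n", counter);
	return 0;
}
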
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- mutex_clear_owner(lock);
}
/*
* __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
* mutexes so that we can do it here after we've verified state.
*/
+ mutex_clear_owner(lock);
atomic_set(&lock->count, 1);
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index acca2c1a3c5e..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
@@ -25,7 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -46,12 +46,6 @@
# include <asm/mutex.h>
#endif
-/*
- * A negative mutex count indicates that waiters are sleeping waiting for the
- * mutex.
- */
-#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
-
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -87,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
* The mutex must later on be released by the same task that
* acquired it. Recursive locking is not allowed. The task
* may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
* the mutex still locked. The mutex must first be initialized
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
@@ -112,56 +106,146 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
/*
- * In order to avoid a stampede of mutex spinners from acquiring the mutex
- * more or less simultaneously, the spinners need to acquire a MCS lock
- * first before spinning on the owner field.
+ * After acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
*
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
*/
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ unsigned long flags;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ lock->ctx = ctx;
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the atomic read is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Check if lock is contended, if not there is nobody to wake up
+ */
+ if (likely(atomic_read(&lock->base.count) == 0))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, wake up everyone in this case,
+ * so they can see the new lock->ctx.
+ */
+ spin_lock_mutex(&lock->base.wait_lock, flags);
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+ spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
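
The smp_mb() above enforces the classic store-buffering ordering: each side stores its flag, fences, then loads the other side's flag, so at least one side must observe the other's store. The same shape in C11 atomics, where ctx_set and contended are stand-ins for lock->ctx and a negative lock->base.count:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ctx_set;	/* stands in for lock->ctx != NULL */
static atomic_int contended;	/* stands in for lock->base.count < 0 */

/* Fastpath winner: publish the context, fence, then look for waiters. */
static int fastpath_side(void)
{
	atomic_store_explicit(&ctx_set, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() analog */
	return atomic_load_explicit(&contended, memory_order_relaxed);
}

/* Waiter: mark contention, fence, then look for the published context. */
static int waiter_side(void)
{
	atomic_store_explicit(&contended, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load_explicit(&ctx_set, memory_order_relaxed);
}

int main(void)
{
	/*
	 * Run each side from its own thread and the fences forbid both
	 * returning 0: either the winner sees the contention and wakes
	 * the waiters, or a waiter sees the new context and backs off.
	 */
	printf("fastpath saw %d, waiter saw %d\n",
	       fastpath_side(), waiter_side());
	return 0;
}
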
/*
- * Mutex spinning code migrated from kernel/sched/core.c
+ * After acquiring lock in the slowpath set ctx and wake up any
+ * waiters so they can recheck.
+ *
+ * Callers must hold the mutex wait_lock.
*/
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
{
- if (lock->owner != owner)
- return false;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+ lock->ctx = ctx;
/*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
*/
- barrier();
-
- return owner->on_cpu;
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
+ bool ret = true;
+
rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
break;
+ }
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
rcu_read_unlock();
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ return ret;
}
/*
@@ -176,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
return 0;
rcu_read_lock();
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner)
retval = owner->on_cpu;
rcu_read_unlock();
@@ -186,6 +270,140 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
return retval;
}
+
+/*
+ * Atomically try to take the lock when it is available
+ */
+static inline bool mutex_try_to_acquire(struct mutex *lock)
+{
+ return !mutex_is_locked(lock) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ */
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+
+ if (!mutex_can_spin_on_owner(lock))
+ goto done;
+
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
+ if (!osq_lock(&lock->osq))
+ goto done;
+
+ while (true) {
+ struct task_struct *owner;
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ */
+ if (READ_ONCE(ww->ctx))
+ break;
+ }
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ /* Try to acquire the mutex if it is unlocked. */
+ if (mutex_try_to_acquire(lock)) {
+ lock_acquired(&lock->dep_map, ip);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww;
+ ww = container_of(lock, struct ww_mutex, base);
+
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ }
+
+ mutex_set_owner(lock);
+ osq_unlock(&lock->osq);
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+done:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
+ schedule_preempt_disabled();
+ }
+
+ return false;
+}
+#else
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ return false;
+}
#endif
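
User space has no cheap way to ask whether the lock owner is running on another CPU, so an analog of mutex_optimistic_spin() has to fall back to a bounded trylock spin before sleeping. A sketch of that shape; the spin budget of 1000 is arbitrary:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

static void lock_with_optimistic_spin(void)
{
	for (int spins = 0; spins < 1000; spins++) {
		if (pthread_mutex_trylock(&m) == 0)
			return;		/* got it while spinning */
		sched_yield();		/* cpu_relax_lowlatency() stand-in */
	}
	pthread_mutex_lock(&m);		/* slowpath: sleep until free */
}

int main(void)
{
	lock_with_optimistic_spin();
	puts("lock acquired");
	pthread_mutex_unlock(&m);
	return 0;
}
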
__visible __used noinline
@@ -260,10 +478,10 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
EXPORT_SYMBOL(ww_mutex_unlock);
static inline int __sched
-__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
if (!hold_ctx)
return 0;
@@ -283,91 +501,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
return 0;
}
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
- struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
-}
-
-/*
- * after acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- *
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
- */
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
-{
- unsigned long flags;
- struct mutex_waiter *cur;
-
- ww_mutex_lock_acquired(lock, ctx);
-
- lock->ctx = ctx;
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* ^^^ */
-
- /*
- * Check if lock is contended, if not there is nobody to wake up
- */
- if (likely(atomic_read(&lock->base.count) == 0))
- return;
-
- /*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
- */
- spin_lock_mutex(&lock->base.wait_lock, flags);
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
- spin_unlock_mutex(&lock->base.wait_lock, flags);
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
@@ -384,109 +517,19 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- /*
- * Optimistic spinning.
- *
- * We try to spin for acquisition when we find that there are no
- * pending waiters and the lock owner is currently running on a
- * (different) CPU.
- *
- * The rationale is that if the lock owner is running, it is likely to
- * release the lock soon.
- *
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- *
- * The mutex spinners are queued up using MCS lock so that only one
- * spinner can compete for the mutex. However, if mutex spinning isn't
- * going to happen, there is no point in going through the lock/unlock
- * overhead.
- */
- if (!mutex_can_spin_on_owner(lock))
- goto slowpath;
-
- if (!osq_lock(&lock->osq))
- goto slowpath;
-
- for (;;) {
- struct task_struct *owner;
-
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- /*
- * If ww->ctx is set the contents are undefined, only
- * by acquiring wait_lock there is a guarantee that
- * they are not invalid when reading.
- *
- * As such, when deadlock detection needs to be
- * performed the optimistic spinning cannot be done.
- */
- if (ACCESS_ONCE(ww->ctx))
- break;
- }
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
-
- if ((atomic_read(&lock->count) == 1) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
- lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx) {
- struct ww_mutex *ww;
- ww = container_of(lock, struct ww_mutex, base);
-
- ww_mutex_set_context_fastpath(ww, ww_ctx);
- }
-
- mutex_set_owner(lock);
- osq_unlock(&lock->osq);
- preempt_enable();
- return 0;
- }
+ if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ /* got the lock, yay! */
+ preempt_enable();
+ return 0;
+ }
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
+ spin_lock_mutex(&lock->wait_lock, flags);
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- arch_mutex_cpu_relax();
- }
- osq_unlock(&lock->osq);
-slowpath:
/*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock the mutex. This avoids getting
- * scheduled out right after we obtained the mutex.
+ * Once more, try to acquire the lock. Only try-lock the mutex if
+ * it is unlocked to reduce unnecessary xchg() operations.
*/
- if (need_resched())
- schedule_preempt_disabled();
-#endif
- spin_lock_mutex(&lock->wait_lock, flags);
-
- /* once more, can we acquire the lock? */
- if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -506,9 +549,10 @@ slowpath:
* it's unlocked. Later on, if we sleep, this is the
* operation that gives us the lock. We xchg it to -1, so
* that when we release the lock, we properly wake up the
- * other waiters:
+ * other waiters. We only attempt the xchg if the count is
+ * non-negative in order to avoid unnecessary xchg operations:
*/
- if (MUTEX_SHOW_NO_WAITER(lock) &&
+ if (atomic_read(&lock->count) >= 0 &&
(atomic_xchg(&lock->count, -1) == 1))
break;
@@ -522,7 +566,7 @@ slowpath:
}
if (use_ww_ctx && ww_ctx->acquired > 0) {
- ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
if (ret)
goto err;
}
@@ -534,6 +578,8 @@ slowpath:
schedule_preempt_disabled();
spin_lock_mutex(&lock->wait_lock, flags);
}
+ __set_task_state(task, TASK_RUNNING);
+
mutex_remove_waiter(lock, &waiter, current_thread_info());
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -547,23 +593,7 @@ skip_wait:
if (use_ww_ctx) {
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct mutex_waiter *cur;
-
- /*
- * This branch gets optimized out for the common case,
- * and is only important for ww_mutex_lock.
- */
- ww_mutex_lock_acquired(ww, ww_ctx);
- ww->ctx = ww_ctx;
-
- /*
- * Give any possible sleeping processes the chance to wake up,
- * so they can recheck if they have to back off.
- */
- list_for_each_entry(cur, &lock->wait_list, list) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
+ ww_mutex_set_context_slowpath(ww, ww_ctx);
}
spin_unlock_mutex(&lock->wait_lock, flags);
@@ -682,15 +712,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
* Release the lock, slowpath:
*/
static inline void
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
/*
- * some architectures leave the lock unlocked in the fastpath failure
+ * As a performance optimization, release the lock before doing the
+ * wakeup related duties that follow. This allows other tasks to acquire
+ * the lock sooner, while the wakeup housekeeping for this unlock is
+ * still being handled. This can be done as we do not enforce strict
+ * equivalence between the mutex counter and wait_list.
+ *
+ * Some architectures leave the lock unlocked in the fastpath failure
* case, others need to leave it locked. In the latter case we have to
- * unlock it here
+ * unlock it here - as the lock counter is currently 0 or negative.
*/
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
@@ -719,7 +755,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
__visible void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
- __mutex_unlock_common_slowpath(lock_count, 1);
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_unlock_common_slowpath(lock, 1);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -823,6 +861,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
unsigned long flags;
int prev;
+ /* No need to trylock if the mutex is locked. */
+ if (mutex_is_locked(lock))
+ return 0;
+
spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..5cda397607f2 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current;
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index be9ee1559fca..dc85ee23a26f 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,10 +1,6 @@
-
#include <linux/percpu.h>
-#include <linux/mutex.h>
#include <linux/sched.h>
-#include "mcs_spinlock.h"
-
-#ifdef CONFIG_SMP
+#include <linux/osq_lock.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -79,7 +75,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
break;
}
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
return next;
@@ -102,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/*
* Normally @prev is untouchable after the above store; because at that
@@ -113,14 +109,14 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!smp_load_acquire(&node->locked)) {
+ while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
*/
if (need_resched())
goto unqueue;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
return true;
@@ -146,13 +142,13 @@ unqueue:
if (smp_load_acquire(&node->locked))
return true;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
/*
* Or we race against a concurrent unqueue()'s step-B, in which
* case its step-C will write us a new @node->prev pointer.
*/
- prev = ACCESS_ONCE(node->prev);
+ prev = READ_ONCE(node->prev);
}
/*
@@ -174,8 +170,8 @@ unqueue:
* it will wait in Step-A.
*/
- ACCESS_ONCE(next->prev) = prev;
- ACCESS_ONCE(prev->next) = next;
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
return false;
}
@@ -197,14 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
return;
}
next = osq_wait_next(lock, node, NULL);
if (next)
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
}
-
-#endif
-
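
The ACCESS_ONCE() -> READ_ONCE()/WRITE_ONCE() conversions running through this patch all come down to forcing a single, untorn load or store through a volatile access. Simplified single-width versions of the macros; the real kernel ones also handle accesses wider than a machine word:

#include <stdio.h>

/*
 * A volatile access forces exactly one load or store and stops the
 * compiler from caching, re-reading, tearing, or fusing the access.
 */
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

static int shared;

int main(void)
{
	WRITE_ONCE(shared, 42);			/* one store, never elided */
	printf("%d\n", READ_ONCE(shared));	/* one load, never cached */
	return 0;
}
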
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fb5b8ac411a5..f956ede7f90d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,7 +20,6 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <linux/mutex.h>
#include <asm/qrwlock.h>
/**
@@ -35,7 +34,7 @@ static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
cnts = smp_load_acquire((u32 *)&lock->cnts);
}
}
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
* to make sure that the write lock isn't taken.
*/
while (atomic_read(&lock->cnts) & _QW_WMASK)
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
cnts | _QW_WAITING) == cnts))
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
/* When no more readers, set the locked flag */
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
_QW_LOCKED) == _QW_WAITING))
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
unlock:
arch_spin_unlock(&lock->lock);
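
The reader-side spin being edited above just re-reads the count until the writer byte clears. Restated as a compilable sketch, with the qrwlock layout constants declared locally for the sketch rather than taken from any header:

#include <stdatomic.h>
#include <stdio.h>

#define _QW_LOCKED	0xffU		/* writer holds the lock */
#define _QW_WMASK	0xffU		/* writer byte mask */

static atomic_uint cnts;		/* reader count above the writer byte */

/* Shape of rspin_until_writer_unlock(): keep re-reading the count with
 * acquire semantics until the writer byte clears. */
static void rspin_until_writer_unlock(unsigned int c)
{
	while ((c & _QW_WMASK) == _QW_LOCKED)
		c = atomic_load_explicit(&cnts, memory_order_acquire);
}

int main(void)
{
	atomic_store(&cnts, 1U << 8);	/* one reader, no writer */
	rspin_until_writer_unlock(atomic_load(&cnts));
	puts("reader proceeds");
	return 0;
}
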
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
* the deadlock. We print when we return. act_waiter can be NULL in
* case of a remove waiter operation.
*/
-void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *act_waiter,
struct rt_mutex *lock)
{
struct task_struct *task;
- if (!debug_locks || detect || !act_waiter)
+ if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
return;
task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index ab29b6a22669..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
struct rt_mutex *lock);
extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
# define debug_rt_mutex_reset_waiter(w) \
do { (w)->deadlock_lock = NULL; } while (0)
-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- int detect)
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk walk)
{
return (waiter != NULL);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fc605941b9b8..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
- * See Documentation/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
#include <linux/export.h>
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
}
/*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the chwalk argument is RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the chwalk argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the chwalk argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+{
+ /*
+ * This is just a wrapper function for the following call,
+ * because debug_rt_mutex_detect_deadlock() smells like a magic
+ * debug feature and I wanted to keep the cond function in the
+ * main source file along with the comments instead of having
+ * two of the same in the headers.
+ */
+ return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
* Max number of times we'll walk the boosting chain:
*/
int max_lock_depth = 1024;
@@ -323,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
*
* @task: the task owning the mutex (owner) for which a chain walk is
* probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk: do we have to carry out deadlock detection?
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* @top_task: the current top waiter
*
* Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+ * @orig_lock if != NULL @top_task is blocked on it
+ * @next_lock Unprotected. Cannot be
+ * dereferenced. Only used for
+ * comparison.
+ * @orig_waiter if != NULL @top_task is blocked on it
+ * @top_task current, or in case of proxy
+ * locking protected by calling
+ * code
+ * again:
+ * loop_sanity_check();
+ * retry:
+ * [1] lock(task->pi_lock); [R] acquire [P]
+ * [2] waiter = task->pi_blocked_on; [P]
+ * [3] check_exit_conditions_1(); [P]
+ * [4] lock = waiter->lock; [P]
+ * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+ * unlock(task->pi_lock); release [P]
+ * goto retry;
+ * }
+ * [6] check_exit_conditions_2(); [P] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+ * [8] unlock(task->pi_lock); release [P]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+ * lock(task->pi_lock); [L] acquire [P]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12] check_exit_conditions_4(); [P] + [L]
+ * [13] unlock(task->pi_lock); release [P]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
+ enum rtmutex_chainwalk chwalk,
struct rt_mutex *orig_lock,
struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
- struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
- int detect_deadlock, ret = 0, depth = 0;
+ struct rt_mutex_waiter *prerequeue_top_waiter;
+ int ret = 0, depth = 0;
+ struct rt_mutex *lock;
+ bool detect_deadlock;
unsigned long flags;
+ bool requeue = true;
- detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
- deadlock_detect);
+ detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
/*
* The (de)boosting is a step by step approach with a lot of
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* carefully whether things change under us.
*/
again:
+ /*
+ * We limit the lock chain length for each invocation.
+ */
if (++depth > max_lock_depth) {
static int prev_max;
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
return -EDEADLK;
}
+
+ /*
+ * We are fully preemptible here and only hold the refcount on
+ * @task. So everything can have changed under us since the
+ * caller or our own code below (goto retry/again) dropped all
+ * locks.
+ */
retry:
/*
- * Task can not go away as we did a get_task() before !
+ * [1] Task cannot go away as we did a get_task() before!
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /*
+ * [2] Get the waiter on which @task is blocked.
+ */
waiter = task->pi_blocked_on;
+
+ /*
+ * [3] check_exit_conditions_1() protected by task->pi_lock.
+ */
+
/*
* Check whether the end of the boosting chain has been
* reached or the state of the chain has changed while we
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
* If deadlock detection is off, we stop here if we
- * are not the top pi waiter of the task.
+ * are not the top pi waiter of the task. If deadlock
+ * detection is enabled we continue, but stop the
+ * requeueing in the chain walk.
*/
- if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
- goto out_unlock_pi;
+ if (top_waiter != task_top_pi_waiter(task)) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
}
/*
- * When deadlock detection is off then we check, if further
- * priority adjustment is necessary.
+ * If the waiter priority is the same as the task priority
+ * then there is no further priority adjustment necessary. If
+ * deadlock detection is off, we stop the chain walk. If it's
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
*/
- if (!detect_deadlock && waiter->prio == task->prio)
- goto out_unlock_pi;
+ if (waiter->prio == task->prio) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ /*
+ * [4] Get the next lock
+ */
lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
+ */
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
cpu_relax();
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
+ * [6] check_exit_conditions_2() protected by task->pi_lock and
+ * lock->wait_lock.
+ *
* Deadlock detection. If the lock is the same as the original
* lock which caused us to walk the lock chain or if the
* current lock is owned by the task which initiated the chain
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
goto out_unlock_pi;
}
- top_waiter = rt_mutex_top_waiter(lock);
+ /*
+ * If we just follow the lock chain for deadlock detection, no
+ * need to do all the requeue operations. To avoid a truckload
+ * of conditionals around the various places below, just do the
+ * minimum chain walk checks.
+ */
+ if (!requeue) {
+ /*
+ * No requeue [7] here. Just release @task [8].
+ */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ * If there is no owner of the lock, end of chain.
+ */
+ if (!rt_mutex_owner(lock)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * No requeue [11] here. We just do deadlock detection.
+ *
+ * [12] Store whether owner is blocked
+ * itself. Decision is made after dropping the locks
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Get the top waiter for the next iteration
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* If owner is not blocked, end of chain. */
+ if (!next_lock)
+ goto out_put_task;
+ goto again;
+ }
- /* Requeue the waiter */
+ /*
+ * Store the current top waiter before doing the requeue
+ * operation on @lock. We need it for the boost/deboost
+ * decision below.
+ */
+ prerequeue_top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [7] Requeue the waiter in the lock waiter list. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
- /* Release the task */
+ /* [8] Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ *
+ * We must abort the chain walk if there is no lock owner even
+ * in the deadlock detection case, as we have nothing to
+ * follow here. This is the end of the chain we are walking.
+ */
if (!rt_mutex_owner(lock)) {
/*
- * If the requeue above changed the top waiter, then we need
- * to wake the new top waiter up to try to get the lock.
+ * If the requeue [7] above changed the top waiter,
+ * then we need to wake the new top waiter up to try
+ * to get the lock.
*/
-
- if (top_waiter != rt_mutex_top_waiter(lock))
+ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
raw_spin_unlock(&lock->wait_lock);
- goto out_put_task;
+ return 0;
}
- put_task_struct(task);
- /* Grab the next task */
+ /* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
- /* Boost the owner */
- rt_mutex_dequeue_pi(task, top_waiter);
+ /*
+ * The waiter became the new top (highest priority)
+ * waiter on the lock. Replace the previous top waiter
+ * in the owner task's pi waiters list with this waiter
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
- } else if (top_waiter == waiter) {
- /* Deboost the owner */
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+ * The waiter was the top waiter on the lock, but is
+ * no longer the top priority waiter. Replace waiter in
+ * the owner task's pi waiters list with the new top
+ * (highest priority) waiter and adjust the priority
+ * of the owner.
+ * The new top waiter is stored in @waiter so that
+ * @waiter == @top_waiter evaluates to true below and
+ * we continue to deboost the rest of the chain.
+ */
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+ * adjustment.
+ */
}
/*
+ * [12] check_exit_conditions_4() protected by task->pi_lock
+ * and lock->wait_lock. The actual decisions are made after we
+ * dropped the locks.
+ *
* Check whether the task which owns the current lock is pi
* blocked itself. If yes we store a pointer to the lock for
* the lock chain change detection above. After we dropped
* task->pi_lock next_lock cannot be dereferenced anymore.
*/
next_lock = task_blocked_on_lock(task);
+ /*
+ * Store the top waiter of @lock for the end of chain walk
+ * decision below.
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+ /* [13] Drop the locks */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- top_waiter = rt_mutex_top_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
/*
+ * Make the actual exit decisions [12], based on the stored
+ * values.
+ *
* We reached the end of the lock chain. Stop right here. No
* point to go back just to figure that out.
*/
if (!next_lock)
goto out_put_task;
+ /*
+ * If the current waiter is not the top waiter on the lock,
+ * then we can stop the chain walk here if we are not in full
+ * deadlock detection mode.
+ */
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*
* Must be called with lock->wait_lock held.
*
- * @lock: the lock to be acquired.
- * @task: the task which wants to acquire the lock
- * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
+ * @lock: The lock to be acquired.
+ * @task: The task which wants to acquire the lock
+ * @waiter: The waiter that is queued to the lock's wait list if the
+ * callsite called task_blocked_on_lock(), otherwise NULL
*/
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter)
{
+ unsigned long flags;
+
/*
- * We have to be careful here if the atomic speedups are
- * enabled, such that, when
- * - no other waiter is on the lock
- * - the lock has been released since we did the cmpxchg
- * the lock can be released or taken while we are doing the
- * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ * Before testing whether we can acquire @lock, we set the
+ * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+ * other tasks which try to modify @lock into the slow path
+ * and they serialize on @lock->wait_lock.
+ *
+ * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+ * as explained at the top of this file if and only if:
*
- * The atomic acquire/release aware variant of
- * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
- * the WAITERS bit, the atomic release / acquire can not
- * happen anymore and lock->wait_lock protects us from the
- * non-atomic case.
+ * - There is a lock owner. The caller must fix up the
+ * transient state if it does a trylock or leaves the lock
+ * function due to a signal or timeout.
*
- * Note, that this might set lock->owner =
- * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
- * any more. This is fixed up when we take the ownership.
- * This is the transitional state explained at the top of this file.
+ * - @task acquires the lock and there are no other
+ * waiters. This is undone in rt_mutex_set_owner(@task) at
+ * the end of this function.
*/
mark_rt_mutex_waiters(lock);
+ /*
+ * If @lock has an owner, give up.
+ */
if (rt_mutex_owner(lock))
return 0;
/*
- * It will get the lock because of one of these conditions:
- * 1) there is no waiter
- * 2) higher priority than waiters
- * 3) it is top waiter
+ * If @waiter != NULL, @task has already enqueued the waiter
+ * into @lock's waiter list. If @waiter == NULL then this is a
+ * trylock attempt.
*/
- if (rt_mutex_has_waiters(lock)) {
- if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
- if (!waiter || waiter != rt_mutex_top_waiter(lock))
- return 0;
- }
- }
-
- if (waiter || rt_mutex_has_waiters(lock)) {
- unsigned long flags;
- struct rt_mutex_waiter *top;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (waiter) {
+ /*
+ * If waiter is not the highest priority waiter of
+ * @lock, give up.
+ */
+ if (waiter != rt_mutex_top_waiter(lock))
+ return 0;
- /* remove the queued waiter. */
- if (waiter) {
- rt_mutex_dequeue(lock, waiter);
- task->pi_blocked_on = NULL;
- }
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters list.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
/*
- * We have to enqueue the top waiter(if it exists) into
- * task->pi_waiters list.
+ * If the lock already has waiters, we check whether @task is
+ * eligible to take over the lock.
+ *
+ * If there are no other waiters, @task can acquire
+ * the lock. @task->pi_blocked_on is NULL, so it does
+ * not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- top = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(task, top);
+ /*
+ * If @task->prio is greater than or equal to
+ * the top waiter priority (kernel view),
+ * @task lost.
+ */
+ if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ return 0;
+
+ /*
+ * The current top waiter stays enqueued. We
+ * don't have to change anything in the lock
+ * waiters order.
+ */
+ } else {
+ /*
+ * No waiters. Take the lock without the
+ * pi_lock dance. @task->pi_blocked_on is NULL
+ * and we have no waiters to enqueue in @task's
+ * pi waiters list.
+ */
+ goto takeit;
}
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
+ /*
+ * Clear @task->pi_blocked_on. Requires protection by
+ * @task->pi_lock. Redundant operation for the @waiter == NULL
+ * case, but conditionals are more expensive than a redundant
+ * store.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ task->pi_blocked_on = NULL;
+ /*
+ * Finish the lock acquisition. @task is the new owner. If
+ * other waiters exist we have to insert the highest priority
+ * waiter into @task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+takeit:
/* We got the lock. */
debug_rt_mutex_lock(lock);
+ /*
+ * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+ * are still waiters or clears it.
+ */
rt_mutex_set_owner(lock, task);
rt_mutex_deadlock_account_lock(lock, task);
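
The transitional RT_MUTEX_HAS_WAITERS state described above comes down to a cmpxchg loop that ORs a flag bit into the owner word. A userspace sketch of the idea, assuming the lowest owner-pointer bit carries the flag (mark_rt_mutex_waiters() is the real kernel helper):

    #define HAS_WAITERS 1UL

    /* OR the waiter bit into *owner without losing concurrent updates. */
    static void mark_waiters(unsigned long *owner)
    {
            unsigned long old;

            do {
                    old = *owner;
            } while (!__sync_bool_compare_and_swap(owner, old,
                                                   old | HAS_WAITERS));
    }
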
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
}
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+ res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
static void remove_waiter(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter)
{
- int first = (waiter == rt_mutex_top_waiter(lock));
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock = NULL;
+ struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- if (!owner)
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (first) {
-
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
- rt_mutex_dequeue_pi(owner, waiter);
+ rt_mutex_dequeue_pi(owner, waiter);
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- next = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(owner, next);
- }
- __rt_mutex_adjust_prio(owner);
+ __rt_mutex_adjust_prio(owner);
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Don't walk the chain if the owner task is not blocked
+ * itself.
+ */
if (!next_lock)
return;
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
- rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
}
/**
@@ -873,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
}
+ __set_current_state(TASK_RUNNING);
return ret;
}
@@ -902,7 +1160,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
int ret = 0;
@@ -928,16 +1186,17 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
timeout->task = NULL;
}
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
if (likely(!ret))
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret)) {
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+ __set_current_state(TASK_RUNNING);
+ if (rt_mutex_has_waiters(lock))
+ remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
/*
@@ -960,22 +1219,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/*
* Slow path try-lock function:
*/
-static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
- int ret = 0;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+ /*
+ * The mutex currently has no owner. Lock the wait lock and
+ * try to acquire the lock.
+ */
raw_spin_lock(&lock->wait_lock);
- if (likely(rt_mutex_owner(lock) != current)) {
+ ret = try_to_take_rt_mutex(lock, current, NULL);
- ret = try_to_take_rt_mutex(lock, current, NULL);
- /*
- * try_to_take_rt_mutex() sets the lock waiters
- * bit unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
- }
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -1053,30 +1321,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int detect_deadlock,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, NULL, detect_deadlock);
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout, int detect_deadlock,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+ likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, timeout, detect_deadlock);
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
@@ -1109,54 +1378,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
{
might_sleep();
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
/**
* rt_mutex_lock_interruptible - lock a rt_mutex interruptible
*
- * @lock: the rt_mutex to be locked
- * @detect_deadlock: deadlock detection on/off
+ * @lock: the rt_mutex to be locked
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ * 0 on success
+ * -EINTR when interrupted by a signal
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
- int detect_deadlock)
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
might_sleep();
- return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
- detect_deadlock, rt_mutex_slowlock);
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
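
With the detect_deadlock argument gone, call sites shrink to the obvious form. A hedged caller sketch (take_or_bail is hypothetical):

    static int take_or_bail(struct rt_mutex *lock)
    {
            int ret = rt_mutex_lock_interruptible(lock);

            if (ret)                /* -EINTR: a signal arrived */
                    return ret;
            /* ... critical section ... */
            rt_mutex_unlock(lock);
            return 0;
    }
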
+/*
+ * Futex variant with full deadlock detection.
+ */
+int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_FULL_CHAINWALK,
+ rt_mutex_slowlock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
* by the caller
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
- * @detect_deadlock: deadlock detection on/off
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * 0 on success
+ * -EINTR when interrupted by a signal
* -ETIMEDOUT when the timeout expired
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
{
might_sleep();
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- detect_deadlock, rt_mutex_slowlock);
+ RT_MUTEX_MIN_CHAINWALK,
+ rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1262,7 +1538,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
* @lock: the rt_mutex to take
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
* Returns:
* 0 - task blocked on lock
@@ -1273,7 +1548,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task, int detect_deadlock)
+ struct task_struct *task)
{
int ret;
@@ -1285,7 +1560,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
}
/* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
if (ret && !rt_mutex_owner(lock)) {
/*
@@ -1331,22 +1607,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
* rt_mutex_finish_proxy_lock() - Complete lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
- * been started.
+ * been started.
* @waiter: the pre-initialized rt_mutex_waiter
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
 * Complete the lock acquisition started on our behalf by another thread.
*
* Returns:
* 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ * <0 - error, one of -EINTR, -ETIMEDOUT
*
* Special API call for PI-futex requeue support
*/
int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ struct rt_mutex_waiter *waiter)
{
int ret;
@@ -1354,10 +1628,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret))
remove_waiter(lock, waiter);
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index f6a1f3c133b1..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
#define debug_rt_mutex_init(m, n) do { } while (0)
#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
{
WARN(1, "rtmutex deadlock detected\n");
}
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+ enum rtmutex_chainwalk walk)
+{
+ return walk == RT_MUTEX_FULL_CHAINWALK;
+}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
}
/*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
+ * no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
+ * walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+ RT_MUTEX_MIN_CHAINWALK,
+ RT_MUTEX_FULL_CHAINWALK,
+};
+
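
task_blocks_on_rt_mutex() above calls rt_mutex_cond_detect_deadlock(), which is not shown in these hunks; presumably it is a thin wrapper mapping the enum onto the debug helper, roughly:

    static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
                                              enum rtmutex_chainwalk chwalk)
    {
            /* Debug builds force a full walk for every waiter; otherwise
             * only an explicit RT_MUTEX_FULL_CHAINWALK request does. */
            return debug_rt_mutex_detect_deadlock(waiter, chwalk);
    }
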
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- int detect_deadlock);
+ struct task_struct *task);
extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock);
+ struct rt_mutex_waiter *waiter);
+extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
list_del(&waiter->list);
tsk = waiter->task;
+ /*
+ * Make sure we do not wake up the next reader before
+ * setting the NULL condition that grants it the lock;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
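
The barrier documents a publish-then-release ordering: the grant state must be visible before waiter->task is cleared, because the NULL write alone is what releases the sleeper. A userspace sketch of both sides with C11 atomics (all names hypothetical):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct waiter {
            _Atomic bool granted;   /* stands in for the rwsem grant state */
            _Atomic bool parked;    /* stands in for waiter->task != NULL */
    };

    static void waker(struct waiter *w)
    {
            atomic_store_explicit(&w->granted, true, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);  /* the smp_mb() */
            atomic_store_explicit(&w->parked, false, memory_order_relaxed);
    }

    static void sleeper(struct waiter *w)
    {
            while (atomic_load(&w->parked))
                    ;               /* schedule() in the kernel */
            /* w->granted is guaranteed to read true here. */
    }
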
@@ -154,7 +161,7 @@ void __sched __down_read(struct rw_semaphore *sem)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
}
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
out:
;
}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a2391ac135c8..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
-#include "mcs_spinlock.h"
+#include "rwsem.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+ /*
+ * Make sure we do not wake up the next reader before
+ * setting the NULL condition that grants it the lock;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
@@ -242,23 +250,26 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
schedule();
}
- tsk->state = TASK_RUNNING;
-
+ __set_task_state(tsk, TASK_RUNNING);
return sem;
}
+EXPORT_SYMBOL(rwsem_down_read_failed);
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* try acquiring the write lock */
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
- return true;
- }
+ /*
+ * Try acquiring the write lock. Check count first in order
+ * to reduce unnecessary expensive cmpxchg() operations.
+ */
+ if (count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ rwsem_set_owner(sem);
+ return true;
}
+
return false;
}
@@ -268,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = ACCESS_ONCE(sem->count);
+ long old, count = READ_ONCE(sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count)
+ if (old == count) {
+ rwsem_set_owner(sem);
return true;
+ }
count = old;
}
@@ -285,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = false;
+ bool ret = true;
if (need_resched())
return false;
rcu_read_lock();
- owner = ACCESS_ONCE(sem->owner);
- if (owner)
- on_cpu = owner->on_cpu;
- rcu_read_unlock();
-
- /*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath, then there is a possibility reader(s) may have the lock.
- * To be safe, avoid spinning in these situations.
- */
- return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- if (sem->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * sem->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
+ owner = READ_ONCE(sem->owner);
+ if (!owner) {
+ long count = READ_ONCE(sem->count);
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath with the lock being active, then there is a possibility
+ * reader(s) may have the lock. To be safe, avoid spinning in these
+ * situations.
+ */
+ if (count & RWSEM_ACTIVE_MASK)
+ ret = false;
+ goto done;
+ }
- return owner->on_cpu;
+ ret = owner->on_cpu;
+done:
+ rcu_read_unlock();
+ return ret;
}
static noinline
bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
{
+ long count;
+
rcu_read_lock();
- while (owner_running(sem, owner)) {
- if (need_resched())
- break;
+ while (sem->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
- arch_mutex_cpu_relax();
+ /* abort spinning when need_resched or owner is not running */
+ if (!owner->on_cpu || need_resched()) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ cpu_relax_lowlatency();
}
rcu_read_unlock();
+ if (READ_ONCE(sem->owner))
+ return true; /* new owner, continue spinning */
+
/*
- * We break out the loop above on need_resched() or when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when sem->owner is NULL.
+ * When the owner is not set, the lock could be free or
+ * held by readers. Check the counter to verify the
+ * state.
*/
- return sem->owner == NULL;
+ count = READ_ONCE(sem->count);
+ return (count == 0 || count == RWSEM_WAITING_BIAS);
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -356,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
goto done;
while (true) {
- owner = ACCESS_ONCE(sem->owner);
+ owner = READ_ONCE(sem->owner);
if (owner && !rwsem_spin_on_owner(sem, owner))
break;
@@ -381,7 +401,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
osq_unlock(&sem->osq);
done:
@@ -430,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = ACCESS_ONCE(sem->count);
+ count = READ_ONCE(sem->count);
/*
* If there were already threads queued before us and there are
@@ -465,6 +485,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
return sem;
}
+EXPORT_SYMBOL(rwsem_down_write_failed);
/*
* handle waking up a waiter on the semaphore
@@ -485,6 +506,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
return sem;
}
+EXPORT_SYMBOL(rwsem_wake);
/*
* downgrade a write lock into a read lock
@@ -506,8 +528,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
return sem;
}
-
-EXPORT_SYMBOL(rwsem_down_read_failed);
-EXPORT_SYMBOL(rwsem_down_write_failed);
-EXPORT_SYMBOL(rwsem_wake);
EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rwsem.h>
-
#include <linux/atomic.h>
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
/*
* lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
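
These helpers bracket write ownership for the optimistic spinners; a hedged sketch of how the rwsem.c lock/unlock paths pair with them, assuming __down_write/__up_write are the arch fast-path primitives:

    void down_write_sketch(struct rw_semaphore *sem)
    {
            __down_write(sem);          /* arch fast path */
            rwsem_set_owner(sem);       /* record owner for spinners */
    }

    void up_write_sketch(struct rw_semaphore *sem)
    {
            rwsem_clear_owner(sem);     /* stop spinners before release */
            __up_write(sem);
    }
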
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..b8120abe594b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
-static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
/**
@@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock);
/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
- * @jiffies: how long to wait before failing
+ * @timeout: how long to wait before failing
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long jiffies)
+int down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
@@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies)
if (likely(sem->count > 0))
sem->count--;
else
- result = __down_timeout(sem, jiffies);
+ result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem)
return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
-static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
- return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
static noinline void __sched __up(struct semaphore *sem)
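
Only the parameter name changed; the unit is still jiffies. A minimal caller sketch (grab_with_timeout is hypothetical):

    /* Returns 0 on success, -ETIME if 500ms pass without the semaphore. */
    static int grab_with_timeout(struct semaphore *sem)
    {
            return down_timeout(sem, msecs_to_jiffies(500));
    }
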
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
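
The new export backs a spin_lock_bh_nested() wrapper (assumed to be added alongside). A hedged usage sketch for two locks of the same lockdep class taken with BH disabled:

    static void move_entry(spinlock_t *src, spinlock_t *dst)
    {
            spin_lock_bh(src);
            spin_lock_bh_nested(dst, SINGLE_DEPTH_NESTING);
            /* ... transfer state from src to dst ... */
            spin_unlock_bh(dst);
            spin_unlock_bh(src);
    }
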
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..42a1d2afb217 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -60,7 +59,6 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
-#include <linux/fips.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -99,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -136,7 +134,7 @@ static int param_set_bool_enable_only(const char *val,
}
static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool_enable_only,
.get = param_get_bool,
};
@@ -159,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-int register_module_notifier(struct notifier_block * nb)
+int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);
-int unregister_module_notifier(struct notifier_block * nb)
+int unregister_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
@@ -389,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms,
pr_warn("Symbol %s is marked as UNUSED, however this module is "
"using it.\n", fsa->name);
pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evalute if this is the right api to use and if "
- "it really is, submit a report the linux kernel "
- "mailinglist together with submitting your code for "
+ pr_warn("Please evaluate if this is the right api to use and "
+ "if it really is, submit a report to the linux kernel "
+ "mailing list together with submitting your code for "
"inclusion.\n");
}
#endif
@@ -629,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
+/* MODULE_REF_BASE is the base reference count held by the module loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_inc(&mod->refcnt);
return 0;
}
@@ -722,8 +725,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -741,60 +742,48 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone may drop their reference right now; re-check and
+  * restore the base count unless the refcnt already hit zero. */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}
/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
+ mod->state = MODULE_STATE_GOING;
- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}
-unsigned long module_refcount(struct module *mod)
+/**
+ * module_refcount - return the refcount or -1 if unloading
+ *
+ * @mod: the module we're checking
+ *
+ * Returns:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
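
The arithmetic is easiest to see laid out, assuming MODULE_REF_BASE == 1 as defined above:

    /*
     * state                           mod->refcnt   module_refcount()
     * freshly loaded, no users        1             0
     * after one try_module_get()      2             1
     * unloading (base ref released)   0             -1
     */
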
@@ -876,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
struct module_use *use;
int printed_something = 0;
- seq_printf(m, " %lu ", module_refcount(mod));
+ seq_printf(m, " %i ", module_refcount(mod));
- /* Always include a trailing , so userspace can differentiate
- between this and the old multi-field proc format. */
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
seq_printf(m, "%s,", use->source->name);
@@ -887,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
if (mod->init != NULL && mod->exit == NULL) {
printed_something = 1;
- seq_printf(m, "[permanent],");
+ seq_puts(m, "[permanent],");
}
if (!printed_something)
- seq_printf(m, "-");
+ seq_puts(m, "-");
}
void __symbol_put(const char *symbol)
@@ -926,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
}
static struct module_attribute modinfo_refcnt =
@@ -936,7 +927,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -949,11 +940,11 @@ bool try_module_get(struct module *module)
if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;
preempt_enable();
@@ -964,11 +955,12 @@ EXPORT_SYMBOL(try_module_get);
void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
@@ -979,7 +971,7 @@ EXPORT_SYMBOL(module_put);
static inline void print_unload_info(struct seq_file *m, struct module *mod)
{
/* We don't know the usage count, or what modules are using. */
- seq_printf(m, " - -");
+ seq_puts(m, " - -");
}
static inline void module_unload_free(struct module *mod)
@@ -1132,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc,
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1166,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs,
return 0;
bad_version:
- printk("%s: disagrees about version of symbol %s\n",
+ pr_warn("%s: disagrees about version of symbol %s\n",
mod->name, symname);
return 0;
}
@@ -1201,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1233,6 +1225,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
const unsigned long *crc;
int err;
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
mutex_lock(&module_mutex);
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
@@ -1289,15 +1287,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
}
-struct module_sect_attr
-{
+struct module_sect_attr {
struct module_attribute mattr;
char *name;
unsigned long address;
};
-struct module_sect_attrs
-{
+struct module_sect_attrs {
struct attribute_group grp;
unsigned int nsections;
struct module_sect_attr attrs[0];
@@ -1551,7 +1547,8 @@ static int module_add_modinfo_attrs(struct module *mod)
(attr->test && attr->test(mod))) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
++temp_attr;
}
}
@@ -1567,7 +1564,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
- sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
if (attr->free)
attr->free(mod);
}
@@ -1698,18 +1695,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1825,7 +1810,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
static void unset_module_init_ro_nx(struct module *mod) { }
#endif
-void __weak module_free(struct module *mod, void *module_region)
+void __weak module_memfree(void *module_region)
{
vfree(module_region);
}
@@ -1834,6 +1819,10 @@ void __weak module_arch_cleanup(struct module *mod)
{
}
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1843,7 +1832,9 @@ static void free_module(struct module *mod)
/* We leave it in list to prevent duplicate loads, but make sure
 * that no one uses it while it's being deconstructed. */
+ mutex_lock(&module_mutex);
mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1859,21 +1850,27 @@ static void free_module(struct module *mod)
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
- /* Free lock-classes: */
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
unset_module_core_ro_nx(mod);
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -1954,7 +1951,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
- printk("%s: please compile with -fno-common\n",
+ pr_warn("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
break;
@@ -2258,7 +2255,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum)
{
const Elf_Shdr *sec;
@@ -2314,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
mod->core_size += strtab_size;
+ mod->core_size = debug_align(mod->core_size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
info->index.str) | INIT_OFFSET_MASK;
+ mod->init_size = debug_align(mod->init_size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
}
/* Not having a signature is only an error if we're strict. */
- if (err < 0 && fips_enabled)
- panic("Module verification failed with error %d in FIPS mode\n",
- err);
if (err == -ENOKEY && !sig_enforce)
err = 0;
@@ -2483,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
return 0;
}
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -2498,11 +2511,12 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return err;
/* Suck in entire file: we'll want most of it. */
- info->hdr = vmalloc(info->len);
+ info->hdr = __vmalloc(info->len,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
vfree(info->hdr);
return -EFAULT;
}
@@ -2737,7 +2751,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
* This shouldn't happen with same compiler and binutils
* building all parts of the module.
*/
- printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+ pr_warn("%s: has both .ctors and .init_array.\n",
mod->name);
return -EINVAL;
}
@@ -2757,6 +2771,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
+ mod->trace_enums = section_objs(info, "_ftrace_enum_map",
+ sizeof(*mod->trace_enums),
+ &mod->num_trace_enums);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -2811,7 +2828,7 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
return -ENOMEM;
}
memset(ptr, 0, mod->init_size);
@@ -2956,8 +2973,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
- module_free(mod, mod->module_init);
- module_free(mod, mod->module_core);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
+ module_memfree(mod->module_core);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2989,6 +3007,12 @@ static bool finished_loading(const char *name)
struct module *mod;
bool ret;
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
ret = !mod || mod->state == MODULE_STATE_LIVE
@@ -3009,10 +3033,36 @@ static void do_mod_ctors(struct module *mod)
#endif
}
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/* For freeing module_init on success, in case kallsyms is traversing it */
+struct mod_initfree {
+ struct rcu_head rcu;
+ void *module_init;
+};
+
+static void do_free_init(struct rcu_head *head)
+{
+ struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
+ module_memfree(m->module_init);
+ kfree(m);
+}
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
{
int ret = 0;
+ struct mod_initfree *freeinit;
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->module_init = mod->module_init;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3025,16 +3075,7 @@ static int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
+ goto fail_free_freeinit;
}
if (ret > 0) {
pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3079,15 +3120,35 @@ static int do_init_module(struct module *mod)
mod->strtab = mod->core_strtab;
#endif
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
mod->module_init = NULL;
mod->init_size = 0;
mod->init_ro_size = 0;
mod->init_text_size = 0;
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this with preempt disabled. In all the failure paths,
+ * we call synchronize_rcu/synchronize_sched, but we don't want
+ * to slow down the success path, so use actual RCU here.
+ */
+ call_rcu(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
}
static int may_init_module(void)
@@ -3178,7 +3239,7 @@ out:
static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
- /* Check for magic 'dyndbg' arg */
+ /* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3218,7 +3279,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->sig_ok = info->sig_ok;
if (!mod->sig_ok) {
pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
+ "and/or required key missing - tainting "
"kernel\n", mod->name);
add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
@@ -3308,6 +3369,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
+
+ /* we can't deallocate the module until we clear memory protection */
+ unset_module_init_ro_nx(mod);
+ unset_module_core_ro_nx(mod);
+
ddebug_cleanup:
dynamic_debug_remove(info->debug);
synchronize_sched();
@@ -3323,8 +3389,13 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
module_deallocate(mod, info);
free_copy:
free_copy(info);
@@ -3385,7 +3456,9 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
*/
static inline int is_arm_mapping_symbol(const char *str)
{
- return str[0] == '$' && strchr("atd", str[1])
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
+ return str[0] == '$' && strchr("axtd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
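
For reference, strings the updated predicate now filters (illustrative examples, not from the patch):

    /*
     * "$a", "$x.0", "$t.foo"  - ARM/arm64 mapping symbols ('x' is new)
     * ".LANCHOR0", ".Lfoo"    - local compiler-generated labels (new)
     */
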
@@ -3448,8 +3521,7 @@ const char *module_address_lookup(unsigned long addr,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
if (modname)
*modname = mod->name;
ret = get_ksymbol(mod, addr, size, offset);
@@ -3473,8 +3545,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3499,8 +3570,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, size, offset);
@@ -3657,8 +3727,8 @@ static int m_show(struct seq_file *m, void *p)
/* Informative for users. */
seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading":
- mod->state == MODULE_STATE_COMING ? "Loading":
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
seq_printf(m, " 0x%pK", mod->module_core);
@@ -3667,7 +3737,7 @@ static int m_show(struct seq_file *m, void *p)
if (mod->taints)
seq_printf(m, " %s", module_flags(mod, buf));
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
return 0;
}
@@ -3764,8 +3834,7 @@ struct module *__module_address(unsigned long addr)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_core(addr, mod)
- || within_module_init(addr, mod))
+ if (within_module(addr, mod))
return mod;
}
return NULL;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+#endif /* CONFIG_SRCU */
+
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ task_lock(p);
ns = p->nsproxy;
+ p->nsproxy = new;
+ task_unlock(p);
- rcu_assign_pointer(p->nsproxy, new);
-
- if (ns && atomic_dec_and_test(&ns->count)) {
- /*
- * wait for others to get what they want from this nsproxy.
- *
- * cannot release this nsproxy via the call_rcu() since
- * put_mnt_ns() will want to sleep
- */
- synchronize_rcu();
+ if (ns && atomic_dec_and_test(&ns->count))
free_nsproxy(ns);
- }
}
void exit_task_namespaces(struct task_struct *p)
@@ -227,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
- const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_ns *ei;
struct file *file;
+ struct ns_common *ns;
int err;
file = proc_ns_fget(fd);
@@ -239,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = get_proc_ns(file_inode(file));
- ops = ei->ns_ops;
- if (nstype && (ops->type != nstype))
+ ns = get_proc_ns(file_inode(file));
+ if (nstype && (ns->ops->type != nstype))
goto out;
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -250,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
- err = ops->install(new_nsproxy, ei->ns);
+ err = ns->ops->install(new_nsproxy, ns);
if (err) {
free_nsproxy(new_nsproxy);
goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index 161402f0b517..b38bea9c466a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
else
cpumask = pinst->cpumask.pcpu;
- len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
- nr_cpu_ids);
- if (PAGE_SIZE - len < 2)
- len = -EINVAL;
- else
- len += sprintf(buf + len, "\n");
-
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n",
+ nr_cpu_ids, cpumask_bits(cpumask));
mutex_unlock(&pinst->lock);
- return len;
+ return len < PAGE_SIZE ? len : -EINVAL;
}
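
The %*pb specifier prints a bitmap, with the printf field width giving the number of significant bits; the rewrite relies on snprintf() returning the would-be length, so truncation maps cleanly to -EINVAL. The same pattern, standalone (show_online is hypothetical):

    static ssize_t show_online(char *buf)
    {
            return snprintf(buf, PAGE_SIZE, "%*pb\n",
                            nr_cpu_ids, cpumask_bits(cpu_online_mask));
    }
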
static ssize_t store_cpumask(struct padata_instance *pinst,
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
static bool crash_kexec_post_notifiers;
+int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -224,6 +225,8 @@ static const struct tnt tnts[] = {
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
{ TAINT_UNSIGNED_MODULE, 'E', ' ' },
+ { TAINT_SOFTLOCKUP, 'L', ' ' },
+ { TAINT_LIVEPATCH, 'K', ' ' },
};
/**
@@ -243,6 +246,8 @@ static const struct tnt tnts[] = {
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
* 'E' - Unsigned module has been loaded.
+ * 'L' - A soft lockup has previously occurred.
+ * 'K' - Kernel has been live patched.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -426,6 +431,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
if (args)
vprintk(args->fmt, args->args);
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
print_modules();
dump_stack();
print_oops_end_marker();
@@ -483,6 +499,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+core_param(panic_on_warn, panic_on_warn, int, 0644);
static int __init setup_crash_kexec_post_notifiers(char *s)
{
diff --git a/kernel/params.c b/kernel/params.c
index 1e52ca233fd9..a22d6a759b1a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -19,6 +19,7 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>
@@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
+static void param_check_unsafe(const struct kernel_param *kp)
+{
+ if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
+ pr_warn("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
+}
+
static int parse_one(char *param,
char *val,
const char *doing,
@@ -104,11 +114,12 @@ static int parse_one(char *param,
return 0;
/* No one handled NULL, so do it here. */
if (!val &&
- !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
+ !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
mutex_lock(&param_lock);
+ param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
return err;
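
param_check_unsafe() only fires for parameters flagged KERNEL_PARAM_FL_UNSAFE, which the _unsafe variants of the module_param macros (introduced alongside this change) set. A hedged sketch of a driver opting a dangerous knob into the taint-on-set behaviour (hypothetical parameter name):

/* Any write to this parameter warns and taints the kernel with TAINT_USER. */
static bool bypass_checks;
module_param_unsafe(bypass_checks, bool, 0644);
MODULE_PARM_DESC(bypass_checks, "Skip validation (taints the kernel)");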
@@ -162,9 +173,9 @@ static char *next_arg(char *args, char **param, char **val)
if (args[i-1] == '"')
args[i-1] = '\0';
}
- if (quoted && args[i-1] == '"')
- args[i-1] = '\0';
}
+ if (quoted && args[i-1] == '"')
+ args[i-1] = '\0';
if (args[i]) {
args[i] = '\0';
@@ -256,6 +267,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -317,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
EXPORT_SYMBOL(param_get_bool);
struct kernel_param_ops param_ops_bool = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
@@ -368,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
EXPORT_SYMBOL(param_set_bint);
struct kernel_param_ops param_ops_bint = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
};
@@ -502,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string);
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
-extern struct kernel_param __start___param[], __stop___param[];
-
struct param_attribute
{
struct module_attribute mattr;
@@ -551,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
mutex_lock(&param_lock);
+ param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
mutex_unlock(&param_lock);
if (!err)
@@ -592,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
const struct kernel_param *kp,
const char *name)
{
- struct module_param_attrs *new;
- struct attribute **attrs;
- int err, num;
+ struct module_param_attrs *new_mp;
+ struct attribute **new_attrs;
+ unsigned int i;
/* We don't bother calling this with invisible parameters. */
BUG_ON(!kp->perm);
if (!mk->mp) {
- num = 0;
- attrs = NULL;
- } else {
- num = mk->mp->num;
- attrs = mk->mp->grp.attrs;
+ /* First allocation. */
+ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
+ if (!mk->mp)
+ return -ENOMEM;
+ mk->mp->grp.name = "parameters";
+ /* NULL-terminated attribute array. */
+ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
+ GFP_KERNEL);
+ /* Caller will cleanup via free_module_param_attrs */
+ if (!mk->mp->grp.attrs)
+ return -ENOMEM;
}
- /* Enlarge. */
- new = krealloc(mk->mp,
- sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
- GFP_KERNEL);
- if (!new) {
- kfree(attrs);
- err = -ENOMEM;
- goto fail;
- }
- /* Despite looking like the typical realloc() bug, this is safe.
- * We *want* the old 'attrs' to be freed either way, and we'll store
- * the new one in the success case. */
- attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
- if (!attrs) {
- err = -ENOMEM;
- goto fail_free_new;
- }
+ /* Enlarge allocations. */
+ new_mp = krealloc(mk->mp,
+ sizeof(*mk->mp) +
+ sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ GFP_KERNEL);
+ if (!new_mp)
+ return -ENOMEM;
+ mk->mp = new_mp;
- /* Sysfs wants everything zeroed. */
- memset(new, 0, sizeof(*new));
- memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
- memset(&attrs[num], 0, sizeof(attrs[num]));
- new->grp.name = "parameters";
- new->grp.attrs = attrs;
+ /* Extra pointer for NULL terminator */
+ new_attrs = krealloc(mk->mp->grp.attrs,
+ sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
+ GFP_KERNEL);
+ if (!new_attrs)
+ return -ENOMEM;
+ mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- sysfs_attr_init(&new->attrs[num].mattr.attr);
- new->attrs[num].param = kp;
- new->attrs[num].mattr.show = param_attr_show;
- new->attrs[num].mattr.store = param_attr_store;
- new->attrs[num].mattr.attr.name = (char *)name;
- new->attrs[num].mattr.attr.mode = kp->perm;
- new->num = num+1;
+ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
+ mk->mp->attrs[mk->mp->num].param = kp;
+ mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ /* Do not allow runtime DAC changes to make param writable. */
+ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ else
+ mk->mp->attrs[mk->mp->num].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
+ mk->mp->num++;
/* Fix up all the pointers, since krealloc can move us */
- for (num = 0; num < new->num; num++)
- new->grp.attrs[num] = &new->attrs[num].mattr.attr;
- new->grp.attrs[num] = NULL;
-
- mk->mp = new;
+ for (i = 0; i < mk->mp->num; i++)
+ mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
+ mk->mp->grp.attrs[mk->mp->num] = NULL;
return 0;
-
-fail_free_new:
- kfree(new);
-fail:
- mk->mp = NULL;
- return err;
}
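
The rewrite grows both allocations in place and leaves mk->mp consistent on every early return, so the caller can always clean up via free_module_param_attrs(). A small user-space sketch of the same grow-by-one, NULL-terminated array idiom (illustrative names, not the kernel API):

#include <stdlib.h>

/* Append new_entry to a NULL-terminated pointer array, reallocating
 * to num+2 slots so there is room for the entry plus the terminator. */
static int append_entry(void ***arr, size_t *num, void *new_entry)
{
	void **grown = realloc(*arr, (*num + 2) * sizeof(*grown));

	if (!grown)
		return -1;		/* old array is still valid for cleanup */
	grown[*num] = new_entry;
	grown[*num + 1] = NULL;		/* sysfs-style NULL terminator */
	*arr = grown;
	(*num)++;
	return 0;
}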
#ifdef CONFIG_MODULES
static void free_module_param_attrs(struct module_kobject *mk)
{
- kfree(mk->mp->grp.attrs);
+ if (mk->mp)
+ kfree(mk->mp->grp.attrs);
kfree(mk->mp);
mk->mp = NULL;
}
@@ -684,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod,
if (kparam[i].perm == 0)
continue;
err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
- if (err)
+ if (err) {
+ free_module_param_attrs(&mod->mkobj);
return err;
+ }
params = true;
}
@@ -762,7 +771,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
}
static void __init kernel_add_sysfs_param(const char *name,
- struct kernel_param *kparam,
+ const struct kernel_param *kparam,
unsigned int name_skip)
{
struct module_kobject *mk;
@@ -797,7 +806,7 @@ static void __init kernel_add_sysfs_param(const char *name,
*/
static void __init param_sysfs_builtin(void)
{
- struct kernel_param *kp;
+ const struct kernel_param *kp;
unsigned int name_len;
char modname[MODULE_NAME_LEN];
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..4fd07d5b7baf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .proc_inum = PROC_PID_INIT_INO,
+ .ns.inum = PROC_PID_INIT_INO,
+#ifdef CONFIG_PID_NS
+ .ns.ops = &pidns_operations,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -179,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
spin_unlock_irq(&pidmap_lock);
kfree(page);
if (unlikely(!map->page))
- break;
+ return -ENOMEM;
}
if (likely(atomic_read(&map->nr_free))) {
for ( ; ; ) {
@@ -207,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
}
pid = mk_pid(pid_ns, map, offset);
}
- return -1;
+ return -EAGAIN;
}
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -298,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
+ int retval = -ENOMEM;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
- goto out;
+ return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
- if (nr < 0)
+ if (IS_ERR_VALUE(nr)) {
+ retval = nr;
goto out_free;
+ }
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
@@ -336,18 +342,18 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
spin_unlock_irq(&pidmap_lock);
-out:
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ put_pid_ns(ns);
+
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
- pid = NULL;
- goto out;
+ return ERR_PTR(retval);
}
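
alloc_pid() now tells its caller why it failed by encoding the errno in the returned pointer, to be tested with IS_ERR()/PTR_ERR(). A self-contained user-space sketch of that convention (simplified macros for illustration; the real ones live in linux/err.h):

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *alloc_thing(int fail)
{
	return fail ? ERR_PTR(-12 /* -ENOMEM */) : (void *)"ok";
}

int main(void)
{
	void *p = alloc_thing(1);

	if (IS_ERR(p))
		printf("alloc failed: errno %ld\n", -PTR_ERR(p));
	return 0;
}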
void disable_pid_allocation(struct pid_namespace *ns)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_map;
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_map;
+ ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
- /* Ignore SIGCHLD causing any terminated children to autoreap */
+ /*
+ * Ignore SIGCHLD causing any terminated children to autoreap.
+ * This speeds up the namespace shutdown, plus see the comment
+ * below.
+ */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
- /* Firstly reap the EXIT_ZOMBIE children we may have. */
+ /*
+ * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
+ * sys_wait4() will also block until our children traced from the
+ * parent namespace are detached and become EXIT_DEAD.
+ */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see free_pid().
+ * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * really care, we could reparent them to the global init. We could
+ * exit and reap ->child_reaper even if it is not the last thread in
+ * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * pid_ns cannot go away until proc_kill_sb() drops the reference.
+ *
+ * But this ns can also have other tasks injected by setns()+fork().
+ * Again, ignoring the user visible semantics we do not really need
+ * to wait until they are all reaped, but they can be reparented to
+ * us and thus we need to ensure that pid->child_reaper stays valid
+ * until they all go away. See free_pid()->wake_up_process().
+ *
+ * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
+ * if reparented.
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static void *pidns_get(struct task_struct *task)
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
+static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
get_pid_ns(ns);
rcu_read_unlock();
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void pidns_put(void *ns)
+static void pidns_put(struct ns_common *ns)
{
- put_pid_ns(ns);
+ put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, void *ns)
+static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = ns;
+ struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int pidns_inum(void *ns)
-{
- struct pid_namespace *pid_ns = ns;
- return pid_ns->proc_inum;
-}
-
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
- .inum = pidns_inum,
};
static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
+ select PM
config PM_SLEEP_SMP
def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
-config PM_RUNTIME
- bool "Run-time PM core functionality"
- depends on !IA64_HP_SIM
+config PM
+ bool "Device power management core functionality"
---help---
Enable functionality allowing I/O devices to be put into energy-saving
- (low power) states at run time (or autosuspended) after a specified
- period of inactivity and woken up in response to a hardware-generated
+ (low power) states, for example after a specified period of inactivity
+ (autosuspended), and woken up in response to a hardware-generated
wake-up event or a driver's request.
Hardware support is generally required for this functionality to work
and the bus type drivers of the buses the devices are on are
- responsible for the actual handling of the autosuspend requests and
+ responsible for the actual handling of device suspend requests and
wake-up events.
-config PM
- def_bool y
- depends on PM_SLEEP || PM_RUNTIME
-
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
@@ -253,11 +249,9 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config ARCH_HAS_OPP
- bool
-
config PM_OPP
bool
+ select SRCU
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
@@ -301,10 +295,9 @@ config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
-config PM_GENERIC_DOMAINS_RUNTIME
+config PM_GENERIC_DOMAINS_OF
def_bool y
- depends on PM_RUNTIME && PM_GENERIC_DOMAINS
+ depends on PM_GENERIC_DOMAINS && OF
config CPU_PM
bool
- depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <linux/ktime.h>
#include <trace/events/power.h>
#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
* @nr_pages: Number of memory pages processed between @start and @stop.
* @msg: Additional diagnostic message to print.
*/
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
- unsigned nr_pages, char *msg)
+void swsusp_show_speed(ktime_t start, ktime_t stop,
+ unsigned nr_pages, char *msg)
{
+ ktime_t diff;
u64 elapsed_centisecs64;
unsigned int centisecs;
unsigned int k;
unsigned int kps;
- elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
- /*
- * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
- * it is obvious enough for what went wrong.
- */
- do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ diff = ktime_sub(stop, start);
+ elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
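
With ktime_t the whole struct timeval round trip disappears; the elapsed-centiseconds computation reduces to the following (a sketch using the same helpers the patch relies on):

ktime_t start = ktime_get();
/* ... the work being timed ... */
ktime_t stop = ktime_get();
u64 centisecs = ktime_divns(ktime_sub(stop, start), 10 * NSEC_PER_MSEC);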
@@ -371,7 +369,6 @@ int hibernation_snapshot(int platform_mode)
}
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +394,6 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
dpm_complete(msg);
@@ -500,15 +496,19 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
- dpm_resume_end(PMSG_RECOVER);
+ /*
+ * The above should either succeed and jump to the new kernel,
+ * or return with an error. Otherwise things are just
+ * undefined, so let's be paranoid.
+ */
+ BUG_ON(!error);
}
+ dpm_resume_end(PMSG_RECOVER);
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -535,7 +535,6 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -579,7 +578,6 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- ftrace_start();
resume_console();
Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8e90f330f139..86e8157a450f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,7 +11,7 @@
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
-#include <linux/resume-trace.h>
+#include <linux/pm-trace.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (pm_states[i].state)
- s += sprintf(s,"%s ", pm_states[i].label);
+ if (pm_states[i])
+ s += sprintf(s,"%s ", pm_states[i]);
#endif
if (hibernation_available())
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_MIN;
- struct pm_sleep_state *s;
+ suspend_state_t state;
#endif
char *p;
int len;
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (s->state && len == strlen(s->label)
- && !strncmp(buf, s->label, len))
- return s->state;
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = pm_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
#endif
return PM_SUSPEND_ON;
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state].state ?
- pm_states[state].label : "error");
+ return sprintf(buf, "%s\n", pm_states[state] ?
+ pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-#ifdef CONFIG_PM_RUNTIME
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)
return pm_wq ? 0 : -ENOMEM;
}
-#else
-static inline int pm_start_workqueue(void) { return 0; }
-#endif
static int __init pm_init(void)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,17 +174,12 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
struct timeval;
/* kernel/power/swsusp.c */
-extern void swsusp_show_speed(struct timeval *, struct timeval *,
- unsigned int, char *);
+extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
-struct pm_sleep_state {
- const char *label;
- suspend_state_t state;
-};
-
/* kernel/power/suspend.c */
-extern struct pm_sleep_state pm_states[];
+extern const char *pm_labels[];
+extern const char *pm_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
while (true) {
todo = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p == current || !freeze_task(p))
continue;
if (!freezer_should_skip(p))
todo++;
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
if (!user_only) {
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = elapsed_msecs64;
if (todo) {
- printk("\n");
- printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
+ pr_cont("\n");
+ pr_err("Freezing of tasks %s after %d.%03d seconds "
"(%d tasks refusing to freeze, wq_busy=%d):\n",
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -93,15 +93,15 @@ static int try_to_freeze_tasks(bool user_only)
if (!wakeup) {
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
&& freezing(p) && !frozen(p))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
}
} else {
- printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
+ pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
}
@@ -129,17 +129,25 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
- printk("Freezing user space processes ... ");
+ pm_wakeup_clear();
+ pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
if (!error) {
- printk("done.");
__usermodehelper_set_disable_depth(UMH_DISABLED);
- oom_killer_disable();
+ pr_cont("done.");
}
- printk("\n");
+ pr_cont("\n");
BUG_ON(in_atomic());
+ /*
+ * Now that the whole userspace is frozen we need to disable
+ * the OOM killer to disallow any further interference with
+ * killable tasks.
+ */
+ if (!error && !oom_killer_disable())
+ error = -EBUSY;
+
if (error)
thaw_processes();
return error;
@@ -157,13 +165,14 @@ int freeze_kernel_threads(void)
{
int error;
- printk("Freezing remaining freezable tasks ... ");
+ pr_info("Freezing remaining freezable tasks ... ");
+
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
if (!error)
- printk("done.");
+ pr_cont("done.");
- printk("\n");
+ pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -184,17 +193,17 @@ void thaw_processes(void)
oom_killer_enable();
- printk("Restarting tasks ... ");
+ pr_info("Restarting tasks ... ");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -203,7 +212,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- printk("done.\n");
+ pr_cont("done.\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -212,17 +221,17 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- printk("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ... ");
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
schedule();
- printk("done.\n");
+ pr_cont("done.\n");
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
#include <linux/platform_device.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -105,11 +107,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
};
+static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
+static struct pm_qos_constraints memory_bw_constraints = {
+ .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
+ .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .type = PM_QOS_SUM,
+ .notifiers = &memory_bandwidth_notifier,
+};
+static struct pm_qos_object memory_bandwidth_pm_qos = {
+ .constraints = &memory_bw_constraints,
+ .name = "memory_bandwidth",
+};
+
+
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
- &network_throughput_pm_qos
+ &network_throughput_pm_qos,
+ &memory_bandwidth_pm_qos,
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +148,9 @@ static const struct file_operations pm_qos_power_fops = {
/* unlocked internal variant */
static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
+ struct plist_node *node;
+ int total_value = 0;
+
if (plist_head_empty(&c->list))
return c->no_constraint_value;
@@ -140,6 +161,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
case PM_QOS_MAX:
return plist_last(&c->list)->prio;
+ case PM_QOS_SUM:
+ plist_for_each(node, &c->list)
+ total_value += node->prio;
+
+ return total_value;
+
default:
/* runtime check for not using enum */
BUG();
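
PM_QOS_SUM aggregates by adding every request instead of taking a min or max, which fits additive resources such as bandwidth. A hedged sketch of a consumer driving the new class through the existing pm_qos request API (units are defined by the class):

static struct pm_qos_request mem_bw_req;

/* Ask for 100 units; the SUM class totals this with all other requests. */
pm_qos_add_request(&mem_bw_req, PM_QOS_MEMORY_BANDWIDTH, 100);
/* ... workload changes, raise the request, then drop it ... */
pm_qos_update_request(&mem_bw_req, 200);
pm_qos_remove_request(&mem_bw_req);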
@@ -157,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
+static inline int pm_qos_get_value(struct pm_qos_constraints *c);
+static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+{
+ struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
+ struct pm_qos_constraints *c;
+ struct pm_qos_request *req;
+ char *type;
+ unsigned long flags;
+ int tot_reqs = 0;
+ int active_reqs = 0;
+
+ if (IS_ERR_OR_NULL(qos)) {
+ pr_err("%s: bad qos param!\n", __func__);
+ return -EINVAL;
+ }
+ c = qos->constraints;
+ if (IS_ERR_OR_NULL(c)) {
+ pr_err("%s: Bad constraints on qos?\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Lock to ensure we have a snapshot */
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ if (plist_head_empty(&c->list)) {
+ seq_puts(s, "Empty!\n");
+ goto out;
+ }
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ type = "Minimum";
+ break;
+ case PM_QOS_MAX:
+ type = "Maximum";
+ break;
+ case PM_QOS_SUM:
+ type = "Sum";
+ break;
+ default:
+ type = "Unknown";
+ }
+
+ plist_for_each_entry(req, &c->list, node) {
+ char *state = "Default";
+
+ if ((req->node).prio != c->default_value) {
+ active_reqs++;
+ state = "Active";
+ }
+ tot_reqs++;
+ seq_printf(s, "%d: %d: %s\n", tot_reqs,
+ (req->node).prio, state);
+ }
+
+ seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
+ type, pm_qos_get_value(c), active_reqs, tot_reqs);
+
+out:
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return 0;
+}
+
+static int pm_qos_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pm_qos_dbg_show_requests,
+ inode->i_private);
+}
+
+static const struct file_operations pm_qos_debug_fops = {
+ .open = pm_qos_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
* if needed
@@ -484,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
/* User space interface to PM QoS classes via misc devices */
-static int register_pm_qos_misc(struct pm_qos_object *qos)
+static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
{
qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+ if (d) {
+ (void)debugfs_create_file(qos->name, S_IRUGO, d,
+ (void *)qos, &pm_qos_debug_fops);
+ }
+
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -583,11 +690,16 @@ static int __init pm_qos_power_init(void)
{
int ret = 0;
int i;
+ struct dentry *d;
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+ d = debugfs_create_dir("pm_qos", NULL);
+ if (IS_ERR_OR_NULL(d))
+ d = NULL;
+
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
- ret = register_pm_qos_misc(pm_qos_array[i]);
+ ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
printk(KERN_ERR "pm_qos_param: %s setup failed\n",
pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..5235dd4e1e2f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/compiler.h>
+#include <linux/ktime.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -248,33 +249,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
* information is stored (in the form of a block of bitmap)
* It also contains the pfns that correspond to the start and end of
* the represented memory area.
+ *
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
+ *
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leaf nodes. The linked leaf nodes are used for fast linear
+ * access of the memory bitmap.
+ *
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
-struct bm_block {
- struct list_head hook; /* hook into a list of bitmap blocks */
- unsigned long start_pfn; /* pfn represented by the first bit */
- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
- unsigned long *data; /* bitmap representing pages */
+/*
+ * struct rtree_node is a wrapper struct to link the nodes
+ * of the rtree together for easy linear iteration over
+ * bits and easy freeing
+ */
+struct rtree_node {
+ struct list_head list;
+ unsigned long *data;
};
-static inline unsigned long bm_block_bits(struct bm_block *bb)
-{
- return bb->end_pfn - bb->start_pfn;
-}
+/*
+ * struct mem_zone_bm_rtree represents a bitmap used for one
+ * populated memory zone.
+ */
+struct mem_zone_bm_rtree {
+ struct list_head list; /* Link Zones together */
+ struct list_head nodes; /* Radix Tree inner nodes */
+ struct list_head leaves; /* Radix Tree leaves */
+ unsigned long start_pfn; /* Zone start page frame */
+ unsigned long end_pfn; /* Zone end page frame + 1 */
+ struct rtree_node *rtree; /* Radix Tree Root */
+ int levels; /* Number of Radix Tree Levels */
+ unsigned int blocks; /* Number of Bitmap Blocks */
+};
/* struct bm_position is used for browsing memory bitmaps */
struct bm_position {
- struct bm_block *block;
- int bit;
+ struct mem_zone_bm_rtree *zone;
+ struct rtree_node *node;
+ unsigned long node_pfn;
+ int node_bit;
};
struct memory_bitmap {
- struct list_head blocks; /* list of bitmap blocks */
+ struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
@@ -284,38 +313,178 @@ struct memory_bitmap {
/* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+
+/*
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ * This function is used to allocate inner nodes as well as the
+ * leaf nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ struct list_head *list)
{
- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
- bm->cur.bit = 0;
-}
+ struct rtree_node *node;
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+ node = chain_alloc(ca, sizeof(struct rtree_node));
+ if (!node)
+ return NULL;
-/**
- * create_bm_block_list - create a list of block bitmap objects
- * @pages - number of pages to track
- * @list - list to put the allocated blocks into
- * @ca - chain allocator to be used for allocating memory
+ node->data = get_image_page(gfp_mask, safe_needed);
+ if (!node->data)
+ return NULL;
+
+ list_add_tail(&node->list, list);
+
+ return node;
+}
+
+/*
+ * add_rtree_block - Add a new leaf node to the radix tree
+ *
+ * The leaf nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
-static int create_bm_block_list(unsigned long pages,
- struct list_head *list,
- struct chain_allocator *ca)
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
+ int safe_needed, struct chain_allocator *ca)
{
- unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+ struct rtree_node *node, *block, **dst;
+ unsigned int levels_needed, block_nr;
+ int i;
+
+ block_nr = zone->blocks;
+ levels_needed = 0;
- while (nr_blocks-- > 0) {
- struct bm_block *bb;
+ /* How many levels do we need for this block nr? */
+ while (block_nr) {
+ levels_needed += 1;
+ block_nr >>= BM_RTREE_LEVEL_SHIFT;
+ }
- bb = chain_alloc(ca, sizeof(struct bm_block));
- if (!bb)
+ /* Make sure the rtree has enough levels */
+ for (i = zone->levels; i < levels_needed; i++) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
return -ENOMEM;
- list_add(&bb->hook, list);
+
+ node->data[0] = (unsigned long)zone->rtree;
+ zone->rtree = node;
+ zone->levels += 1;
+ }
+
+ /* Allocate new block */
+ block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+ if (!block)
+ return -ENOMEM;
+
+ /* Now walk the rtree to insert the block */
+ node = zone->rtree;
+ dst = &zone->rtree;
+ block_nr = zone->blocks;
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ if (!node) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+ *dst = node;
+ }
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ dst = (struct rtree_node **)&((*dst)->data[index]);
+ node = *dst;
}
+ zone->blocks += 1;
+ *dst = block;
+
return 0;
}
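
The insertion walk peels BM_RTREE_LEVEL_SHIFT bits off block_nr per level, most-significant level first. A standalone sketch of just that index math (constants hard-coded for 64-bit, 4 KiB pages, where each node holds 512 entries):

#include <stdio.h>

#define BM_BLOCK_SHIFT		15	/* PAGE_SHIFT + 3 */
#define BM_RTREE_LEVEL_SHIFT	9	/* PAGE_SHIFT - 3 */
#define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1)

int main(void)
{
	unsigned long pfn = 0x123456, start_pfn = 0;
	unsigned long block_nr = (pfn - start_pfn) >> BM_BLOCK_SHIFT;
	int i;

	/* Two levels are plenty for this block_nr. */
	for (i = 2; i > 0; i--)
		printf("level %d index: %lu\n", i,
		       (block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT)) &
		       BM_RTREE_LEVEL_MASK);
	return 0;
}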
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free);
+
+/*
+ * create_zone_bm_rtree - create a radix tree for one zone
+ *
+ * Allocates the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocates and builds the radix tree for the
+ * zone.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start, unsigned long end)
+{
+ struct mem_zone_bm_rtree *zone;
+ unsigned int i, nr_blocks;
+ unsigned long pages;
+
+ pages = end - start;
+ zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
+ if (!zone)
+ return NULL;
+
+ INIT_LIST_HEAD(&zone->nodes);
+ INIT_LIST_HEAD(&zone->leaves);
+ zone->start_pfn = start;
+ zone->end_pfn = end;
+ nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
+ free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+ return NULL;
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ free_image_page(node->data, clear_nosave_free);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ free_image_page(node->data, clear_nosave_free);
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
struct mem_extent {
struct list_head hook;
unsigned long start;
@@ -407,40 +576,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
int error;
chain_init(&ca, gfp_mask, safe_needed);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
error = create_mem_extents(&mem_extents, gfp_mask);
if (error)
return error;
list_for_each_entry(ext, &mem_extents, hook) {
- struct bm_block *bb;
- unsigned long pfn = ext->start;
- unsigned long pages = ext->end - ext->start;
-
- bb = list_entry(bm->blocks.prev, struct bm_block, hook);
+ struct mem_zone_bm_rtree *zone;
- error = create_bm_block_list(pages, bm->blocks.prev, &ca);
- if (error)
+ zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
+ ext->start, ext->end);
+ if (!zone) {
+ error = -ENOMEM;
goto Error;
-
- list_for_each_entry_continue(bb, &bm->blocks, hook) {
- bb->data = get_image_page(gfp_mask, safe_needed);
- if (!bb->data) {
- error = -ENOMEM;
- goto Error;
- }
-
- bb->start_pfn = pfn;
- if (pages >= BM_BITS_PER_BLOCK) {
- pfn += BM_BITS_PER_BLOCK;
- pages -= BM_BITS_PER_BLOCK;
- } else {
- /* This is executed only once in the loop */
- pfn += pages;
- }
- bb->end_pfn = pfn;
}
+ list_add_tail(&zone->list, &bm->zones);
}
bm->p_list = ca.chain;
@@ -460,51 +611,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
*/
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *zone;
- list_for_each_entry(bb, &bm->blocks, hook)
- if (bb->data)
- free_image_page(bb->data, clear_nosave_free);
+ list_for_each_entry(zone, &bm->zones, list)
+ free_zone_bm_rtree(zone, clear_nosave_free);
free_list_of_pages(bm->p_list, clear_nosave_free);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
}
/**
- * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
- * to given pfn. The cur_zone_bm member of @bm and the cur_block member
- * of @bm->cur_zone_bm are updated.
+ * memory_bm_find_bit - Find the bit for pfn in the memory
+ * bitmap
+ *
+ * Find the bit in the bitmap @bm that corresponds to given pfn.
+ * The cur.zone, cur.block and cur.node_pfn member of @bm are
+ * updated.
+ * It walks the radix tree to find the page which contains the bit for
+ * pfn and returns the containing page in *addr and the bit position
+ * in *bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
- void **addr, unsigned int *bit_nr)
+ void **addr, unsigned int *bit_nr)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *curr, *zone;
+ struct rtree_node *node;
+ int i, block_nr;
+
+ zone = bm->cur.zone;
+
+ if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+ goto zone_found;
+ zone = NULL;
+
+ /* Find the right zone */
+ list_for_each_entry(curr, &bm->zones, list) {
+ if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+ zone = curr;
+ break;
+ }
+ }
+
+ if (!zone)
+ return -EFAULT;
+
+zone_found:
/*
- * Check if the pfn corresponds to the current bitmap block and find
- * the block where it fits if this is not the case.
+ * We have a zone. Now walk the radix tree to find the leave
+ * node for our pfn.
*/
- bb = bm->cur.block;
- if (pfn < bb->start_pfn)
- list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn)
- break;
- if (pfn >= bb->end_pfn)
- list_for_each_entry_continue(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
- break;
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ goto node_found;
- if (&bb->hook == &bm->blocks)
- return -EFAULT;
+ node = zone->rtree;
+ block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ BUG_ON(node->data[index] == 0);
+ node = (struct rtree_node *)node->data[index];
+ }
+
+node_found:
+ /* Update last position */
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+
+ /* Set return values */
+ *addr = node->data;
+ *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
- /* The block has been found */
- bm->cur.block = bb;
- pfn -= bb->start_pfn;
- bm->cur.bit = pfn + 1;
- *bit_nr = pfn;
- *addr = bb->data;
return 0;
}
@@ -528,6 +711,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
+
return error;
}
@@ -542,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+ int bit;
+
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -561,38 +753,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
-/**
- * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
- * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
- * returned.
+/*
+ * rtree_next_node - Jumps to the next leaf node
*
- * It is required to run memory_bm_position_reset() before the first call to
- * this function.
+ * Sets the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
+ *
+ * Returns true if there is a next node, false otherwise.
*/
+static bool rtree_next_node(struct memory_bitmap *bm)
+{
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
+ touch_softlockup_watchdog();
+ return true;
+ }
+ /* No more nodes, goto next zone */
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ struct mem_zone_bm_rtree, list);
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+ return true;
+ }
+
+ /* No more zones */
+ return false;
+}
+
+/**
+ * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ * Starting from the last returned position this function searches
+ * for the next set bit in the memory bitmap and returns its
+ * number. If no more bits are set, BM_END_OF_MAP is returned.
+ *
+ * It is required to run memory_bm_position_reset() before the
+ * first call to this function.
+ */
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
- struct bm_block *bb;
+ unsigned long bits, pfn, pages;
int bit;
- bb = bm->cur.block;
do {
- bit = bm->cur.bit;
- bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
- if (bit < bm_block_bits(bb))
- goto Return_pfn;
-
- bb = list_entry(bb->hook.next, struct bm_block, hook);
- bm->cur.block = bb;
- bm->cur.bit = 0;
- } while (&bb->hook != &bm->blocks);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
+ if (bit < bits) {
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
+ return pfn;
+ }
+ } while (rtree_next_node(bm));
- memory_bm_position_reset(bm);
return BM_END_OF_MAP;
-
- Return_pfn:
- bm->cur.bit = bit + 1;
- return bb->start_pfn + bit;
}
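
A typical caller resets the position once and then pulls pfns until BM_END_OF_MAP, as swsusp_free() below does with two bitmaps in lock step. A sketch of the single-bitmap loop (handle_pfn is a hypothetical per-page handler):

static void for_each_set_pfn(struct memory_bitmap *bm)
{
	unsigned long pfn;

	memory_bm_position_reset(bm);
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		handle_pfn(pfn);
}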
/**
@@ -816,12 +1040,17 @@ void free_basic_memory_bitmaps(void)
unsigned int snapshot_additional_pages(struct zone *zone)
{
- unsigned int res;
+ unsigned int rtree, nodes;
+
+ rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+ LINKED_PAGE_DATA_SIZE);
+ while (nodes > 1) {
+ nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+ rtree += nodes;
+ }
- res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block),
- LINKED_PAGE_DATA_SIZE);
- return 2 * res;
+ return 2 * rtree;
}
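
A worked example, assuming 64-bit with 4 KiB pages (BM_BITS_PER_BLOCK = 32768, BM_ENTRIES_PER_LEVEL = 512): a zone spanning 1 GiB has 262144 pages, so 8 leaf blocks; the eight rtree_node structs fit in one extra linked page, and one level-1 node covers all eight leaves, giving rtree = 8 + 1 + 1 = 10 and a return value of 20 pages for the two bitmaps.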
#ifdef CONFIG_HIGHMEM
@@ -1094,23 +1323,39 @@ static struct memory_bitmap copy_bm;
void swsusp_free(void)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long fb_pfn, fr_pfn;
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (swsusp_page_is_forbidden(page) &&
- swsusp_page_is_free(page)) {
- swsusp_unset_page_forbidden(page);
- swsusp_unset_page_free(page);
- __free_page(page);
- }
- }
+ if (!forbidden_pages_map || !free_pages_map)
+ goto out;
+
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ __free_page(page);
+ goto loop;
}
+
+out:
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
@@ -1208,9 +1453,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
/**
* free_unnecessary_pages - Release preallocated pages not needed for the image
*/
-static void free_unnecessary_pages(void)
+static unsigned long free_unnecessary_pages(void)
{
- unsigned long save, to_free_normal, to_free_highmem;
+ unsigned long save, to_free_normal, to_free_highmem, free;
save = count_data_pages();
if (alloc_normal >= save) {
@@ -1231,6 +1476,7 @@ static void free_unnecessary_pages(void)
else
to_free_normal = 0;
}
+ free = to_free_normal + to_free_highmem;
memory_bm_position_reset(&copy_bm);
@@ -1254,6 +1500,8 @@ static void free_unnecessary_pages(void)
swsusp_unset_page_free(page);
__free_page(page);
}
+
+ return free;
}
/**
@@ -1313,11 +1561,11 @@ int hibernate_preallocate_memory(void)
struct zone *zone;
unsigned long saveable, size, max_size, count, highmem, pages = 0;
unsigned long alloc, save_highmem, pages_highmem, avail_normal;
- struct timeval start, stop;
+ ktime_t start, stop;
int error;
printk(KERN_INFO "PM: Preallocating image memory... ");
- do_gettimeofday(&start);
+ start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
if (error)
@@ -1443,12 +1691,12 @@ int hibernate_preallocate_memory(void)
* pages in memory, but we have allocated more. Release the excessive
* ones now.
*/
- free_unnecessary_pages();
+ pages -= free_unnecessary_pages();
out:
- do_gettimeofday(&stop);
+ stop = ktime_get();
printk(KERN_CONT "done (allocated %lu pages)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Allocated");
+ swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
@@ -2046,8 +2294,6 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static inline int get_safe_write_buffer(void) { return 0; }
-
static unsigned int
count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ed35a4790afe..8d7a1ef72758 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,25 +28,19 @@
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
+#include <linux/moduleparam.h>
#include "power.h"
-struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
- [PM_SUSPEND_STANDBY] = { .label = "standby", },
- [PM_SUSPEND_MEM] = { .label = "mem", },
-};
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
-
-static bool need_suspend_ops(suspend_state_t state)
-{
- return state > PM_SUSPEND_FREEZE;
-}
-
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
void freeze_set_ops(const struct platform_freeze_ops *ops)
{
@@ -57,22 +51,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
static void freeze_begin(void)
{
- suspend_freeze_wake = false;
+ suspend_freeze_state = FREEZE_STATE_NONE;
}
static void freeze_enter(void)
{
- cpuidle_use_deepest_state(true);
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
cpuidle_resume();
- wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
cpuidle_pause();
- cpuidle_use_deepest_state(false);
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
}
void freeze_wake(void)
{
- suspend_freeze_wake = true;
- wake_up(&suspend_freeze_wait_head);
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
}
EXPORT_SYMBOL_GPL(freeze_wake);
@@ -97,10 +118,7 @@ static bool relative_states;
static int __init sleep_states_setup(char *str)
{
relative_states = !strncmp(str, "1", 1);
- if (relative_states) {
- pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
- pm_states[PM_SUSPEND_FREEZE].state = 0;
- }
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
return 1;
}
@@ -113,20 +131,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
suspend_state_t i;
- int j = PM_SUSPEND_MAX - 1;
+ int j = 0;
lock_system_sleep();
suspend_ops = ops;
for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i))
- pm_states[j--].state = i;
- else if (!relative_states)
- pm_states[j--].state = 0;
+ if (valid_state(i)) {
+ pm_states[i] = pm_labels[j++];
+ } else if (!relative_states) {
+ pm_states[i] = NULL;
+ j++;
+ }
- pm_states[j--].state = PM_SUSPEND_FREEZE;
- while (j >= PM_SUSPEND_MIN)
- pm_states[j--].state = 0;
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
unlock_system_sleep();
}
@@ -145,12 +163,91 @@ int suspend_valid_only_mem(suspend_state_t state)
}
EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+static bool sleep_state_supported(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+}
+
+static int platform_suspend_prepare(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ suspend_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
+ freeze_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_noirq(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ suspend_ops->prepare_late() : 0;
+}
+
+static void platform_resume_noirq(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ suspend_ops->wake();
+}
+
+static void platform_resume_early(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
+ freeze_ops->restore();
+}
+
+static void platform_resume_finish(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ suspend_ops->finish();
+}
+
+static int platform_suspend_begin(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+ return freeze_ops->begin();
+ else if (suspend_ops->begin)
+ return suspend_ops->begin(state);
+ else
+ return 0;
+}
+
+static void platform_resume_end(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+ else if (suspend_ops->end)
+ suspend_ops->end();
+}
+
+static void platform_recover(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ suspend_ops->recover();
+}
+
+static bool platform_suspend_again(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ suspend_ops->suspend_again() : false;
+}
+
+#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from suspend test");
+#endif
+
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
@@ -168,7 +265,7 @@ static int suspend_prepare(suspend_state_t state)
{
int error;
- if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+ if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
@@ -214,23 +311,27 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (need_suspend_ops(state) && suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- goto Platform_finish;
- }
+ error = platform_suspend_prepare(state);
+ if (error)
+ goto Platform_finish;
- error = dpm_suspend_end(PMSG_SUSPEND);
+ error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
+ printk(KERN_ERR "PM: late suspend of devices failed\n");
goto Platform_finish;
}
+ error = platform_suspend_prepare_late(state);
+ if (error)
+ goto Devices_early_resume;
- if (need_suspend_ops(state) && suspend_ops->prepare_late) {
- error = suspend_ops->prepare_late();
- if (error)
- goto Platform_wake;
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ goto Platform_early_resume;
}
+ error = platform_suspend_prepare_noirq(state);
+ if (error)
+ goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -248,7 +349,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -275,18 +375,19 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
- ftrace_start();
Platform_wake:
- if (need_suspend_ops(state) && suspend_ops->wake)
- suspend_ops->wake();
+ platform_resume_noirq(state);
+ dpm_resume_noirq(PMSG_RESUME);
- dpm_resume_start(PMSG_RESUME);
+ Platform_early_resume:
+ platform_resume_early(state);
- Platform_finish:
- if (need_suspend_ops(state) && suspend_ops->finish)
- suspend_ops->finish();
+ Devices_early_resume:
+ dpm_resume_early(PMSG_RESUME);
+ Platform_finish:
+ platform_resume_finish(state);
return error;
}
@@ -299,18 +400,13 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (need_suspend_ops(state) && !suspend_ops)
+ if (!sleep_state_supported(state))
return -ENOSYS;
- if (need_suspend_ops(state) && suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
- error = freeze_ops->begin();
- if (error)
- goto Close;
- }
+ error = platform_suspend_begin(state);
+ if (error)
+ goto Close;
+
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
@@ -324,25 +420,22 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup && need_suspend_ops(state)
- && suspend_ops->suspend_again && suspend_ops->suspend_again());
+ } while (!error && !wakeup && platform_suspend_again(state));
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ trace_suspend_resume(TPS("resume_console"), state, true);
resume_console();
- Close:
- if (need_suspend_ops(state) && suspend_ops->end)
- suspend_ops->end();
- else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ trace_suspend_resume(TPS("resume_console"), state, false);
+ Close:
+ platform_resume_end(state);
return error;
Recover_platform:
- if (need_suspend_ops(state) && suspend_ops->recover)
- suspend_ops->recover();
+ platform_recover(state);
goto Resume_devices;
}
@@ -395,7 +488,7 @@ static int enter_state(suspend_state_t state)
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -404,7 +497,7 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
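The reworked suspend_enter() unwinds through a strictly ordered label ladder: each successful suspend step gains exactly one matching resume label, so a failure at any depth resumes only what was actually suspended. A compressed, runnable sketch of the idiom (step names are illustrative, not the kernel's):

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("suspend %s\n", name);
	return fail ? -1 : 0;
}

static int enter(void)
{
	int error;

	error = step("late", 0);
	if (error)
		goto Finish;
	error = step("noirq", 1);	/* simulate a failure here */
	if (error)
		goto Resume_early;

	/* ... the sleep state would be entered here ... */

	printf("resume noirq\n");
 Resume_early:
	printf("resume early\n");	/* undoes the "late" step */
 Finish:
	printf("finish\n");
	return error;
}

int main(void) { return enter() ? 1 : 0; }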
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
+static u32 test_repeat_count_max = 1;
+static u32 test_repeat_count_current;
void suspend_test_start(void)
{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
int status;
/* this may fail if the RTC hasn't been initialized */
+repeat:
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
@@ -92,18 +95,29 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
+ if (status < 0)
+ state = PM_SUSPEND_FREEZE;
}
+ if (state == PM_SUSPEND_FREEZE) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+
if (status < 0)
printk(err_suspend, status);
+ test_repeat_count_current++;
+ if (test_repeat_count_current < test_repeat_count_max)
+ goto repeat;
+
/* Some platforms can't detect that the alarm triggered the
 * wakeup, or (accordingly) disable it afterwards.
* It's supposed to give oneshot behavior; cope.
@@ -129,24 +143,36 @@ static int __init has_wakealarm(struct device *dev, const void *data)
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
- suspend_state_t i;
+ int i;
+ char *repeat;
+ char *suspend_type;
- /* "=mem" ==> "mem" */
+ /* example: "=mem[,N]" ==> "mem[,N]" */
value++;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (!strcmp(pm_states[i].label, value)) {
- test_state = pm_states[i].state;
+ suspend_type = strsep(&value, ",");
+ if (!suspend_type)
+ return 0;
+
+ repeat = strsep(&value, ",");
+ if (repeat) {
+ if (kstrtou32(repeat, 0, &test_repeat_count_max))
+ return 0;
+ }
+
+ for (i = 0; pm_labels[i]; i++)
+ if (!strcmp(pm_labels[i], suspend_type)) {
+ test_state_label = pm_labels[i];
return 0;
}
- printk(warn_bad_state, value);
+ printk(warn_bad_state, suspend_type);
return 0;
}
__setup("test_suspend", setup_test_suspend);
@@ -158,13 +184,21 @@ static int __init test_suspend(void)
struct rtc_device *rtc = NULL;
struct device *dev;
+ suspend_state_t test_state;
/* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!pm_states[test_state].state) {
- printk(warn_bad_state, pm_states[test_state].label);
- goto done;
+ if (!test_state_label)
+ return 0;
+
+ for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+ const char *state_label = pm_states[test_state];
+
+ if (state_label && !strcmp(test_state_label, state_label))
+ break;
+ }
+ if (test_state == PM_SUSPEND_MAX) {
+ printk(warn_bad_state, test_state_label);
+ return 0;
}
/* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +207,12 @@ static int __init test_suspend(void)
rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
- goto done;
+ return 0;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
-done:
return 0;
}
late_initcall(test_suspend);
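The reworked setup_test_suspend() accepts an optional repeat count, e.g. test_suspend=mem,3. A user-space sketch of the same strsep()-based parse (with strtoul() standing in for the kernel's kstrtou32()):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char arg[] = "=mem,3";                 /* as handed to __setup() */
	char *value = arg + 1;                 /* "=mem,3" ==> "mem,3" */
	char *type = strsep(&value, ",");      /* "mem" */
	char *repeat = strsep(&value, ",");    /* "3" or NULL */
	unsigned long count = repeat ? strtoul(repeat, NULL, 0) : 1;

	printf("state=%s repeats=%lu\n", type, count);
	return 0;
}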
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/crc32.h>
+#include <linux/ktime.h>
#include "power.h"
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
size_t off;
unsigned thr, run_threads, nr_threads;
unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
out_finish:
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
{
unsigned int m;
int ret = 0;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
struct bio *bio;
int err2;
unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
return ret;
}
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int eof = 0;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
unsigned nr_pages;
size_t off;
unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
wait_event(crc->done, atomic_read(&crc->stop));
atomic_set(&crc->stop, 0);
}
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret) {
printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
}
}
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
kthread_stop(data[thr].thr);
vfree(data);
}
- if (page) vfree(page);
+ vfree(page);
return ret;
}
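The swap.c changes trade struct timeval/do_gettimeofday() for the monotonic ktime API, which is immune to settimeofday() jumps and passes by value. A sketch of the measurement pattern, assuming swsusp_show_speed() now takes two ktime_t values as in this patch:

#include <linux/ktime.h>

/* measure a transfer and report elapsed time; ktime_ms_delta() is the
 * standard helper for "milliseconds between two ktime_t samples" */
static void timed_io(unsigned int nr_pages)
{
	ktime_t start, stop;
	s64 elapsed_ms;

	start = ktime_get();
	/* ... submit and wait for the I/O ... */
	stop = ktime_get();

	elapsed_ms = ktime_ms_delta(stop, start);
	pr_info("PM: %u pages in %lld ms\n", nr_pages, elapsed_ms);
}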
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index cbd69d842341..2ca4a8b5fe57 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -3,7 +3,7 @@
struct console_cmdline
{
- char name[8]; /* Name of the driver */
+ char name[16]; /* Name of the driver */
int index; /* Minor dev. to use */
char *options; /* Options for the driver */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..c099b082cd02 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,6 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -45,6 +44,8 @@
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
@@ -56,14 +57,11 @@
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
- DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
+ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
-/* Deferred messaged from sched code are marked by this special level */
-#define SCHED_MESSAGE_LOGLEVEL -2
-
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -113,9 +111,9 @@ static int __down_trylock_console_sem(unsigned long ip)
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
- * hold it are racing, but it helps tracking those weird code
- * path in the console code where we end up in places I want
- * locked without the console sempahore held
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held).
*/
static int console_locked, console_suspended;
@@ -146,8 +144,8 @@ static int console_may_schedule;
* the overall length of the record.
*
* The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these both entries are maintained when messages
- * are stored..
+ * sequence numbers of these entries are maintained when messages are
+ * stored.
*
* If the heads indicate available messages, the length in the header
 * tells the start of the next message. A length == 0 for the next message
@@ -257,7 +255,7 @@ static u64 clear_seq;
static u32 clear_idx;
#define PREFIX_MAX 32
-#define LOG_LINE_MAX 1024 - PREFIX_MAX
+#define LOG_LINE_MAX (1024 - PREFIX_MAX)
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -270,6 +268,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+ return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+ return log_buf_len;
+}
+
/* human readable text of the record */
static char *log_text(const struct printk_log *msg)
{
@@ -344,7 +354,7 @@ static int log_make_free_space(u32 msg_size)
while (log_first_seq < log_next_seq) {
if (logbuf_has_space(msg_size, false))
return 0;
- /* drop old messages until we have enough continuous space */
+ /* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
@@ -453,11 +463,7 @@ static int log_store(int facility, int level,
return msg->text_len;
}
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
+int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
{
@@ -471,7 +477,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-static int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, bool from_file)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -509,14 +515,13 @@ struct devkmsg_user {
char buf[8192];
};
-static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
- unsigned long count, loff_t pos)
+static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
- size_t len = iov_length(iv, count);
+ size_t len = iov_iter_count(from);
ssize_t ret = len;
if (len > LOG_LINE_MAX)
@@ -525,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
if (buf == NULL)
return -ENOMEM;
- line = buf;
- for (i = 0; i < count; i++) {
- if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
- ret = -EFAULT;
- goto out;
- }
- line += iv[i].iov_len;
+ buf[len] = '\0';
+ if (copy_from_iter(buf, len, from) != len) {
+ kfree(buf);
+ return -EFAULT;
}
/*
@@ -557,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
line = endp;
}
}
- line[len] = '\0';
printk_emit(facility, level, NULL, 0, "%s", line);
-out:
kfree(buf);
return ret;
}
@@ -792,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
const struct file_operations kmsg_fops = {
.open = devkmsg_open,
.read = devkmsg_read,
- .aio_write = devkmsg_writev,
+ .write_iter = devkmsg_write,
.llseek = devkmsg_llseek,
.poll = devkmsg_poll,
.release = devkmsg_release,
@@ -828,34 +828,80 @@ void log_buf_kexec_setup(void)
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
-/* save requested log_buf_len since it's too early to process it */
-static int __init log_buf_len_setup(char *str)
+/* we practice scaling the ring buffer by powers of 2 */
+static void __init log_buf_len_update(unsigned size)
{
- unsigned size = memparse(str, &str);
-
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
new_log_buf_len = size;
+}
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ log_buf_len_update(size);
return 0;
}
early_param("log_buf_len", log_buf_len_setup);
+#ifdef CONFIG_SMP
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
+
+static void __init log_buf_add_cpu(void)
+{
+ unsigned int cpu_extra;
+
+ /*
+ * archs should set up cpu_possible_bits properly with
+ * set_cpu_possible() after setup_arch() but just in
+ * case let's ensure this is valid.
+ */
+ if (num_possible_cpus() == 1)
+ return;
+
+ cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
+
+ /* by default this will only continue through for large systems (> 64 CPUs) */
+ if (cpu_extra <= __LOG_BUF_LEN / 2)
+ return;
+
+ pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
+ __LOG_CPU_MAX_BUF_LEN);
+ pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
+ cpu_extra);
+ pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
+
+ log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
+}
+#else /* !CONFIG_SMP */
+static inline void log_buf_add_cpu(void) {}
+#endif /* CONFIG_SMP */
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
int free;
+ if (log_buf != __log_buf)
+ return;
+
+ if (!early && !new_log_buf_len)
+ log_buf_add_cpu();
+
if (!new_log_buf_len)
return;
if (early) {
new_log_buf =
- memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
+ memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
} else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ LOG_ALIGN);
}
if (unlikely(!new_log_buf)) {
@@ -872,7 +918,7 @@ void __init setup_log_buf(int early)
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
}
@@ -881,7 +927,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
- ignore_loglevel = 1;
+ ignore_loglevel = true;
pr_info("debug: ignoring loglevel setting.\n");
return 0;
@@ -889,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
early_param("ignore_loglevel", ignore_loglevel_setup);
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
- "print all kernel messages to the console.");
+MODULE_PARM_DESC(ignore_loglevel,
+ "ignore loglevel setting (prints all kernel messages to the console)");
#ifdef CONFIG_BOOT_PRINTK_DELAY
@@ -947,11 +993,7 @@ static inline void boot_delay_msec(int level)
}
#endif
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time;
-#endif
+static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
static size_t print_time(u64 ts, char *buf)
@@ -1214,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
- static int saved_console_loglevel = -1;
+ static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
error = check_syslog_permissions(type, from_file);
@@ -1271,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
- if (saved_console_loglevel == -1)
+ if (saved_console_loglevel == LOGLEVEL_DEFAULT)
saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
/* Enable logging to console */
case SYSLOG_ACTION_CONSOLE_ON:
- if (saved_console_loglevel != -1) {
+ if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
console_loglevel = saved_console_loglevel;
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
}
break;
/* Set level of messages printed to console */
@@ -1291,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
error = 0;
break;
/* Number of chars in the log buffer */
@@ -1310,7 +1352,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_idx - syslog_idx;
+ error = log_next_seq - syslog_seq;
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
@@ -1377,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
}
/*
- * Zap console related locks when oopsing. Only zap at most once
- * every 10 seconds, to leave time for slow consoles to print a
- * full oops.
+ * Zap console related locks when oopsing.
+ * To leave time for slow consoles to print a full oops,
+ * only zap at most once every 30 seconds.
*/
static void zap_locks(void)
{
static unsigned long oops_timestamp;
if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
return;
oops_timestamp = jiffies;
@@ -1416,10 +1458,9 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1432,8 +1473,10 @@ static inline int can_use_console(unsigned int cpu)
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*/
-static int console_trylock_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(void)
{
+ unsigned int cpu = smp_processor_id();
+
if (!console_trylock())
return 0;
/*
@@ -1476,7 +1519,7 @@ static struct cont {
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
- u8 facility; /* log level of first message */
+ u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
@@ -1581,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
int printed_len = 0;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
- static volatile unsigned int logbuf_cpu = UINT_MAX;
+ static unsigned int logbuf_cpu = UINT_MAX;
- if (level == SCHED_MESSAGE_LOGLEVEL) {
- level = -1;
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
in_sched = true;
}
@@ -1608,7 +1651,8 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
- goto out_restore_irqs;
+ local_irq_restore(flags);
+ return 0;
}
zap_locks();
}
@@ -1617,27 +1661,22 @@ asmlinkage int vprintk_emit(int facility, int level,
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
- if (recursion_bug) {
+ if (unlikely(recursion_bug)) {
static const char recursion_msg[] =
"BUG: recent printk recursion!";
recursion_bug = 0;
- text_len = strlen(recursion_msg);
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg, text_len);
+ NULL, 0, recursion_msg,
+ strlen(recursion_msg));
}
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
*/
- if (in_sched)
- text_len = scnprintf(text, sizeof(textbuf),
- KERN_WARNING "[sched_delayed] ");
-
- text_len += vscnprintf(text + text_len,
- sizeof(textbuf) - text_len, fmt, args);
+ text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
/* mark and strip a trailing newline */
if (text_len && text[text_len-1] == '\n') {
@@ -1653,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *end_of_header = printk_skip_level(text);
switch (kern_level) {
case '0' ... '7':
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
+ /* fallthrough */
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
}
@@ -1668,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dict)
@@ -1716,28 +1756,37 @@ asmlinkage int vprintk_emit(int facility, int level,
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
+ lockdep_off();
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk(this_cpu))
+ if (console_trylock_for_printk())
console_unlock();
+ preempt_enable();
+ lockdep_on();
}
- lockdep_on();
-out_restore_irqs:
- local_irq_restore(flags);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, -1, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1756,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
}
EXPORT_SYMBOL(printk_emit);
+int vprintk_default(const char *fmt, va_list args)
+{
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+ return r;
+ }
+#endif
+ r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(vprintk_default);
+
+/*
+ * This allows printk to be diverted to another function per cpu.
+ * This is useful for calling printk functions from within NMI
+ * without worrying about race conditions that can lock up the
+ * box.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -1779,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
+ printk_func_t vprintk_func;
va_list args;
int r;
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
va_start(args, fmt);
- r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+
+ /*
+ * If a caller overrides the per_cpu printk_func, then it needs
+ * to disable preemption when calling printk(). Otherwise
+ * the printk_func should be set to the default. No need to
+ * disable preemption here.
+ */
+ vprintk_func = this_cpu_read(printk_func);
+ r = vprintk_func(fmt, args);
+
va_end(args);
return r;
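The per-CPU printk_func indirection introduced here is what lets NMI-safe code temporarily reroute printk() on one CPU. A sketch of how a diversion would look, assuming a hypothetical my_nmi_vprintk handler:

/* sketch, kernel context: divert this CPU's printk() to a custom
 * handler and restore it afterwards; my_nmi_vprintk is hypothetical */
printk_func_t saved = this_cpu_read(printk_func);

this_cpu_write(printk_func, my_nmi_vprintk);
/* printk() calls on this CPU now route through my_nmi_vprintk;
 * per the comment above, the caller must disable preemption for
 * the duration of the override */
this_cpu_write(printk_func, saved);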
@@ -1802,7 +1877,7 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
-#define LOG_LINE_MAX 0
+
static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
@@ -1825,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+/* Still needs to be defined for users */
+DEFINE_PER_CPU(printk_func_t, printk_func);
+
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
-void early_vprintk(const char *fmt, va_list ap)
-{
- if (early_console) {
- char buf[512];
- int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-
- early_console->write(early_console, buf, n);
- }
-}
-
asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
+ char buf[512];
+ int n;
+
+ if (!early_console)
+ return;
va_start(ap, fmt);
- early_vprintk(fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
+
+ early_console->write(early_console, buf, n);
}
#endif
@@ -1881,11 +1956,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
return 0;
}
/*
- * Set up a list of consoles. Called from init/main.c
+ * Set up a console. Called via do_early_param() in init/main.c
+ * for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
char *s, *options, *brl_options = NULL;
int idx;
@@ -1902,7 +1978,8 @@ static int __init console_setup(char *str)
strncpy(buf, str, sizeof(buf) - 1);
}
buf[sizeof(buf) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
+ options = strchr(str, ',');
+ if (options)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
@@ -1911,7 +1988,7 @@ static int __init console_setup(char *str)
strcpy(buf, "ttyS1");
#endif
for (s = buf; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
+ if (isdigit(*s) || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
@@ -1940,31 +2017,12 @@ int add_preferred_console(char *name, int idx, char *options)
return __add_preferred_console(name, idx, options, NULL);
}
-int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
-{
- struct console_cmdline *c;
- int i;
-
- for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
- i++, c++)
- if (strcmp(c->name, name) == 0 && c->index == idx) {
- strlcpy(c->name, name_new, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
- c->options = options;
- c->index = idx_new;
- return i;
- }
- /* not found */
- return -1;
-}
-
-bool console_suspend_enabled = 1;
+bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
{
- console_suspend_enabled = 0;
+ console_suspend_enabled = false;
return 1;
}
__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock);
/**
* console_trylock - try to lock the console system for exclusive use.
*
- * Tried to acquire a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that the caller has exclusive
+ * access to the console system and the console_drivers list.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2360,9 +2418,6 @@ void register_console(struct console *newcon)
if (preferred_console < 0 || bcon || !console_drivers)
preferred_console = selected_console;
- if (newcon->early_setup)
- newcon->early_setup();
-
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
@@ -2388,22 +2443,27 @@ void register_console(struct console *newcon)
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
i++, c++) {
- if (strcmp(c->name, newcon->name) != 0)
- continue;
- if (newcon->index >= 0 &&
- newcon->index != c->index)
- continue;
- if (newcon->index < 0)
- newcon->index = c->index;
+ if (!newcon->match ||
+ newcon->match(newcon, c->name, c->index, c->options) != 0) {
+ /* default matching */
+ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
+ if (strcmp(c->name, newcon->name) != 0)
+ continue;
+ if (newcon->index >= 0 &&
+ newcon->index != c->index)
+ continue;
+ if (newcon->index < 0)
+ newcon->index = c->index;
- if (_braille_register_console(newcon, c))
- return;
+ if (_braille_register_console(newcon, c))
+ return;
+
+ if (newcon->setup &&
+ newcon->setup(newcon, c->options) != 0)
+ break;
+ }
- if (newcon->setup &&
- newcon->setup(newcon, console_cmdline[i].options) != 0)
- break;
newcon->flags |= CON_ENABLED;
- newcon->index = c->index;
if (i == selected_console) {
newcon->flags |= CON_CONSDEV;
preferred_console = selected_console;
@@ -2570,7 +2630,7 @@ void wake_up_klogd(void)
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
@@ -2582,11 +2642,11 @@ int printk_deferred(const char *fmt, ...)
preempt_disable();
va_start(args, fmt);
- r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
return r;
@@ -2618,14 +2678,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
- if (*caller_jiffies == 0
- || !time_in_range(jiffies, *caller_jiffies,
- *caller_jiffies
- + msecs_to_jiffies(interval_msecs))) {
- *caller_jiffies = jiffies;
- return true;
- }
- return false;
+ unsigned long elapsed = jiffies - *caller_jiffies;
+
+ if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
+ return false;
+
+ *caller_jiffies = jiffies;
+ return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
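The rewritten printk_timed_ratelimit() leans on unsigned arithmetic: jiffies - *caller_jiffies yields the true elapsed ticks even across counter wrap, so the old time_in_range() dance is unnecessary. A user-space model of the same check:

#include <stdbool.h>
#include <stdio.h>

/* model: unsigned subtraction gives the true elapsed ticks even
 * when the counter has wrapped since the last call */
static bool ratelimit(unsigned long now, unsigned long *last,
		      unsigned long interval)
{
	unsigned long elapsed = now - *last;

	if (*last && elapsed <= interval)
		return false;	/* too soon: suppress */
	*last = now;
	return true;		/* allowed: record the timestamp */
}

int main(void)
{
	unsigned long last = 0;

	printf("%d %d\n", ratelimit(100, &last, 50),
	       ratelimit(120, &last, 50));	/* prints "1 0" */
	return 0;
}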
diff --git a/kernel/profile.c b/kernel/profile.c
index 54bf5ba26420..a7bcd28d6e9f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -422,8 +422,7 @@ void profile_tick(int type)
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
- seq_cpumask(m, prof_cpu_mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
return 0;
}
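The seq_cpumask()+seq_putc() pair collapses into one seq_printf() via the %*pb bitmap extension; cpumask_pr_args() expands to the nr_cpu_ids field width plus the mask bits. The call pattern, sketched for any cpumask (use %*pbl instead for a CPU-list format):

#include <linux/cpumask.h>
#include <linux/seq_file.h>

/* cpumask_pr_args(m) expands to: nr_cpu_ids, cpumask_bits(m) */
static int show_mask(struct seq_file *m, const struct cpumask *mask)
{
	seq_printf(m, "%*pb\n", cpumask_pr_args(mask));	/* hex bitmap */
	return 0;
}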
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..c8e0e050a36a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
#include <linux/compat.h>
-static int ptrace_trapping_sleep_fn(void *flags)
-{
- schedule();
- return 0;
-}
-
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
out:
if (!retval) {
wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ TASK_UNINTERRUPTIBLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
@@ -462,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
- bool dead = false;
-
if (!valid_signal(data))
return -EIO;
@@ -473,54 +465,38 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
write_lock_irq(&tasklist_lock);
/*
- * This child can be already killed. Make sure de_thread() or
- * our sub-thread doing do_wait() didn't do release_task() yet.
+ * We rely on ptrace_freeze_traced(). It can't be killed and
+ * untraced by another thread, it can't be a zombie.
*/
- if (child->ptrace) {
- child->exit_code = data;
- dead = __ptrace_detach(current, child);
- }
+ WARN_ON(!child->ptrace || child->exit_state);
+ /*
+ * tasklist_lock avoids the race with wait_task_stopped(), see
+ * the comment in ptrace_resume().
+ */
+ child->exit_code = data;
+ __ptrace_detach(current, child);
write_unlock_irq(&tasklist_lock);
proc_ptrace_connector(child, PTRACE_DETACH);
- if (unlikely(dead))
- release_task(child);
return 0;
}
/*
* Detach all tasks we were using ptrace on. Called with tasklist held
- * for writing, and returns with it held too. But note it can release
- * and reacquire the lock.
+ * for writing.
*/
-void exit_ptrace(struct task_struct *tracer)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
struct task_struct *p, *n;
- LIST_HEAD(ptrace_dead);
-
- if (likely(list_empty(&tracer->ptraced)))
- return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
if (__ptrace_detach(tracer, p))
- list_add(&p->ptrace_entry, &ptrace_dead);
+ list_add(&p->ptrace_entry, dead);
}
-
- write_unlock_irq(&tasklist_lock);
- BUG_ON(!list_empty(&tracer->ptraced));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
- }
-
- write_lock_irq(&tasklist_lock);
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
@@ -720,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
+ bool need_siglock;
+
if (!valid_signal(data))
return -EIO;
@@ -747,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request,
user_disable_single_step(child);
}
+ /*
+ * Change ->exit_code and ->state under siglock to avoid the race
+ * with wait_task_stopped() in between; a non-zero ->exit_code will
+ * wrongly look like another report from tracee.
+ *
+ * Note that we need siglock even if ->exit_code == data and/or this
+ * status was not reported yet, the new status must not be cleared by
+ * wait_task_stopped() after resume.
+ *
+ * If data == 0 we do not care if wait_task_stopped() reports the old
+ * status and clears the code too; this can't race with the tracee, it
+ * takes siglock after resume.
+ */
+ need_siglock = data && !thread_group_empty(current);
+ if (need_siglock)
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
wake_up_state(child, __TASK_TRACED);
+ if (need_siglock)
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
@@ -1100,7 +1096,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
}
#if defined CONFIG_COMPAT
-#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
const struct range *r2 = x2;
- s64 start1, start2;
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
+ if (r1->start < r2->start)
+ return -1;
+ if (r1->start > r2->start)
+ return 1;
+ return 0;
}
int clean_sort_range(struct range *range, int az)
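The old comparator returned start1 - start2, truncating an s64 difference into the comparator's int return value, so ranges more than 2^31 apart could compare with the wrong sign. The explicit three-way compare avoids that; a user-space demonstration:

#include <stdio.h>
#include <stdint.h>

static int cmp_bad(uint64_t a, uint64_t b)  { return (int)(int64_t)(a - b); }
static int cmp_good(uint64_t a, uint64_t b) { return a < b ? -1 : a > b ? 1 : 0; }

int main(void)
{
	uint64_t a = 0, b = 1ULL << 32;		/* differ by 2^32 */

	/* truncation keeps only the low 32 bits, which are zero here,
	 * so cmp_bad() wrongly reports the ranges as equal */
	printf("bad=%d good=%d\n", cmp_bad(a, b), cmp_good(a, b));
	return 0;	/* prints: bad=0 good=-1 */
}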
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,7 @@
-obj-y += update.o srcu.o
+obj-y += update.o
+obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
void kfree(const void *);
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
- return 1;
+ return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
rcu_lock_release(&rcu_callback_map);
- return 0;
+ return false;
}
}
@@ -131,4 +135,12 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+void rcu_early_boot_tests(void);
+
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
#endif /* __LINUX_RCU_H */
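__rcu_reclaim() distinguishes its two cases by how kfree_rcu() encodes the callback: instead of a real function pointer, ->func holds the small byte offset of the rcu_head within the enclosing object, so values below the first page mark "lazy" kfree requests. A user-space model of that encoding, assuming a 4096-byte threshold:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head { void (*func)(struct rcu_head *); };
struct obj { int payload; struct rcu_head rh; };

/* "pointers" below one page can't be code, so they double as offsets */
static int is_offset(unsigned long v) { return v < 4096; }

static void reclaim(struct rcu_head *head)
{
	unsigned long off = (unsigned long)head->func;

	if (is_offset(off))
		free((char *)head - off);	/* lazy: kfree_rcu() path */
	else
		head->func(head);		/* normal callback */
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	/* what kfree_rcu(o, rh) records: the offset of rh inside *o */
	o->rh.func = (void (*)(struct rcu_head *))offsetof(struct obj, rh);
	reclaim(&o->rh);
	printf("freed via offset-encoded callback\n");
	return 0;
}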
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 948a7693748e..8dbe27611ec3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -49,11 +49,19 @@
#include <linux/trace_clock.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include <linux/vmalloc.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+torture_param(int, cbflood_inter_holdoff, HZ,
+ "Holdoff between floods (jiffies)");
+torture_param(int, cbflood_intra_holdoff, 1,
+ "Holdoff between bursts (jiffies)");
+torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
+torture_param(int, cbflood_n_per_burst, 20000,
+ "# callbacks per burst in flood");
torture_param(int, fqs_duration, 0,
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
@@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
static int nrealreaders;
+static int ncbflooders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
+static struct task_struct **cbflood_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
@@ -138,6 +148,7 @@ static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes;
+static atomic_long_t n_cbfloods;
static struct list_head rcu_torture_removed;
static int rcu_torture_writer_state;
@@ -157,9 +168,9 @@ static int rcu_torture_writer_state;
#else
#define RCUTORTURE_RUNNABLE_INIT 0
#endif
-int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
-module_param(rcutorture_runnable, int, 0444);
-MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
+static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
@@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
-DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
static bool barrier_phase; /* Test phase. */
@@ -233,7 +244,8 @@ struct rcu_torture_ops {
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
- int (*completed)(void);
+ unsigned long (*started)(void);
+ unsigned long (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
@@ -242,7 +254,7 @@ struct rcu_torture_ops {
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
- void (*stats)(char *page);
+ void (*stats)(void);
int irq_capable;
int can_boost;
const char *name;
@@ -285,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
rcu_read_unlock();
}
-static int rcu_torture_completed(void)
-{
- return rcu_batches_completed();
-}
-
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -345,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
cur_ops->deferred_free(rp);
}
-static int rcu_no_completed(void)
+static unsigned long rcu_no_completed(void)
{
return 0;
}
@@ -366,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -396,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
rcu_read_unlock_bh();
}
-static int rcu_bh_torture_completed(void)
-{
- return rcu_batches_completed_bh();
-}
-
static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
{
call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -412,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
@@ -455,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .started = rcu_no_completed,
.completed = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -499,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
srcu_read_unlock(&srcu_ctl, idx);
}
-static int srcu_torture_completed(void)
+static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(&srcu_ctl);
}
@@ -525,21 +530,21 @@ static void srcu_torture_barrier(void)
srcu_barrier(&srcu_ctl);
}
-static void srcu_torture_stats(char *page)
+static void srcu_torture_stats(void)
{
int cpu;
int idx = srcu_ctl.completed & 0x1;
- page += sprintf(page, "%s%s per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
+ pr_alert("%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
long c0, c1;
c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
- page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
- sprintf(page, "\n");
+ pr_cont("\n");
}
static void srcu_torture_synchronize_expedited(void)
@@ -553,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .started = NULL,
.completed = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -589,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
@@ -601,6 +608,53 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks torture testing.
+ */
+
+static int tasks_torture_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_torture_read_unlock(int idx)
+{
+}
+
+static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops tasks_ops = {
+ .ttype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = tasks_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = tasks_torture_read_unlock,
+ .started = rcu_no_completed,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_tasks_torture_deferred_free,
+ .sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_tasks,
+ .call = call_rcu_tasks,
+ .cb_barrier = rcu_barrier_tasks,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "tasks"
+};
+
+#define RCUTORTURE_TASKS_OPS &tasks_ops,
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUTORTURE_TASKS_OPS
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -667,7 +721,7 @@ static int rcu_torture_boost(void *arg)
}
call_rcu_time = jiffies;
}
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
@@ -707,6 +761,59 @@ checkwait: stutter_wait("rcu_torture_boost");
return 0;
}
+static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
+{
+}
+
+/*
+ * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
+ * to call_rcu() or analogous, increasing the probability of occurrence
+ * of callback-overflow corner cases.
+ */
+static int
+rcu_torture_cbflood(void *arg)
+{
+ int err = 1;
+ int i;
+ int j;
+ struct rcu_head *rhp;
+
+ if (cbflood_n_per_burst > 0 &&
+ cbflood_inter_holdoff > 0 &&
+ cbflood_intra_holdoff > 0 &&
+ cur_ops->call &&
+ cur_ops->cb_barrier) {
+ rhp = vmalloc(sizeof(*rhp) *
+ cbflood_n_burst * cbflood_n_per_burst);
+ err = !rhp;
+ }
+ if (err) {
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
+ while (!torture_must_stop())
+ schedule_timeout_interruptible(HZ);
+ return 0;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
+ do {
+ schedule_timeout_interruptible(cbflood_inter_holdoff);
+ atomic_long_inc(&n_cbfloods);
+ WARN_ON(signal_pending(current));
+ for (i = 0; i < cbflood_n_burst; i++) {
+ for (j = 0; j < cbflood_n_per_burst; j++) {
+ cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
+ rcu_torture_cbflood_cb);
+ }
+ schedule_timeout_interruptible(cbflood_intra_holdoff);
+ WARN_ON(signal_pending(current));
+ }
+ cur_ops->cb_barrier();
+ stutter_wait("rcu_torture_cbflood");
+ } while (!torture_must_stop());
+ vfree(rhp);
+ torture_kthread_stopping("rcu_torture_cbflood");
+ return 0;
+}
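The flood allocates one flat vmalloc() array and addresses it with row-major [burst][callback] arithmetic, rhp[i * cbflood_n_per_burst + j], rather than an array of pointers. A small user-space illustration of that indexing:

#include <stdio.h>
#include <stdlib.h>

/* flat [burst][cb] indexing into a single allocation, as
 * rcu_torture_cbflood() does with its rcu_head array */
int main(void)
{
	int bursts = 3, per_burst = 4;
	int *a = malloc(sizeof(*a) * bursts * per_burst);

	for (int i = 0; i < bursts; i++)
		for (int j = 0; j < per_burst; j++)
			a[i * per_burst + j] = i * 100 + j;	/* row-major cell */

	printf("a[2][3] == %d\n", a[2 * per_burst + 3]);	/* 203 */
	free(a);
	return 0;
}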
+
/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
@@ -746,6 +853,8 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
+ bool can_expedite = !rcu_gp_is_expedited();
+ int expediting = 0;
unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
bool gp_sync1 = gp_sync;
@@ -758,9 +867,15 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ pr_alert("%s" TORTURE_FLAG
+ " Grace periods expedited from boot/sysfs for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " Testing of dynamic grace-period expediting diabled.\n",
+ torture_type);
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
synctype[nsynctypes++] = RTWS_COND_GET;
@@ -842,9 +957,26 @@ rcu_torture_writer(void *arg)
}
}
rcutorture_record_progress(++rcu_torture_current_version);
+ /* Cycle through nesting levels of rcu_expedite_gp() calls. */
+ if (can_expedite &&
+ !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
+ WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
+ if (expediting >= 0)
+ rcu_expedite_gp();
+ else
+ rcu_unexpedite_gp();
+ if (++expediting > 3)
+ expediting = -expediting;
+ }
rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
} while (!torture_must_stop());
+ /* Reset expediting back to unexpedited. */
+ if (expediting > 0)
+ expediting = -expediting;
+ while (can_expedite && expediting++ < 0)
+ rcu_unexpedite_gp();
+ WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -905,8 +1037,8 @@ static void rcutorture_trace_dump(void)
static void rcu_torture_timer(unsigned long unused)
{
int idx;
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
static DEFINE_TORTURE_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
@@ -914,7 +1046,10 @@ static void rcu_torture_timer(unsigned long unused)
unsigned long long ts;
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -937,14 +1072,16 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
- completed, completed_end);
+ started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -963,8 +1100,8 @@ static void rcu_torture_timer(unsigned long unused)
static int
rcu_torture_reader(void *arg)
{
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
int idx;
DEFINE_TORTURE_RANDOM(rand);
struct rcu_torture *p;
@@ -983,7 +1120,10 @@ rcu_torture_reader(void *arg)
mod_timer(&t, jiffies + 1);
}
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -1004,14 +1144,16 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, completed, completed_end);
+ ts, started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1019,7 +1161,7 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- cond_resched();
+ cond_resched_rcu_qs();
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
@@ -1031,10 +1173,15 @@ rcu_torture_reader(void *arg)
}
/*
- * Create an RCU-torture statistics message in the specified buffer.
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when the
+ * rcu_torture_stats kthread is not running).
*/
static void
-rcu_torture_printk(char *page)
+rcu_torture_stats_print(void)
{
int cpu;
int i;
@@ -1052,55 +1199,61 @@ rcu_torture_printk(char *page)
if (pipesummary[i] != 0)
break;
}
- page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page,
- "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
- rcu_torture_current,
- rcu_torture_current_version,
- list_empty(&rcu_torture_freelist),
- atomic_read(&n_rcu_torture_alloc),
- atomic_read(&n_rcu_torture_alloc_fail),
- atomic_read(&n_rcu_torture_free));
- page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
- atomic_read(&n_rcu_torture_mberror),
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
- page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
- n_rcu_torture_boost_failure,
- n_rcu_torture_boosts,
- n_rcu_torture_timers);
- page = torture_onoff_stats(page);
- page += sprintf(page, "barrier: %ld/%ld:%ld",
- n_barrier_successes,
- n_barrier_attempts,
- n_rcu_torture_barrier_error);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free));
+ pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror);
+ pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
+ n_rcu_torture_timers);
+ torture_onoff_stats();
+ pr_cont("barrier: %ld/%ld:%ld ",
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
n_rcu_torture_boost_failure != 0 ||
i > 1) {
- page += sprintf(page, "!!! ");
+ pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
}
- page += sprintf(page, "Reader Pipe: ");
+ pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- page += sprintf(page, " %ld", pipesummary[i]);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page, "Reader Batch: ");
+ pr_cont(" %ld", pipesummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Reader Batch: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- page += sprintf(page, " %ld", batchsummary[i]);
- page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
- page += sprintf(page, "Free-Block Circulation: ");
+ pr_cont(" %ld", batchsummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Free-Block Circulation: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- page += sprintf(page, " %d",
- atomic_read(&rcu_torture_wcount[i]));
+ pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
}
- page += sprintf(page, "\n");
+ pr_cont("\n");
+
if (cur_ops->stats)
- cur_ops->stats(page);
+ cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
rcu_torture_current != NULL) {
int __maybe_unused flags;
@@ -1109,10 +1262,9 @@ rcu_torture_printk(char *page)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- page += sprintf(page,
- "??? Writer stall state %d g%lu c%lu f%#x\n",
- rcu_torture_writer_state,
- gpnum, completed, flags);
+ pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
show_rcu_gp_kthreads();
rcutorture_trace_dump();
}
@@ -1120,30 +1272,6 @@ rcu_torture_printk(char *page)
}
/*
- * Print torture statistics. Caller must ensure that there is only
- * one call to this function at a given time!!! This is normally
- * accomplished by relying on the module system to only have one copy
- * of the module loaded, and then by giving the rcu_torture_stats
- * kthread full control (or the init/cleanup functions when rcu_torture_stats
- * thread is not running).
- */
-static void
-rcu_torture_stats_print(void)
-{
- int size = nr_cpu_ids * 200 + 8192;
- char *buf;
-
- buf = kmalloc(size, GFP_KERNEL);
- if (!buf) {
- pr_err("rcu-torture: Out of memory, need: %d", size);
- return;
- }
- rcu_torture_printk(buf);
- pr_alert("%s", buf);
- kfree(buf);
-}
-
-/*
* Periodically prints torture statistics, if periodic statistics printing
* was specified via the stat_interval module parameter.
*/
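The statistics rework above replaces sprintf()-into-a-kmalloc'd-buffer with direct pr_alert()/pr_cont() output, eliminating the allocation and the buffer-size guesswork: pr_alert() opens a new prefixed log line and pr_cont() appends to it until a newline terminates the record. A rough userspace model of the idiom (the log_* macros are stand-ins, not the kernel API):

#include <stdio.h>

/* Stand-ins for pr_alert()/pr_cont(): one starts a prefixed line,
 * the other continues it until "\n" closes the record. */
#define log_alert(fmt, ...) printf("ALERT: " fmt, ##__VA_ARGS__)
#define log_cont(fmt, ...)  printf(fmt, ##__VA_ARGS__)

int main(void)
{
	long pipesummary[4] = { 10, 2, 0, 0 };
	int i;

	log_alert("%s ", "rcu-torture:");
	log_cont("Reader Pipe:");
	for (i = 0; i < 4; i++)
		log_cont(" %ld", pipesummary[i]);
	log_cont("\n");		/* terminates the record */
	return 0;
}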
@@ -1295,7 +1423,8 @@ static int rcu_torture_barrier_cbs(void *arg)
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
- cur_ops->cb_barrier();
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
destroy_rcu_head_on_stack(&rcu);
torture_kthread_stopping("rcu_torture_barrier_cbs");
return 0;
@@ -1323,6 +1452,9 @@ static int rcu_torture_barrier(void *arg)
cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
n_rcu_torture_barrier_error++;
+ pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
+ atomic_read(&barrier_cbs_invoked),
+ n_barrier_cbs);
WARN_ON_ONCE(1);
}
n_barrier_successes++;
@@ -1418,7 +1550,7 @@ rcu_torture_cleanup(void)
int i;
rcutorture_record_test_transition();
- if (torture_cleanup()) {
+ if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
return;
@@ -1447,6 +1579,8 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
+ for (i = 0; i < ncbflooders; i++)
+ torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
unregister_cpu_notifier(&rcutorture_cpu_nb);
@@ -1468,6 +1602,7 @@ rcu_torture_cleanup(void)
"End of test: RCU_HOTPLUG");
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
+ torture_cleanup_end();
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -1534,9 +1669,10 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
+ RCUTORTURE_TASKS_OPS
};
- if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -1693,6 +1829,24 @@ rcu_torture_init(void)
goto unwind;
if (object_debug)
rcu_test_debug_objects();
+ if (cbflood_n_burst > 0) {
+ /* Create the cbflood threads */
+ ncbflooders = (num_online_cpus() + 3) / 4;
+ cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
+ GFP_KERNEL);
+ if (!cbflood_task) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < ncbflooders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_cbflood,
+ NULL,
+ cbflood_task[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ }
rcutorture_record_test_transition();
torture_init_end();
return 0;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..cad76e76b4e7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
preempt_enable();
return idx;
}
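The switch to __this_cpu_inc() above works because preemption is already disabled across the two increments, so no atomic read-modify-write is needed. A userspace model of the resulting fast path (hypothetical types, not the kernel implementation):

#include <stdio.h>

/* Model: pick the active counter pair by the low bit of ->completed,
 * then bump this CPU's counters with plain increments -- safe only
 * because preemption is disabled, which is exactly what lets
 * __this_cpu_inc() avoid atomics. */
struct srcu_cpu {
	unsigned long c[2];	/* readers per counter epoch */
	unsigned long seq[2];	/* total acquisitions per epoch */
};

static int model_srcu_read_lock(struct srcu_cpu *my, unsigned long completed)
{
	int idx = completed & 0x1;

	my->c[idx]++;	/* was: ACCESS_ONCE(...->c[idx]) += 1 */
	my->seq[idx]++;	/* was: ACCESS_ONCE(...->seq[idx]) += 1 */
	return idx;	/* reader hands idx back at unlock time */
}

int main(void)
{
	struct srcu_cpu cpu0 = { { 0, 0 }, { 0, 0 } };
	int idx = model_srcu_read_lock(&cpu0, 7);	/* epoch 1 active */

	printf("idx=%d c[1]=%lu\n", idx, cpu0.c[1]);	/* idx=1 c[1]=1 */
	return 0;
}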
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
}
EXPORT_SYMBOL_GPL(call_srcu);
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_srcu() instance now that a
- * grace period has elapsed.
- */
-static void wakeme_after_rcu(struct rcu_head *head)
-{
- struct rcu_synchronize *rcu;
-
- rcu = container_of(head, struct rcu_synchronize, head);
- complete(&rcu->completion);
-}
-
static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
static void srcu_reschedule(struct srcu_struct *sp);
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_expedited
+ __synchronize_srcu(sp, rcu_gp_is_expedited()
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
@@ -546,7 +529,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
{
return sp->completed;
}
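Returning unsigned long matters because batch counters wrap: callers compare them with the kernel's modular ULONG_CMP_*() helpers rather than signed comparisons. A small demonstration, with ULONG_CMP_GE() as defined in include/linux/rcupdate.h of this era:

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running unsigned counters. */
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long snap = ULONG_MAX - 1;	/* about to wrap */
	unsigned long cur = snap + 3;		/* wrapped to 1 */

	/* A plain "cur >= snap" says 0; the modular test says 1. */
	printf("%d\n", ULONG_CMP_GE(cur, snap));
	return 0;
}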
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..069742d61c68 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-
#include "tiny_plugin.h"
-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
- if (newval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("--="),
- rcu_dynticks_nesting, newval));
- rcu_dynticks_nesting = newval;
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
- rcu_dynticks_nesting, newval));
- if (!is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
- rcu_dynticks_nesting, newval));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
- rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
- barrier();
- rcu_dynticks_nesting = newval;
-}
-
/*
* Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
- if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE)
- newval = 0;
- else
- newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- newval = rcu_dynticks_nesting - 1;
- WARN_ON_ONCE(newval < 0);
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_exit);
-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
- if (oldval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("++="),
- oldval, rcu_dynticks_nesting));
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
- if (!is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
- oldval, rcu_dynticks_nesting));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
/*
* Exit idle, so that we are no longer in an extended quiescent state.
*/
void rcu_idle_exit(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- WARN_ON_ONCE(rcu_dynticks_nesting < 0);
- if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
- rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- else
- rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting++;
- WARN_ON_ONCE(rcu_dynticks_nesting == 0);
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_enter);
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
*/
bool notrace __rcu_is_watching(void)
{
- return rcu_dynticks_nesting;
+ return true;
}
EXPORT_SYMBOL(__rcu_is_watching);
#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
/*
- * Test whether the current CPU was interrupted from idle. Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
- return rcu_dynticks_nesting <= 1;
-}
-
-/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
@@ -203,8 +103,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
RCU_TRACE(reset_cpu_stall_ticks(rcp));
- if (rcp->rcucblist != NULL &&
- rcp->donetail != rcp->curtail) {
+ if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
}
@@ -217,7 +116,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
*/
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
{
unsigned long flags;
@@ -231,7 +130,7 @@ void rcu_sched_qs(int cpu)
/*
* Record an rcu_bh quiescent state.
*/
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
{
unsigned long flags;
@@ -247,13 +146,15 @@ void rcu_bh_qs(int cpu)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
- if (user || rcu_is_cpu_rrupt_from_idle())
- rcu_sched_qs(cpu);
+ if (user)
+ rcu_sched_qs();
else if (!in_softirq())
- rcu_bh_qs(cpu);
+ rcu_bh_qs();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
}
/*
@@ -267,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
unsigned long flags;
RCU_TRACE(int cb_count = 0);
- /* If no RCU callbacks ready to invoke, just return. */
- if (&rcp->rcucblist == rcp->donetail) {
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
- !!ACCESS_ONCE(rcp->rcucblist),
- need_resched(),
- is_idle_task(current),
- false));
- return;
- }
-
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
@@ -355,6 +245,11 @@ static void __call_rcu(struct rcu_head *head,
rcp->curtail = &head->next;
RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_sched_qs() */
+ resched_cpu(0);
+ }
}
/*
@@ -378,7 +273,11 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_init(void)
+void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
+
+ rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
rcp->ticks_this_gp++;
j = jiffies;
js = ACCESS_ONCE(rcp->jiffies_stall);
- if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- }
- if (*rcp->curtail && ULONG_CMP_GE(j, js))
ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
- else if (ULONG_CMP_GE(j, js))
+ } else if (ULONG_CMP_GE(j, js)) {
ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ }
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 625d0b0cd75a..233165da782f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,11 +79,22 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
* the tracing userspace tools to be able to decipher the string
* address to the matching string.
*/
-#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+#ifdef CONFIG_TRACING
+# define DEFINE_RCU_TPS(sname) \
static char sname##_varname[] = #sname; \
-static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
+static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
+# define RCU_STATE_NAME(sname) sname##_varname
+#else
+# define DEFINE_RCU_TPS(sname)
+# define RCU_STATE_NAME(sname) __stringify(sname)
+#endif
+
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+DEFINE_RCU_TPS(sname) \
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
+ .rda = &sname##_data, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
@@ -92,11 +103,9 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
- .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
- .name = sname##_varname, \
+ .name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
-}; \
-DEFINE_PER_CPU(struct rcu_data, sname##_data)
+}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
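When CONFIG_TRACING is off there is no __tracepoint_string section, so RCU_STATE_NAME() above falls back to plain stringification. The two-level macro below is the standard kernel __stringify() (include/linux/stringify.h); the extra indirection lets macro arguments expand before '#' is applied:

#include <stdio.h>

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)
#define RCU_STATE_NAME(sname)	__stringify(sname)

int main(void)
{
	printf("%s\n", RCU_STATE_NAME(rcu_sched));	/* "rcu_sched" */
	return 0;
}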
@@ -143,23 +152,22 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
+/* Delay in jiffies for grace-period initialization delays. */
+static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
+ ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
+ : 0;
+module_param(gp_init_delay, int, 0644);
+
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return ACCESS_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -188,22 +207,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
* one since the start of the grace period, this just sets a flag.
* The caller must have disabled preemption.
*/
-void rcu_sched_qs(int cpu)
+void rcu_sched_qs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
+ if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+ }
}
-void rcu_bh_qs(int cpu)
+void rcu_bh_qs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
+ if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_bh"),
+ __this_cpu_read(rcu_bh_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+ }
}
static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
@@ -217,6 +238,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+
/*
* Let the RCU core know that this CPU has gone through the scheduler,
* which is a quiescent state. This is called when the need for a
@@ -275,17 +299,33 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled preemption.
*/
-void rcu_note_context_switch(int cpu)
+void rcu_note_context_switch(void)
{
trace_rcu_utilization(TPS("Start context switch"));
- rcu_sched_qs(cpu);
- rcu_preempt_note_context_switch(cpu);
+ rcu_sched_qs();
+ rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+/*
+ * Register a quiescent state for all RCU flavors. If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desperate need of a quiescent state, which will normally
+ * be none of them). Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
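The per-CPU rcu_qs_ctr introduced here gives the grace-period machinery a second, cheaper signal: it snapshots the counter at grace-period start (->rcu_qs_ctr_snap, set later in this patch) and treats any subsequent change as proof of a quiescent state. A userspace model of the snapshot-and-compare idiom (illustrative only):

#include <stdio.h>

static unsigned long rcu_qs_ctr;	/* per-CPU in the kernel */
static unsigned long rcu_qs_ctr_snap;	/* taken at grace-period start */

static void model_rcu_all_qs(void)
{
	rcu_qs_ctr++;			/* this_cpu_inc(rcu_qs_ctr) */
}

static int model_cpu_passed_qs(void)
{
	return rcu_qs_ctr != rcu_qs_ctr_snap;
}

int main(void)
{
	rcu_qs_ctr_snap = rcu_qs_ctr;		/* GP start */
	printf("%d\n", model_cpu_passed_qs());	/* 0: no QS yet */
	model_rcu_all_qs();			/* e.g. from cond_resched_rcu_qs() */
	printf("%d\n", model_cpu_passed_qs());	/* 1: QS observed */
	return 0;
}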
+
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -314,21 +354,57 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
-static int rcu_pending(int cpu);
+static int rcu_pending(void);
+
+/*
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+ return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+ return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
/*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches started thus far for debug & stats.
*/
-long rcu_batches_completed_sched(void)
+unsigned long rcu_batches_started_bh(void)
+{
+ return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+ return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
{
return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
/*
- * Return the number of RCU BH batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches completed thus far for debug & stats.
*/
-long rcu_batches_completed_bh(void)
+unsigned long rcu_batches_completed_bh(void)
{
return rcu_bh_state.completed;
}
@@ -353,6 +429,15 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -426,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum)
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
/*
- * Force a quiescent state for RCU-sched.
- */
-void rcu_sched_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_sched_state);
-}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
-
-/*
* Does the CPU have callbacks ready to be invoked?
*/
static int
@@ -499,11 +575,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
* we really have entered idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
- bool user)
+static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
@@ -520,12 +596,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
- rcu_prepare_for_idle(smp_processor_id());
+ rcu_prepare_for_idle();
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_task_enter();
/*
* It is illegal to enter an extended quiescent state while
@@ -553,7 +630,7 @@ static void rcu_eqs_enter(bool user)
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(rdtp, oldval, user);
+ rcu_eqs_enter_common(oldval, user);
} else {
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
@@ -577,7 +654,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -627,8 +704,8 @@ void rcu_irq_exit(void)
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_enter_common(rdtp, oldval, true);
- rcu_sysidle_enter(rdtp, 1);
+ rcu_eqs_enter_common(oldval, true);
+ rcu_sysidle_enter(1);
local_irq_restore(flags);
}
@@ -639,15 +716,17 @@ void rcu_irq_exit(void)
* we really have exited idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
- int user)
+static void rcu_eqs_exit_common(long long oldval, int user)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ rcu_dynticks_task_exit();
smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle(smp_processor_id());
+ rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
@@ -678,7 +757,7 @@ static void rcu_eqs_exit(bool user)
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(rdtp, oldval, user);
+ rcu_eqs_exit_common(oldval, user);
}
}
@@ -699,7 +778,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -750,47 +829,79 @@ void rcu_irq_enter(void)
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_exit_common(rdtp, oldval, true);
- rcu_sysidle_exit(rdtp, 1);
+ rcu_eqs_exit_common(oldval, true);
+ rcu_sysidle_exit(1);
local_irq_restore(flags);
}
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
*/
void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int incby = 2;
- if (rdtp->dynticks_nmi_nesting == 0 &&
- (atomic_read(&rdtp->dynticks) & 0x1))
- return;
- rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ /* Complain about underflow. */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+ smp_mb__before_atomic(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* atomic_inc() before later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ incby = 1;
+ }
+ rdtp->dynticks_nmi_nesting += incby;
+ barrier();
}
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
*/
void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- if (rdtp->dynticks_nmi_nesting == 0 ||
- --rdtp->dynticks_nmi_nesting != 0)
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ rdtp->dynticks_nmi_nesting -= 2;
return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ rdtp->dynticks_nmi_nesting = 0;
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
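The asymmetric counting above (add 1 for the outermost NMI over an RCU-idle period, 2 for every nested NMI, subtract 2 on each non-final exit) is what makes "nesting == 1" uniquely identify the handler that must restore idleness. A userspace model of the bookkeeping (illustrative, not the kernel code):

#include <stdio.h>

static int dynticks_nmi_nesting;
static int rcu_idle = 1;	/* stand-in for !(dynticks & 0x1) */

static void model_nmi_enter(void)
{
	int incby = 2;

	if (rcu_idle) {		/* outermost NMI over an idle period */
		rcu_idle = 0;	/* atomic_inc(&rdtp->dynticks) */
		incby = 1;
	}
	dynticks_nmi_nesting += incby;
}

static void model_nmi_exit(void)
{
	if (dynticks_nmi_nesting != 1) {	/* still nested */
		dynticks_nmi_nesting -= 2;
		return;
	}
	dynticks_nmi_nesting = 0;		/* outermost: back to idle */
	rcu_idle = 1;
}

int main(void)
{
	model_nmi_enter();	/* nesting = 1 */
	model_nmi_enter();	/* nested NMI: nesting = 3 */
	model_nmi_exit();	/* nesting = 1 */
	model_nmi_exit();	/* nesting = 0, idle again */
	printf("%d %d\n", dynticks_nmi_nesting, rcu_idle);	/* 0 1 */
	return 0;
}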
@@ -819,7 +930,7 @@ bool notrace __rcu_is_watching(void)
*/
bool notrace rcu_is_watching(void)
{
- int ret;
+ bool ret;
preempt_disable();
ret = __rcu_is_watching();
@@ -862,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable();
rdp = this_cpu_ptr(&rcu_sched_data);
rnp = rdp->mynode;
- ret = (rdp->grpmask & rnp->qsmaskinit) ||
+ ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
!rcu_scheduler_fully_active;
preempt_enable();
return ret;
@@ -897,17 +1008,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
+ if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ rdp->mynode->gpnum))
+ ACCESS_ONCE(rdp->gpwrap) = true;
return 0;
}
}
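The ULONG_MAX/4 slack in the new test flags a CPU that stayed idle for so many grace periods that its cached ->gpnum can no longer be compared reliably against the rcu_node structure's; setting ->gpwrap then forces resynchronization. A demonstration with the kernel's ULONG_CMP_LT():

#include <limits.h>
#include <stdio.h>

/* Modular "a < b", as in include/linux/rcupdate.h. */
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long rdp_gpnum = 100;
	/* CPU slept while more than ULONG_MAX/4 grace periods elapsed... */
	unsigned long rnp_gpnum = rdp_gpnum + ULONG_MAX / 4 + 1;

	/* ...so its snapshot is no longer trustworthy: set ->gpwrap. */
	printf("gpwrap = %d\n",
	       ULONG_CMP_LT(rdp_gpnum + ULONG_MAX / 4, rnp_gpnum));	/* 1 */
	return 0;
}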
/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through a dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1010,13 +1118,26 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
j1 = rcu_jiffies_till_stall_check();
ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
+ rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
}
/*
- * Dump stacks of all tasks running on stalled CPUs. This is a fallback
- * for architectures that do not implement trigger_all_cpu_backtrace().
- * The NMI-triggered stack traces are more accurate because they are
- * printed by the target CPU.
+ * Complain about starvation of grace-period kthread.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+ unsigned long gpa;
+ unsigned long j;
+
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ if (j - gpa > 2 * HZ)
+ pr_err("%s kthread starved for %ld jiffies!\n",
+ rsp->name, j - gpa);
+}
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1035,11 +1156,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
long delta;
unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1077,30 +1200,35 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- /*
- * Now rat on any tasks that got kicked up to the root rcu_node
- * due to CPU offlining.
- */
- rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected += rcu_print_task_stall(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (ndetected == 0)
- pr_err("INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
+ if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+ } else {
+ if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+ ACCESS_ONCE(rsp->completed) == gpnum) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rsp->name, j - gpa, j, gpa,
+ jiffies_till_next_fqs,
+ rcu_get_root(rsp)->qsmask);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
/* Complain about tasks blocking the grace period. */
-
rcu_print_detail_task_stall(rsp);
+ rcu_check_gp_kthread_starvation(rsp);
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1125,8 +1253,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
+
+ rcu_check_gp_kthread_starvation(rsp);
+
+ rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1196,7 +1326,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
/* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(rsp);
+ print_other_cpu_stall(rsp, gpnum);
}
}
@@ -1218,20 +1348,30 @@ void rcu_cpu_stall_reset(void)
}
/*
- * Initialize the specified rcu_data structure's callback list to empty.
+ * Initialize the specified rcu_data structure's default callback list
+ * to empty. The default callback list is the one that is not used by
+ * no-callbacks CPUs.
*/
-static void init_callback_list(struct rcu_data *rdp)
+static void init_default_callback_list(struct rcu_data *rdp)
{
int i;
- if (init_nocb_callback_list(rdp))
- return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
}
/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ if (init_nocb_callback_list(rdp))
+ return;
+ init_default_callback_list(rdp);
+}
+
+/*
* Determine the value that ->completed will have at the end of the
* next subsequent grace period. This is used to tag callbacks so that
* a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1305,10 +1445,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
- * need to explicitly start one.
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1527,7 +1673,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
/* Handle the ends of any preceding grace periods first. */
- if (rdp->completed == rnp->completed) {
+ if (rdp->completed == rnp->completed &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1542,7 +1689,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1551,8 +1698,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
+ ACCESS_ONCE(rdp->gpwrap) = false;
}
return ret;
}
@@ -1566,7 +1715,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+ rdp->completed == ACCESS_ONCE(rnp->completed) &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1583,10 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static int rcu_gp_init(struct rcu_state *rsp)
{
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- rcu_bind_gp_kthread();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1612,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
- /* Exclude any concurrent CPU-hotplug operations. */
- mutex_lock(&rsp->onoff_mutex);
- smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
+ /*
+ * Apply per-leaf buffered online and offline operations to the
+ * rcu_node tree. Note that this new grace period need not wait
+ * for subsequent online CPUs, and that quiescent-state forcing
+ * will handle subsequent offline CPUs.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+ !rnp->wait_blkd_tasks) {
+ /* Nothing to do on this leaf rcu_node structure. */
+ raw_spin_unlock_irq(&rnp->lock);
+ continue;
+ }
+
+ /* Record old state, apply changes to ->qsmaskinit field. */
+ oldmask = rnp->qsmaskinit;
+ rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+ /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+ if (!oldmask != !rnp->qsmaskinit) {
+ if (!oldmask) /* First online CPU for this rcu_node. */
+ rcu_init_new_rnp(rnp);
+ else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+ rnp->wait_blkd_tasks = true;
+ else /* Last offline CPU and can propagate. */
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ /*
+ * If all waited-on tasks from prior grace period are
+ * done, and if all this rcu_node structure's CPUs are
+ * still offline, propagate up the rcu_node tree and
+ * clear ->wait_blkd_tasks. Otherwise, if one of this
+ * rcu_node structure's CPUs has since come back online,
+ * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+ * checks for this, so just call it unconditionally).
+ */
+ if (rnp->wait_blkd_tasks &&
+ (!rcu_preempt_has_tasks(rnp) ||
+ rnp->qsmaskinit)) {
+ rnp->wait_blkd_tasks = false;
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ raw_spin_unlock_irq(&rnp->lock);
+ }
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1636,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
- WARN_ON_ONCE(rnp->completed != rsp->completed);
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ if (WARN_ON_ONCE(rnp->completed != rsp->completed))
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
(void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1645,15 +1841,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
-#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
- system_state == SYSTEM_RUNNING)
- udelay(200);
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
- cond_resched();
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
+ gp_init_delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * 10)))
+ schedule_timeout_uninterruptible(gp_init_delay);
}
- mutex_unlock(&rsp->onoff_mutex);
return 1;
}
@@ -1667,11 +1862,12 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
if (is_sysidle_rcu_state(rsp)) {
- isidle = 1;
+ isidle = true;
maxj = jiffies - ULONG_MAX / 4;
}
force_qs_rnp(rsp, dyntick_save_progress_counter,
@@ -1680,14 +1876,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = 0;
+ isidle = true;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1704,6 +1901,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1732,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ WARN_ON_ONCE(rnp->qsmask);
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
@@ -1739,7 +1939,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
- cond_resched();
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1774,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
struct rcu_state *rsp = arg;
struct rcu_node *rnp = rcu_get_root(rsp);
+ rcu_bind_gp_kthread();
for (;;) {
/* Handle grace-period start. */
@@ -1788,8 +1990,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched();
- flush_signals(current);
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("reqwaitsig"));
@@ -1831,11 +2034,13 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched();
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
} else {
/* Deal with stray signal. */
- cond_resched();
- flush_signals(current);
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
@@ -1931,32 +2136,39 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
* Similar to rcu_report_qs_rdp(), for which it is a helper function.
* Allows quiescent states for a group of CPUs to be reported at one go
* to the specified rcu_node structure, though all the CPUs in the group
- * must be represented by the same rcu_node structure (which need not be
- * a leaf rcu_node structure, though it often will be). That structure's
- * lock must be held upon entry, and it is released before return.
+ * must be represented by the same rcu_node structure (which need not be a
+ * leaf rcu_node structure, though it often will be). The gps parameter
+ * is the grace-period snapshot, which means that the quiescent states
+ * are valid only if rnp->gpnum is equal to gps. That structure's lock
+ * must be held upon entry, and it is released before return.
*/
static void
rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long flags)
+ struct rcu_node *rnp, unsigned long gps, unsigned long flags)
__releases(rnp->lock)
{
+ unsigned long oldmask = 0;
struct rcu_node *rnp_c;
/* Walk up the rcu_node hierarchy. */
for (;;) {
- if (!(rnp->qsmask & mask)) {
+ if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
- /* Our bit has already been cleared, so done. */
+ /*
+ * Our bit has already been cleared, or the
+ * relevant grace period is already over, so done.
+ */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
+ WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -1980,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
rnp = rnp->parent;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rnp_c->qsmask);
+ oldmask = rnp_c->qsmask;
}
/*
@@ -1992,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
}
/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period. The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long gps;
+ unsigned long mask;
+ struct rcu_node *rnp_p;
+
+ if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+ rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return; /* Still need more quiescent states! */
+ }
+
+ rnp_p = rnp->parent;
+ if (rnp_p == NULL) {
+ /*
+ * Only one rcu_node structure in the tree, so don't
+ * try to report up to its nonexistent parent!
+ */
+ rcu_report_qs_rsp(rsp, flags);
+ return;
+ }
+
+ /* Report up the rest of the hierarchy, tracking current ->gpnum. */
+ gps = rnp->gpnum;
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+}
+
+/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
* structure. This must be either called from the specified CPU, or
* called when the specified CPU is known to be offline (and when it is
@@ -2011,8 +2263,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
- rnp->completed == rnp->gpnum) {
+ if ((rdp->passed_quiesce == 0 &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+ rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+ rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2021,6 +2275,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->passed_quiesce = 0; /* need qs for new gp. */
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -2036,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
*/
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ /* ^^^ Released rnp->lock */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -2065,7 +2321,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesce)
+ if (!rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
return;
/*
@@ -2128,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
- /* Finally, initialize the rcu_data structure's list to empty. */
+ /*
+ * Finally, initialize the rcu_data structure's list to empty and
+ * disallow further callbacks on this CPU.
+ */
init_callback_list(rdp);
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}
/*
@@ -2196,6 +2457,67 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
}
/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ rnp->qsmaskinit &= ~mask;
+ rnp->qsmask &= ~mask;
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+}
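A compact model of the upward walk just added: clear the departing leaf's bit in each ancestor's ->qsmaskinit and stop at the first ancestor that still has other children online. Types and names below are hypothetical stand-ins for the rcu_node tree:

#include <stdio.h>

struct node {
	unsigned long qsmaskinit;	/* one bit per child still online */
	unsigned long grpmask;		/* this node's bit in its parent */
	struct node *parent;
};

static void cleanup_dead_node(struct node *leaf)
{
	struct node *np = leaf;
	unsigned long mask;

	if (np->qsmaskinit)
		return;			/* leaf still has online CPUs */
	for (;;) {
		mask = np->grpmask;
		np = np->parent;
		if (!np)
			break;		/* cleared all the way to the root */
		np->qsmaskinit &= ~mask;
		if (np->qsmaskinit)
			return;		/* other children still online */
	}
}

int main(void)
{
	struct node root = { .qsmaskinit = 0x3 };
	struct node leaf0 = { .qsmaskinit = 0, .grpmask = 0x1,
			      .parent = &root };

	cleanup_dead_node(&leaf0);
	printf("root->qsmaskinit = %#lx\n", root.qsmaskinit);	/* 0x2 */
	return 0;
}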
+
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinitnext &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2205,64 +2527,21 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
- int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
-
- /* Exclude any attempts to start a new grace period. */
- mutex_lock(&rsp->onoff_mutex);
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
-
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
rcu_adopt_orphan_cbs(rsp, flags);
+ raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
- mask = rdp->grpmask; /* rnp->grplo is constant. */
- do {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rnp->qsmaskinit &= ~mask;
- if (rnp->qsmaskinit != 0) {
- if (rnp != rdp->mynode)
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- break;
- }
- if (rnp == rdp->mynode)
- need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
- else
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- mask = rnp->grpmask;
- rnp = rnp->parent;
- } while (rnp != NULL);
-
- /*
- * We still hold the leaf rcu_node structure lock here, and
- * irqs are still disabled. The reason for this subterfuge is
- * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
- * held leads to deadlock.
- */
- raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
- rnp = rdp->mynode;
- if (need_report & RCU_OFL_TASKS_NORM_GP)
- rcu_report_unblock_qs_rnp(rnp, flags);
- else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (need_report & RCU_OFL_TASKS_EXP_GP)
- rcu_report_exp_rnp(rsp, rnp, true);
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
- init_callback_list(rdp);
- /* Disallow further callbacks on this CPU. */
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
- mutex_unlock(&rsp->onoff_mutex);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2271,6 +2550,14 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
}
@@ -2347,7 +2634,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) -= count;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
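[Editor's note: this hunk's ACCESS_ONCE() rewrite (repeated below for ->qlen in __call_rcu(), ->gp_flags, and ->n_barrier_done) replaces compound read-modify-write expressions with a plain read feeding one marked store. The kernel's ACCESS_ONCE() is essentially a volatile cast, so the compound form would volatile-qualify both halves of the update; the split form keeps only the store marked, which suffices here because the field is modified only by its owning CPU. In sketch form:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	ACCESS_ONCE(q) -= n;	/* RMW: volatile load *and* volatile store */
	ACCESS_ONCE(q) = q - n;	/* split: plain load, one volatile store   */]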
@@ -2378,7 +2665,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
@@ -2396,8 +2683,8 @@ void rcu_check_callbacks(int cpu, int user)
* at least not while the corresponding CPU is online.
*/
- rcu_sched_qs(cpu);
- rcu_bh_qs(cpu);
+ rcu_sched_qs();
+ rcu_bh_qs();
} else if (!in_softirq()) {
@@ -2408,11 +2695,13 @@ void rcu_check_callbacks(int cpu, int user)
* critical section, so note it.
*/
- rcu_bh_qs(cpu);
+ rcu_bh_qs();
}
- rcu_preempt_check_callbacks(cpu);
- if (rcu_pending(cpu))
+ rcu_preempt_check_callbacks();
+ if (rcu_pending())
invoke_rcu_core();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2435,7 +2724,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched();
+ cond_resched_rcu_qs();
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -2444,32 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp,
return;
}
if (rnp->qsmask == 0) {
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- continue;
+ if (rcu_state_p == &rcu_sched_state ||
+ rsp != rcu_state_p ||
+ rcu_preempt_blocked_readers_cgp(rnp)) {
+ /*
+ * No point in scanning bits because they
+ * are all zero. But we might need to
+ * priority-boost blocked readers.
+ */
+ rcu_initiate_boost(rnp, flags);
+ /* rcu_initiate_boost() releases rnp->lock */
+ continue;
+ }
+ if (rnp->parent &&
+ (rnp->parent->qsmask & rnp->grpmask)) {
+ /*
+ * Race between grace-period
+ * initialization and task exiting RCU
+ * read-side critical section: Report.
+ */
+ rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+ /* rcu_report_unblock_qs_rnp() releases ->lock */
+ continue;
+ }
}
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) != 0)
- *isidle = 0;
+ if ((rnp->qsmaskinit & bit) == 0)
+ *isidle = false; /* Pending hotplug. */
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
}
if (mask != 0) {
-
- /* rcu_report_qs_rnp() releases rnp->lock. */
- rcu_report_qs_rnp(mask, rsp, rnp, flags);
- continue;
+ /* Idle/offline CPUs, report (releases rnp->lock). */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ } else {
+ /* Nothing to do here, so just drop the lock. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
- rnp = rcu_get_root(rsp);
- if (rnp->qsmask == 0) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
}
}
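[Editor's note: the per-leaf scan in force_qs_rnp() pairs a CPU number with a bit that walks up the leaf's mask, collecting the CPUs that check out as quiescent so they can be reported in one batch. The iteration idiom in isolation, with a hypothetical predicate name:

	unsigned long bit, mask = 0;
	int cpu;

	bit = 1;
	for (cpu = lo; cpu <= hi; cpu++, bit <<= 1)
		if ((pending & bit) && cpu_is_quiescent(cpu))	/* hypothetical */
			mask |= bit;
	/* 'mask' now names every newly quiescent CPU in this group. */]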
@@ -2485,14 +2789,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
return;
}
rnp_old = rnp;
@@ -2504,13 +2808,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -2569,7 +2874,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
* Schedule RCU callback invocation. If the specified type of RCU
* does not support RCU priority boosting, just do a direct call,
* otherwise wake up the per-CPU kernel kthread. Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
* rcu_cpu_kthread_task cannot disappear out from under us.
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -2601,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
*/
- if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+ if (!rcu_is_watching())
invoke_rcu_core();
/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2662,7 +2967,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2687,13 +2992,24 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (cpu != -1)
rdp = per_cpu_ptr(rsp->rda, cpu);
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* _call_rcu() is illegal on offline CPU; leak the callback. */
- local_irq_restore(flags);
- return;
+ if (likely(rdp->mynode)) {
+ /* Post-boot, so this should be for a no-CBs CPU. */
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+ WARN_ON_ONCE(offline);
+ /* Offline CPU, _call_rcu() illegal, leak callback. */
+ local_irq_restore(flags);
+ return;
+ }
+ /*
+ * Very early boot, before rcu_init(). Initialize if needed
+ * and then drop through to queue the callback.
+ */
+ BUG_ON(cpu != -1);
+ WARN_ON_ONCE(!rcu_is_watching());
+ if (unlikely(!rdp->nxtlist))
+ init_default_callback_list(rdp);
}
- ACCESS_ONCE(rdp->qlen)++;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
rdp->qlen_lazy++;
else
@@ -2814,7 +3130,7 @@ void synchronize_sched(void)
"Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_sched_expedited();
else
wait_rcu_gp(call_rcu_sched);
@@ -2841,7 +3157,7 @@ void synchronize_rcu_bh(void)
"Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_bh_expedited();
else
wait_rcu_gp(call_rcu_bh);
@@ -2928,11 +3244,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock.
- *
* This implementation can be thought of as an application of ticket
* locking to RCU, with sync_sched_expedited_started and
* sync_sched_expedited_done taking on the roles of the halves
@@ -2956,6 +3267,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
+ cpumask_var_t cm;
+ bool cma = false;
+ int cpu;
long firstsnap, s, snap;
int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
@@ -2982,14 +3296,34 @@ void synchronize_sched_expedited(void)
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
firstsnap = snap;
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ return;
+ }
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
+ if (cma) {
+ cpumask_copy(cm, cpu_online_mask);
+ cpumask_clear_cpu(raw_smp_processor_id(), cm);
+ for_each_cpu(cpu, cm) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ cpumask_clear_cpu(cpu, cm);
+ }
+ if (cpumask_weight(cm) == 0)
+ goto all_cpus_idle;
+ }
+
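[Editor's note: the pre-scan just above exploits RCU's dynticks convention: each CPU's counter is incremented on every transition into and out of idle, so an even value means the CPU is idle now, hence already quiescent for this expedited grace period, and it can be dropped from the expensive try_stop_cpus() set. The even/odd test in isolation, with hypothetical names; atomic_add_return(0, ...) is the kernel's full-barrier read:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Odd count: CPU is non-idle and must be interrupted. */
	static bool cpu_needs_stop(atomic_int *dynticks)
	{
		return atomic_fetch_add(dynticks, 0) & 0x1;
	}]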
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cpu_online_mask,
+ while (try_stop_cpus(cma ? cm : cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
@@ -3001,6 +3335,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
+ free_cpumask_var(cm);
return;
}
@@ -3010,6 +3345,7 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
@@ -3019,6 +3355,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
+ free_cpumask_var(cm);
return;
}
@@ -3029,12 +3366,21 @@ void synchronize_sched_expedited(void)
* and they started after our first try, so their grace
* period works for us.
*/
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, use normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
+ return;
+ }
snap = atomic_long_read(&rsp->expedited_start);
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
+all_cpus_idle:
+ free_cpumask_var(cm);
+
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3079,9 +3425,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce) {
+ rdp->qs_pending && !rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_qs_pending++;
- } else if (rdp->qs_pending && rdp->passed_quiesce) {
+ } else if (rdp->qs_pending &&
+ (rdp->passed_quiesce ||
+ rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3105,7 +3454,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+ if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3126,12 +3476,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
-static int rcu_pending(int cpu)
+static int rcu_pending(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
return 1;
return 0;
}
@@ -3141,7 +3491,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3149,7 +3499,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!rdp->nxtlist)
continue;
hc = true;
@@ -3257,7 +3607,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
* ACCESS_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3282,11 +3632,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
- atomic_inc(&rsp->barrier_cpu_count);
- __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
- rsp, cpu, 0);
+ if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
+ _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ rsp->n_barrier_done);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ rsp->n_barrier_done);
+ smp_mb__before_atomic();
+ atomic_inc(&rsp->barrier_cpu_count);
+ __call_rcu(&rdp->barrier_head,
+ rcu_barrier_callback, rsp, cpu, 0);
+ }
} else if (ACCESS_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
@@ -3307,7 +3663,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3338,6 +3694,28 @@ void rcu_barrier_sched(void)
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
/*
+ * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online. The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (rnp == NULL)
+ return;
+ raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ rnp->qsmaskinit |= mask;
+ raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ }
+}
+
+/*
* Do boot-time initialization of a CPU's per-CPU RCU data.
*/
static void __init
@@ -3350,9 +3728,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- init_callback_list(rdp);
- rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3376,48 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
- /* Exclude new grace periods. */
- mutex_lock(&rsp->onoff_mutex);
-
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ if (!rdp->nxtlist)
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /* Add CPU to rcu_node bitmasks. */
+ /*
+ * Add CPU to leaf rcu_node pending-online bitmask. Any needed
+ * propagation up the rcu_node tree will happen at the beginning
+ * of the next grace period.
+ */
rnp = rdp->mynode;
mask = rdp->grpmask;
- do {
- /* Exclude any attempts to start a new GP on small systems. */
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->qsmaskinit |= mask;
- mask = rnp->grpmask;
- if (rnp == rdp->mynode) {
- /*
- * If there is a grace period in progress, we will
- * set up to wait for it next time we run the
- * RCU core code.
- */
- rdp->gpnum = rnp->completed;
- rdp->completed = rnp->completed;
- rdp->passed_quiesce = 0;
- rdp->qs_pending = 0;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
- }
- raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
- rnp = rnp->parent;
- } while (rnp != NULL && !(rnp->qsmaskinit & mask));
- local_irq_restore(flags);
-
- mutex_unlock(&rsp->onoff_mutex);
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rnp->qsmaskinitnext |= mask;
+ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+ rdp->completed = rnp->completed;
+ rdp->passed_quiesce = false;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->qs_pending = false;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
static void rcu_prepare_cpu(int cpu)
@@ -3431,20 +3795,20 @@ static void rcu_prepare_cpu(int cpu)
/*
* Handle CPU online/offline notification events.
*/
-static int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- trace_rcu_utilization(TPS("Start CPU hotplug"));
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_prepare_cpu(cpu);
rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
@@ -3458,17 +3822,23 @@ static int rcu_cpu_notify(struct notifier_block *self,
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
break;
+ case CPU_DYING_IDLE:
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dying_idle_cpu(cpu, rsp);
+ }
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp)
+ for_each_rcu_flavor(rsp) {
rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
break;
default:
break;
}
- trace_rcu_utilization(TPS("End CPU hotplug"));
return NOTIFY_OK;
}
@@ -3479,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self,
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedited = 1;
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- rcu_expedited = 0;
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3492,24 +3863,44 @@ static int rcu_pm_notify(struct notifier_block *self,
}
/*
- * Spawn the kthread that handles this RCU flavor's grace periods.
+ * Spawn the kthreads that handle each RCU flavor's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
+ int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct rcu_state *rsp;
+ struct sched_param sp;
struct task_struct *t;
+ /* Force priority into range. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+ kthread_prio, kthread_prio_in);
+
+ rcu_scheduler_fully_active = 1;
for_each_rcu_flavor(rsp) {
- t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+ t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
rsp->gp_kthread = t;
+ if (kthread_prio) {
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ wake_up_process(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rcu_spawn_nocb_kthreads(rsp);
}
+ rcu_spawn_nocb_kthreads();
+ rcu_spawn_boost_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
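[Editor's note: the clamp at the top of rcu_spawn_gp_kthread() keeps the rcutree.kthread_prio parameter in the valid SCHED_FIFO range: 1-99 when CONFIG_RCU_BOOST requires a real-time priority, 0-99 otherwise. As a worked example, kthread_prio=150 is limited to 99 and, on a boost-enabled kernel, kthread_prio=0 is raised to 1, each case emitting the pr_alert() notice. The same logic as a pure function, sketch only:

	static int clamp_prio(int prio, bool boost)
	{
		if (boost && prio < 1)
			return 1;	/* boosting needs a real-time priority */
		if (prio < 0)
			return 0;	/* SCHED_NORMAL fallback */
		if (prio > 99)
			return 99;	/* SCHED_FIFO maximum */
		return prio;
	}]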
@@ -3533,30 +3924,26 @@ void rcu_scheduler_starting(void)
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
*/
-#ifdef CONFIG_RCU_FANOUT_EXACT
static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-}
-#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int ccur;
- int cprv;
- int i;
-
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+ rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = rsp->levelcnt[i];
+ rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
}
}
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
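[Editor's note: the balancing branch computes each level's spread as the integer ceiling of (units below) / (groups at this level), walking from the leaves up. For example, nr_cpu_ids = 96 with levelcnt[] = {1, 6} gives levelspread[1] = ceil(96/6) = 16 CPUs per leaf and levelspread[0] = ceil(6/1) = 6 leaves under the root. The idiom on its own, with hypothetical parameter names:

	/* (a + b - 1) / b is ceiling division for positive b. */
	static void compute_spread(int *spread, const int *cnt, int lvls, int ncpus)
	{
		int below = ncpus;

		for (int i = lvls - 1; i >= 0; i--) {
			spread[i] = (below + cnt[i] - 1) / cnt[i];
			below = cnt[i];
		}
	}]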
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
@@ -3564,14 +3951,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static char *fqs[] = { "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
@@ -3630,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
}
- rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
@@ -3723,6 +4111,8 @@ void __init rcu_init(void)
{
int cpu;
+ rcu_early_boot_tests();
+
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0f69a79c5b7d..a69d3dab2ec4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
-#include <linux/irq_work.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -139,15 +138,23 @@ struct rcu_node {
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
- /* complete (only for TREE_PREEMPT_RCU). */
+ /* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
+ /* Initialized from ->qsmaskinitnext at the */
+ /* beginning of each grace period. */
+ unsigned long qsmaskinitnext;
+ /* Online CPUs for next grace period. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
int grphi; /* highest-numbered CPU or group here. */
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
+ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
+ /* exit RCU read-side critical sections */
+ /* before propagating offline up the */
+ /* rcu_node tree? */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* Tasks blocked in RCU read-side critical */
@@ -172,6 +179,9 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
@@ -249,9 +259,12 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
+ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
+ /* for rcu_all_qs() invocations. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
+ bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -332,13 +345,25 @@ struct rcu_data {
#ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head *nocb_head; /* CBs waiting for kthread. */
struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
- atomic_long_t nocb_q_count_lazy; /* (approximate). */
- int nocb_p_count; /* # CBs being invoked by kthread */
- int nocb_p_count_lazy; /* (approximate). */
+ atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
+ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
- bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cacheline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
@@ -357,6 +382,11 @@ struct rcu_data {
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
+/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOGP_WAKE_NOT 0
+#define RCU_NOGP_WAKE 1
+#define RCU_NOGP_WAKE_FORCE 2
+
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs */
/* and jiffies_till_next_fqs. */
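[Editor's note: since each comparison contributes 0 or 1, RCU_JIFFIES_TILL_FORCE_QS works out to 1 jiffy for HZ <= 250, 2 jiffies for 250 < HZ <= 500, and 3 jiffies for HZ > 500, i.e. roughly 3 to 10 ms of slack before quiescent states are forced, regardless of the configured tick rate.]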
@@ -426,8 +456,6 @@ struct rcu_state {
long qlen; /* Total number of callbacks. */
/* End of fields guarded by orphan_lock. */
- struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -457,10 +485,14 @@ struct rcu_state {
/* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
+ unsigned long gp_activity; /* Time of last GP kthread */
+ /* activity in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
+ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
+ /* GP start. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
const char *name; /* Name of structure. */
@@ -483,13 +515,6 @@ extern struct list_head rcu_struct_flavors;
#define for_each_rcu_flavor(rsp) \
list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
-/* Return values for rcu_preempt_offline_tasks(). */
-
-#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
- /* GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
- /* GP were moved to root. */
-
/*
* RCU implementation internal declarations:
*/
@@ -499,10 +524,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -515,27 +540,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
+static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
- unsigned long flags);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -546,15 +560,18 @@ static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(int cpu);
-static void rcu_prepare_for_idle(int cpu);
+static void rcu_cleanup_after_idle(void);
+static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -563,14 +580,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags);
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
+static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void __init rcu_spawn_nocb_kthreads(void);
+#ifdef CONFIG_RCU_NOCB_CPU
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_enter(int irq);
+static void rcu_sysidle_exit(int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
@@ -579,22 +600,21 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
static void rcu_bind_gp_kthread(void);
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
+static void rcu_dynticks_task_enter(void);
+static void rcu_dynticks_task_exit(void);
#endif /* #ifndef RCU_TREE_NONCORE */
#ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
-/* Sum up queue lengths for tracing. */
+/* Read out queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
-}
+#ifdef CONFIG_RCU_NOCB_CPU
+ *ql = atomic_long_read(&rdp->nocb_q_count);
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
*ql = 0;
*qll = 0;
-}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac0fb186b8..8c0ec0f5a027 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,19 +30,25 @@
#include <linux/smpboot.h>
#include "../time/tick-internal.h"
-#define RCU_KTHREAD_PRIO 1
-
#ifdef CONFIG_RCU_BOOST
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else
-#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
-#endif
+
+#include "../locking/rtmutex_common.h"
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static char __initdata nocb_buf[NR_CPUS * 5];
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/*
@@ -52,73 +58,43 @@ static char __initdata nocb_buf[NR_CPUS * 5];
*/
static void __init rcu_bootup_announce_oddness(void)
{
-#ifdef CONFIG_RCU_TRACE
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
-#endif
-#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
- pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
-#endif
-#ifdef CONFIG_RCU_FANOUT_EXACT
- pr_info("\tHierarchical RCU autobalancing is disabled.\n");
-#endif
-#ifdef CONFIG_RCU_FAST_NO_HZ
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-#endif
-#ifdef CONFIG_PROVE_RCU
- pr_info("\tRCU lockdep checking is enabled.\n");
-#endif
-#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
- pr_info("\tRCU torture testing starts during boot.\n");
-#endif
-#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
- pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
-#endif
-#if defined(CONFIG_RCU_CPU_STALL_INFO)
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
-#endif
-#if NUM_RCU_LVL_4 != 0
- pr_info("\tFour-level hierarchy is enabled.\n");
-#endif
+ if (IS_ENABLED(CONFIG_RCU_TRACE))
+ pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ CONFIG_RCU_FANOUT);
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
+ pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
+ pr_info("\tRCU torture testing starts during boot.\n");
+ if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
+ pr_info("\tAdditional per-CPU info printed with stalls.\n");
+ if (NUM_RCU_LVL_4 != 0)
+ pr_info("\tFour-level hierarchy is enabled.\n");
+ if (CONFIG_RCU_FANOUT_LEAF != 16)
+ pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
+ CONFIG_RCU_FANOUT_LEAF);
if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-#ifdef CONFIG_RCU_NOCB_CPU
-#ifndef CONFIG_RCU_NOCB_CPU_NONE
- if (!have_rcu_nocb_mask) {
- zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
- have_rcu_nocb_mask = true;
- }
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
- if (have_rcu_nocb_mask) {
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
- pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
- }
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ if (IS_ENABLED(CONFIG_RCU_BOOST))
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
}
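[Editor's note: the rewrite above is the standard #ifdef-to-IS_ENABLED() conversion: IS_ENABLED(CONFIG_X) expands to a constant 1 or 0, so disabled branches are still parsed and type-checked but folded away as dead code. The pattern in miniature, with CONFIG_FOO standing in for any bool Kconfig symbol:

	static void announce_foo(void)
	{
	#ifdef CONFIG_FOO			/* old style: hidden from the  */
		pr_info("foo\n");		/* compiler when the option is */
	#endif					/* off                         */

		if (IS_ENABLED(CONFIG_FOO))	/* new style: always parsed,   */
			pr_info("foo\n");	/* discarded when off          */
	}]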
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake);
/*
* Tell them what RCU they are running.
@@ -130,42 +106,24 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU-preempt batches processed thus far
- * for debug and statistics.
- */
-long rcu_batches_completed_preempt(void)
-{
- return rcu_preempt_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
-
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_preempt();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
*
- * Unlike the other rcu_*_qs() functions, callers to this function
- * must disable irqs in order to protect the assignment to
- * ->rcu_read_unlock_special.
- */
-static void rcu_preempt_qs(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
- if (rdp->passed_quiesce == 0)
- trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
- rdp->passed_quiesce = 1;
- current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ * As with the other rcu_*_qs() functions, callers to this function
+ * must disable preemption.
+ */
+static void rcu_preempt_qs(void)
+{
+ if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_preempt"),
+ __this_cpu_read(rcu_preempt_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
+ current->rcu_read_unlock_special.b.need_qs = false;
+ }
}
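[Editor's note: rcu_preempt_qs() now reaches its own CPU's rcu_data purely through __this_cpu_read()/__this_cpu_write(), which drops the cpu argument, uses cheaper segment-relative addressing where available, and documents that the caller runs with preemption disabled. The access pattern in general form, with a hypothetical per-CPU variable:

	DEFINE_PER_CPU(int, my_counter);

	/* Caller must disable preemption around the read-modify-write. */
	static void bump_this_cpu(void)
	{
		__this_cpu_write(my_counter, __this_cpu_read(my_counter) + 1);
	}]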
/*
@@ -181,7 +139,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -189,14 +147,14 @@ static void rcu_preempt_note_context_switch(int cpu)
struct rcu_node *rnp;
if (t->rcu_read_lock_nesting > 0 &&
- (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+ !t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
/*
@@ -217,7 +175,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* But first, note that the current CPU must still be
* on line!
*/
- WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
+ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -238,7 +196,7 @@ static void rcu_preempt_note_context_switch(int cpu)
: rnp->gpnum + 1);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special) {
+ t->rcu_read_unlock_special.s) {
/*
* Complete exit from RCU read-side critical section on
@@ -256,9 +214,7 @@ static void rcu_preempt_note_context_switch(int cpu)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
- local_irq_save(flags);
- rcu_preempt_qs(cpu);
- local_irq_restore(flags);
+ rcu_preempt_qs();
}
/*
@@ -272,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
- * Record a quiescent state for all tasks that were previously queued
- * on the specified rcu_node structure and that were blocking the current
- * RCU grace period. The caller must hold the specified rnp->lock with
- * irqs disabled, and this lock is released upon return, but irqs remain
- * disabled.
- */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
- struct rcu_node *rnp_p;
-
- if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return; /* Still need more quiescent states! */
- }
-
- rnp_p = rnp->parent;
- if (rnp_p == NULL) {
- /*
- * Either there is only one rcu_node in the tree,
- * or tasks were kicked up to root rcu_node due to
- * CPUs going offline.
- */
- rcu_report_qs_rsp(&rcu_preempt_state, flags);
- return;
- }
-
- /* Report up the rest of the hierarchy. */
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
-}
-
-/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
@@ -324,22 +243,31 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
}
/*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+ return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
void rcu_read_unlock_special(struct task_struct *t)
{
- int empty;
- int empty_exp;
- int empty_exp_now;
+ bool empty_exp;
+ bool empty_norm;
+ bool empty_exp_now;
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rbmp = NULL;
+ bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
- int special;
+ union rcu_special special;
/* NMI handlers cannot block and cannot safely manipulate state. */
if (in_nmi())
@@ -349,26 +277,34 @@ void rcu_read_unlock_special(struct task_struct *t)
/*
* If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so.
+ * let it know that we have done so. Because irqs are disabled,
+ * t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- if (special & RCU_READ_UNLOCK_NEED_QS) {
- rcu_preempt_qs(smp_processor_id());
- if (!t->rcu_read_unlock_special) {
+ if (special.b.need_qs) {
+ rcu_preempt_qs();
+ t->rcu_read_unlock_special.b.need_qs = false;
+ if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
}
}
/* Hardware IRQ handlers cannot block, complain if they get here. */
- if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+ if (in_irq() || in_serving_softirq()) {
+ lockdep_rcu_suspicious(__FILE__, __LINE__,
+ "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
+ pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+ t->rcu_read_unlock_special.s,
+ t->rcu_read_unlock_special.b.blocked,
+ t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags);
return;
}
/* Clean up if blocked during RCU read-side critical section. */
- if (special & RCU_READ_UNLOCK_BLOCKED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+ if (special.b.blocked) {
+ t->rcu_read_unlock_special.b.blocked = false;
/*
* Remove this task from the list it blocked on. The
@@ -383,7 +319,7 @@ void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempt_blocked_readers_cgp(rnp);
+ empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
@@ -398,11 +334,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
- /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
- if (t->rcu_boost_mutex) {
- rbmp = t->rcu_boost_mutex;
- t->rcu_boost_mutex = NULL;
- }
+ /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -412,7 +345,7 @@ void rcu_read_unlock_special(struct task_struct *t)
* so we must take a snapshot of the expedited state.
*/
empty_exp_now = !rcu_preempted_readers_exp(rnp);
- if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
0, rnp->qsmask,
@@ -420,15 +353,16 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(rnp, flags);
+ rcu_report_unblock_qs_rnp(&rcu_preempt_state,
+ rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (rbmp)
- rt_mutex_unlock(rbmp);
+ if (drop_boost_mutex)
+ rt_mutex_unlock(&rnp->boost_mtx);
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -442,8 +376,6 @@ void rcu_read_unlock_special(struct task_struct *t)
}
}
-#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
-
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period on the specified rcu_node structure.
@@ -478,14 +410,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
#ifdef CONFIG_RCU_CPU_STALL_INFO
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -546,101 +470,11 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (!list_empty(&rnp->blkd_tasks))
+ if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline. Move them up to the root
- * rcu_node. The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- struct list_head *lp;
- struct list_head *lp_root;
- int retval = 0;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *t;
-
- if (rnp == rnp_root) {
- WARN_ONCE(1, "Last CPU thought to be offlined?");
- return 0; /* Shouldn't happen: at least one CPU online. */
- }
-
- /* If we are on an internal node, complain bitterly. */
- WARN_ON_ONCE(rnp != rdp->mynode);
-
- /*
- * Move tasks up to root rcu_node. Don't try to get fancy for
- * this corner-case operation -- just put this node's tasks
- * at the head of the root node's list, and update the root node's
- * ->gp_tasks and ->exp_tasks pointers to those of this node's,
- * if non-NULL. This might result in waiting for more tasks than
- * absolutely necessary, but this is a good performance/complexity
- * tradeoff.
- */
- if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
- retval |= RCU_OFL_TASKS_NORM_GP;
- if (rcu_preempted_readers_exp(rnp))
- retval |= RCU_OFL_TASKS_EXP_GP;
- lp = &rnp->blkd_tasks;
- lp_root = &rnp_root->blkd_tasks;
- while (!list_empty(lp)) {
- t = list_entry(lp->next, typeof(*t), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- list_del(&t->rcu_node_entry);
- t->rcu_blocked_node = rnp_root;
- list_add(&t->rcu_node_entry, lp_root);
- if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp_root->gp_tasks = rnp->gp_tasks;
- if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
- }
-
- rnp->gp_tasks = NULL;
- rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
- rnp->boost_tasks = NULL;
- /*
- * In case root is being boosted and leaf was not. Make sure
- * that we boost the tasks blocking the current grace period
- * in this case.
- */
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- if (rnp_root->boost_tasks != NULL &&
- rnp_root->boost_tasks != rnp_root->gp_tasks &&
- rnp_root->boost_tasks != rnp_root->exp_tasks)
- rnp_root->boost_tasks = rnp_root->gp_tasks;
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- return retval;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
@@ -648,17 +482,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
*
* Caller must disable hard irqs.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
struct task_struct *t = current;
if (t->rcu_read_lock_nesting == 0) {
- rcu_preempt_qs(cpu);
+ rcu_preempt_qs();
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- per_cpu(rcu_preempt_data, cpu).qs_pending)
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+ __this_cpu_read(rcu_preempt_data.qs_pending) &&
+ !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ t->rcu_read_unlock_special.b.need_qs = true;
}
#ifdef CONFIG_RCU_BOOST
@@ -701,7 +536,7 @@ void synchronize_rcu(void)
"Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
wait_rcu_gp(call_rcu);
@@ -746,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Most callers will set the "wake" flag, but the task initiating the
- * expedited grace period need not wake itself.
- *
* Caller must hold sync_rcu_preempt_exp_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -783,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure. If there are no such
- * tasks, report it up the rcu_node hierarchy.
+ * grace period for the specified rcu_node structure, phase 1. If there
+ * are such tasks, set the ->expmask bits up the rcu_node tree and also
+ * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
+ * that work is needed here.
*
- * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
- * CPU hotplug operations.
+ * Caller must hold sync_rcu_preempt_exp_mutex.
*/
static void
-sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
{
unsigned long flags;
- int must_wait = 0;
+ unsigned long mask;
+ struct rcu_node *rnp_up;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (list_empty(&rnp->blkd_tasks)) {
+ WARN_ON_ONCE(rnp->expmask);
+ WARN_ON_ONCE(rnp->exp_tasks);
+ if (!rcu_preempt_has_tasks(rnp)) {
+ /* No blocked tasks, nothing to do. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else {
+ return;
+ }
+ /* Call for Phase 2 and propagate ->expmask bits up the tree. */
+ rnp->expmask = 1;
+ rnp_up = rnp;
+ while (rnp_up->parent) {
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ if (rnp_up->expmask & mask)
+ break;
+ raw_spin_lock(&rnp_up->lock); /* irqs already off */
+ smp_mb__after_unlock_lock();
+ rnp_up->expmask |= mask;
+ raw_spin_unlock(&rnp_up->lock); /* irqs still off */
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 2. If the
+ * leaf rcu_node structure has its ->expmask field set, check for tasks.
+ * If there are some, clear ->expmask and set ->exp_tasks accordingly,
+ * then initiate RCU priority boosting. Otherwise, clear ->expmask and
+ * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
+ * enabling rcu_read_unlock_special() to do the bit-clearing.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rnp->expmask) {
+ /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+
+ /* Phase 1 is over. */
+ rnp->expmask = 0;
+
+ /*
+ * If there are still blocked tasks, set up ->exp_tasks so that
+ * rcu_read_unlock_special() will wake us and then boost them.
+ */
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->exp_tasks = rnp->blkd_tasks.next;
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- must_wait = 1;
+ return;
}
- if (!must_wait)
- rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
+
+ /* No longer any blocked tasks, so undo bit setting. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_report_exp_rnp(rsp, rnp, false);
}
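
Stripped of locking and diagnostics, the upward propagation in phase 1
reduces to the following sketch (a hypothetical userspace model; node,
grpmask, and expmask stand in for their rcu_node counterparts):

	struct node {
		struct node *parent;
		unsigned long expmask;	/* Children still blocking the expedited GP. */
		unsigned long grpmask;	/* This node's bit in its parent's expmask. */
	};

	/* Mark a leaf as blocking and propagate its bit toward the root. */
	static void set_exp_up(struct node *leaf)
	{
		struct node *up = leaf;

		leaf->expmask = 1;
		while (up->parent) {
			unsigned long mask = up->grpmask;

			up = up->parent;
			if (up->expmask & mask)
				break;	/* Already propagated this far. */
			up->expmask |= mask;
		}
	}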
/**
@@ -819,15 +707,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
* In fact, if you are using synchronize_rcu_expedited() in a loop,
 * please restructure your code to batch your updates, and then use a
* single synchronize_rcu() instead.
- *
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock.
*/
void synchronize_rcu_expedited(void)
{
- unsigned long flags;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_preempt_state;
unsigned long snap;
@@ -845,7 +727,11 @@ void synchronize_rcu_expedited(void)
* being boosted. This simplifies the process of moving tasks
* from leaf to root rcu_node structures.
*/
- get_online_cpus();
+ if (!try_get_online_cpus()) {
+ /* CPU-hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu);
+ return;
+ }
/*
* Acquire lock, falling back to synchronize_rcu() if too many
@@ -874,19 +760,16 @@ void synchronize_rcu_expedited(void)
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
- /* Initialize ->expmask for all non-leaf rcu_node structures. */
- rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rnp->expmask = rnp->qsmaskinit;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
-
- /* Snapshot current state of ->blkd_tasks lists. */
+ /*
+ * Snapshot current state of ->blkd_tasks lists into ->expmask.
+ * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
+ * to start clearing them. Doing this in one phase leads to
+ * strange races between setting and clearing bits, so just say "no"!
+ */
rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init(rsp, rnp);
- if (NUM_RCU_NODES > 1)
- sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+ sync_rcu_preempt_exp_init1(rsp, rnp);
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init2(rsp, rnp);
put_online_cpus();
@@ -897,7 +780,8 @@ void synchronize_rcu_expedited(void)
/* Clean up and exit. */
smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
+ ACCESS_ONCE(sync_rcu_preempt_exp_count) =
+ sync_rcu_preempt_exp_count + 1;
unlock_mb_ret:
mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
@@ -941,11 +825,11 @@ void exit_rcu(void)
return;
t->rcu_read_lock_nesting = 1;
barrier();
- t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_read_unlock_special.b.blocked = true;
__rcu_read_unlock();
}
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *rcu_state_p = &rcu_sched_state;
@@ -959,19 +843,10 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
}
@@ -984,16 +859,14 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return false;
}
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
@@ -1021,28 +894,11 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- return 0;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
}
@@ -1056,20 +912,6 @@ void synchronize_rcu_expedited(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
@@ -1095,7 +937,7 @@ void exit_rcu(void)
{
}
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
@@ -1105,7 +947,7 @@ void exit_rcu(void)
static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
- if (list_empty(&rnp->blkd_tasks))
+ if (!rcu_preempt_has_tasks(rnp))
rnp->n_balk_blkd_tasks++;
else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
rnp->n_balk_exp_gp_tasks++;
@@ -1149,11 +991,11 @@ static void rcu_wake_cond(struct task_struct *t, int status)
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
- struct rt_mutex mtx;
struct task_struct *t;
struct list_head *tb;
- if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
+ ACCESS_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1200,11 +1042,11 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+ /* Lock only for side effect: boosts task t's priority. */
+ rt_mutex_lock(&rnp->boost_mtx);
+ rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
@@ -1256,6 +1098,7 @@ static int rcu_boost_kthread(void *arg)
* about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
struct task_struct *t;
@@ -1323,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
+ struct rcu_node *rnp)
{
int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
@@ -1333,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
if (&rcu_preempt_state != rsp)
return 0;
- if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
return 0;
rsp->boost = 1;
@@ -1347,7 +1190,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
smp_mb__after_unlock_lock();
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_BOOST_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
@@ -1364,7 +1207,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
@@ -1426,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rnp->qsmaskinit;
+ unsigned long mask = rcu_rnp_online_cpus(rnp);
cpumask_var_t cm;
int cpu;
@@ -1437,12 +1280,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0) {
+ if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
- cpumask_clear_cpu(cpu, cm);
- WARN_ON_ONCE(cpumask_weight(cm) == 0);
- }
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
@@ -1457,26 +1296,19 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = {
};
/*
- * Spawn all kthreads -- called as soon as the scheduler is running.
+ * Spawn boost kthreads -- called as soon as the scheduler is running.
*/
-static int __init rcu_spawn_kthreads(void)
+static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
int cpu;
- rcu_scheduler_fully_active = 1;
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rnp = rcu_get_root(rcu_state_p);
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state_p, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- }
- return 0;
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
-early_initcall(rcu_spawn_kthreads);
static void rcu_prepare_kthreads(int cpu)
{
@@ -1491,6 +1323,7 @@ static void rcu_prepare_kthreads(int cpu)
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1513,12 +1346,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static int __init rcu_scheduler_really_started(void)
+static void __init rcu_spawn_boost_kthreads(void)
{
- rcu_scheduler_fully_active = 1;
- return 0;
}
-early_initcall(rcu_scheduler_really_started);
static void rcu_prepare_kthreads(int cpu)
{
@@ -1538,10 +1368,10 @@ static void rcu_prepare_kthreads(int cpu)
* any flavor of RCU.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu, NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1549,7 +1379,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
* after it.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
}
@@ -1557,7 +1387,7 @@ static void rcu_cleanup_after_idle(int cpu)
* Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
* is nothing.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
}
@@ -1619,7 +1449,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
/* Exit early if we advanced recently. */
if (jiffies == rdtp->last_advance_all)
- return 0;
+ return false;
rdtp->last_advance_all = jiffies;
for_each_rcu_flavor(rsp) {
@@ -1631,7 +1461,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* completed since we last checked and there are
* callbacks not yet ready to invoke.
*/
- if (rdp->completed != rnp->completed &&
+ if ((rdp->completed != rnp->completed ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1650,15 +1481,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* The caller must have disabled interrupts.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *dj)
+int rcu_needs_cpu(unsigned long *dj)
{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
*dj = ULONG_MAX;
return 0;
}
@@ -1692,12 +1523,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*
* The caller must have disabled interrupts.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
@@ -1705,7 +1536,7 @@ static void rcu_prepare_for_idle(int cpu)
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu, NULL))
+ if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1714,7 +1545,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/*
@@ -1738,7 +1569,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
@@ -1757,10 +1588,10 @@ static void rcu_prepare_for_idle(int cpu)
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1842,7 +1673,7 @@ static int rcu_oom_notify(struct notifier_block *self,
get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched();
+ cond_resched_rcu_qs();
}
put_online_cpus();
@@ -1924,11 +1755,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
+ pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -2060,6 +1892,68 @@ bool rcu_is_nocb_cpu(int cpu)
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+ if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ return;
+ if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ /* Prior smp_mb__after_atomic() orders against prior enqueue. */
+ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ wake_up(&rdp_leader->nocb_wq);
+ }
+}
+
+/*
+ * Does the specified CPU need an RCU callback for the specified flavor
+ * of rcu_barrier()?
+ */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
+ struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+ /*
+ * Check count of all no-CBs callbacks awaiting invocation.
+ * There needs to be a barrier before this function is called,
+ * but associated with a prior determination that no more
+ * callbacks would be posted. In the worst case, the first
+ * barrier in _rcu_barrier() suffices (but the caller cannot
+ * necessarily rely on this, not a substitute for the caller
+ * getting the concurrency design right!). There must also be
+ * a barrier between the following load an posting of a callback
+ * (if a callback is in fact needed). This is associated with an
+ * atomic_inc() in the caller.
+ */
+ ret = atomic_long_read(&rdp->nocb_q_count);
+
+#ifdef CONFIG_PROVE_RCU
+ rhp = ACCESS_ONCE(rdp->nocb_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+
+ /* Having no rcuo kthread but CBs after scheduler starts is bad! */
+ if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+ rcu_scheduler_fully_active) {
+ /* RCU callback enqueued before CPU first came online??? */
+ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
+ cpu, rhp->func);
+ WARN_ON_ONCE(1);
+ }
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+ return !!ret;
+}
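
A C11 userspace analogue of the ordering this comment demands (a
hypothetical sketch; in the kernel, the enqueue side's
atomic_long_add()/xchg() pairs with a full barrier on the
barrier-checking side):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_long nocb_q_count;
	static _Atomic(void *) nocb_tail;

	/* Enqueue side: the count must cover the callback before publication. */
	static void post_cb(void *cb)
	{
		atomic_fetch_add(&nocb_q_count, 1);	/* Count first ... */
		atomic_exchange(&nocb_tail, cb);	/* ... then publish. */
	}

	/* Barrier side: a full fence before the read pairs with the exchange. */
	static bool needs_barrier(void)
	{
		atomic_thread_fence(memory_order_seq_cst);
		return atomic_load(&nocb_q_count) != 0;
	}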
+
+/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2078,10 +1972,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
struct task_struct *t;
/* Enqueue the callback on the nocb list and update counts. */
+ atomic_long_add(rhcount, &rdp->nocb_q_count);
+ /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
ACCESS_ONCE(*old_rhpp) = rhp;
- atomic_long_add(rhcount, &rdp->nocb_q_count);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
+ smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
t = ACCESS_ONCE(rdp->nocb_kthread);
@@ -2093,19 +1989,28 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
- wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ /* ... if queue was empty ... */
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- rdp->nocb_defer_wakeup = true;
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
- wake_up_process(t); /* ... or if many callbacks queued. */
+ /* ... or if many callbacks queued. */
+ if (!irqs_disabled_flags(flags)) {
+ wake_nocb_leader(rdp, true);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvf"));
+ } else {
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvfIsDeferred"));
+ }
rdp->qlen_last_fqs_check = LONG_MAX / 2;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
} else {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
}
@@ -2126,7 +2031,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
{
if (!rcu_is_nocb_cpu(rdp->cpu))
- return 0;
+ return false;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
@@ -2137,7 +2042,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_callback(rdp->rsp->name, rhp,
-atomic_long_read(&rdp->nocb_q_count_lazy),
-atomic_long_read(&rdp->nocb_q_count));
- return 1;
+
+ /*
+ * If called from an extended quiescent state with interrupts
+ * disabled, invoke the RCU core in order to allow the idle-entry
+ * deferred-wakeup check to function.
+ */
+ if (irqs_disabled_flags(flags) &&
+ !rcu_is_watching() &&
+ cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ return true;
}
/*
@@ -2153,7 +2069,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return 0;
+ return false;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
@@ -2172,7 +2088,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
rsp->orphan_nxtlist = NULL;
rsp->orphan_nxttail = &rsp->orphan_nxtlist;
}
- return 1;
+ return true;
}
/*
@@ -2205,7 +2121,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
if (likely(d))
break;
- flush_signals(current);
+ WARN_ON(signal_pending(current));
trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
}
trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
@@ -2213,13 +2129,145 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
}
/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+ bool firsttime = true;
+ bool gotcbs;
+ struct rcu_data *rdp;
+ struct rcu_head **tail;
+
+wait_again:
+
+ /* Wait for callbacks to appear. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
+ wait_event_interruptible(my_rdp->nocb_wq,
+ !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ /* Memory barrier handled by smp_mb() calls below and repoll. */
+ } else if (firsttime) {
+ firsttime = false; /* Don't drown trace log with "Poll"! */
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
+ }
+
+ /*
+ * Each pass through the following loop checks a follower for CBs.
+ * We are our own first follower. Any CBs found are moved to
+ * nocb_gp_head, where they await a grace period.
+ */
+ gotcbs = false;
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs here, try next follower. */
+
+ /* Move callbacks to wait-for-GP list, which is empty. */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ gotcbs = true;
+ }
+
+ /*
+ * If there were no callbacks, sleep a bit, rescan after a
+ * memory barrier, and go retry.
+ */
+ if (unlikely(!gotcbs)) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(1);
+
+ /* Rescan in case we were a victim of memory ordering. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+ if (ACCESS_ONCE(rdp->nocb_head)) {
+ /* Found CB, so short-circuit next wait. */
+ my_rdp->nocb_leader_sleep = false;
+ break;
+ }
+ goto wait_again;
+ }
+
+ /* Wait for one grace period. */
+ rcu_nocb_wait_gp(my_rdp);
+
+ /*
+ * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+ * We set it now, but recheck for new callbacks while
+ * traversing our follower list.
+ */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
+
+ /* Each pass through the following loop wakes a follower, if needed. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ if (ACCESS_ONCE(rdp->nocb_head))
+ my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs, so no need to wake follower. */
+
+ /* Append callbacks to follower's "done" list. */
+ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ *tail = rdp->nocb_gp_head;
+ smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+ /*
+ * List was empty, wake up the follower.
+ * Memory barrier supplied by the smp_mb__after_atomic() above.
+ */
+ wake_up(&rdp->nocb_wq);
+ }
+ }
+
+ /* If we (the leader) don't have CBs, go wait some more. */
+ if (!my_rdp->nocb_follower_head)
+ goto wait_again;
+}
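
The xchg()-based list handoff above can be modeled in isolation with
C11 atomics (a hypothetical sketch of the enqueue in
__call_rcu_nocb_enqueue() and the detach in nocb_leader_wait(), with
the kernel's counts and tracing omitted):

	#include <stdatomic.h>
	#include <stddef.h>

	struct cb { struct cb *next; };

	struct flist {
		struct cb *head;		/* Owned by the kthread side. */
		_Atomic(struct cb **) tail;	/* Shared enqueue point. */
	};

	/* Lock-free append: swing the tail, then link the new element in. */
	static void cb_enqueue(struct flist *f, struct cb *cb)
	{
		struct cb **old_tail;

		cb->next = NULL;
		old_tail = atomic_exchange(&f->tail, &cb->next);
		*old_tail = cb;
	}

	/* Detach all pending callbacks; *gp_tail marks the snapshot's end. */
	static struct cb *cb_detach(struct flist *f, struct cb ***gp_tail)
	{
		struct cb *list = f->head;

		if (!list)
			return NULL;
		f->head = NULL;
		*gp_tail = atomic_exchange(&f->tail, &f->head);
		return list;
	}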
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+ bool firsttime = true;
+
+ for (;;) {
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "FollowerSleep");
+ wait_event_interruptible(rdp->nocb_wq,
+ ACCESS_ONCE(rdp->nocb_follower_head));
+ } else if (firsttime) {
+ /* Don't drown trace log with "Poll"! */
+ firsttime = false;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("Poll"));
+ }
+ if (smp_load_acquire(&rdp->nocb_follower_head)) {
+ /* ^^^ Ensure CB invocation follows _head test. */
+ return;
+ }
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "WokeEmpty");
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(1);
+ }
+}
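
In C11 terms, the smp_load_acquire() above pairs with a release-side
publication by the leader (hypothetical model):

	#include <stdatomic.h>

	struct cb;
	static _Atomic(struct cb *) follower_head;

	/* Leader: release-store publishes the list contents before the wakeup. */
	static void publish(struct cb *list)
	{
		atomic_store_explicit(&follower_head, list, memory_order_release);
	}

	/* Follower: acquire-load makes the published callbacks visible. */
	static struct cb *fetch(void)
	{
		return atomic_load_explicit(&follower_head, memory_order_acquire);
	}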
+
+/*
* Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU.
+ * callbacks queued by the corresponding no-CBs CPU; however, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
*/
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
- bool firsttime = 1;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2227,44 +2275,23 @@ static int rcu_nocb_kthread(void *arg)
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
- /* If not polling, wait for next batch of callbacks. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Sleep"));
- wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
- /* Memory barrier provide by xchg() below. */
- } else if (firsttime) {
- firsttime = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Poll"));
- }
- list = ACCESS_ONCE(rdp->nocb_head);
- if (!list) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeEmpty"));
- schedule_timeout_interruptible(1);
- flush_signals(current);
- continue;
- }
- firsttime = 1;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeNonEmpty"));
+ /* Wait for callbacks. */
+ if (rdp->nocb_leader == rdp)
+ nocb_leader_wait(rdp);
+ else
+ nocb_follower_wait(rdp);
- /*
- * Extract queued callbacks, update counts, and wait
- * for a grace period to elapse.
- */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
- tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- c = atomic_long_xchg(&rdp->nocb_q_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
- ACCESS_ONCE(rdp->nocb_p_count) += c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- rcu_nocb_wait_gp(rdp);
+ /* Pull the ready-to-invoke callbacks onto local list. */
+ list = ACCESS_ONCE(rdp->nocb_follower_head);
+ BUG_ON(!list);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
+ ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
/* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+ trace_rcu_batch_start(rdp->rsp->name,
+ atomic_long_read(&rdp->nocb_q_count_lazy),
+ atomic_long_read(&rdp->nocb_q_count), -1);
c = cl = 0;
while (list) {
next = list->next;
@@ -2286,15 +2313,16 @@ static int rcu_nocb_kthread(void *arg)
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
- ACCESS_ONCE(rdp->nocb_p_count) -= c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
+ smp_mb__before_atomic(); /* _add after CB invocation. */
+ atomic_long_add(-c, &rdp->nocb_q_count);
+ atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
rdp->n_nocbs_invoked += c;
}
return 0;
}
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return ACCESS_ONCE(rdp->nocb_defer_wakeup);
}
@@ -2302,11 +2330,69 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
/* Do a deferred wakeup of rcu_nocb_kthread(). */
static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ int ndw;
+
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_up(&rdp->nocb_wq);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+ ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
+}
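
For reference, the RCU_NOGP_WAKE_* levels used above live in
kernel/rcu/tree.h; their assumed definitions:

	#define RCU_NOGP_WAKE_NOT	0	/* No deferred wakeup pending. */
	#define RCU_NOGP_WAKE		1	/* Defer an ordinary wakeup. */
	#define RCU_NOGP_WAKE_FORCE	2	/* Defer a forced wakeup. */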
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = true;
+ struct rcu_state *rsp;
+
+#ifdef CONFIG_RCU_NOCB_CPU_NONE
+ need_rcu_nocb_mask = false;
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_NONE */
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ have_rcu_nocb_mask = true;
+ }
+ if (!have_rcu_nocb_mask)
+ return;
+
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tOffload RCU callbacks from CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tOffload RCU callbacks from all CPUs\n");
+ cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_rcu_flavor(rsp) {
+ for_each_cpu(cpu, rcu_nocb_mask)
+ init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
+ rcu_organize_nocb_kthreads(rsp);
+ }
}
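
A worked example of the mask composition above (hypothetical boot
parameters):

	/*
	 * Booting with "rcu_nocbs=1-3 nohz_full=4-7" first leaves
	 * rcu_nocb_mask covering CPUs 1-3 from the boot parameter; the
	 * cpumask_or() with tick_nohz_full_mask then widens it to CPUs
	 * 1-7, so callbacks are offloaded from CPUs 1-7.
	 */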
/* Initialize per-rcu_data variables for no-CBs CPUs. */
@@ -2314,38 +2400,155 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
-/* Create a kthread for each RCU flavor for each no-CBs CPU. */
-static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
+ * brought online out of order, this can require re-organizing the
+ * leader-follower relationships.
+ */
+static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
{
- int cpu;
struct rcu_data *rdp;
+ struct rcu_data *rdp_last;
+ struct rcu_data *rdp_old_leader;
+ struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
struct task_struct *t;
- if (rcu_nocb_mask == NULL)
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
return;
+
+ /* If we didn't spawn the leader first, reorganize! */
+ rdp_old_leader = rdp_spawn->nocb_leader;
+ if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
+ rdp_last = NULL;
+ rdp = rdp_old_leader;
+ do {
+ rdp->nocb_leader = rdp_spawn;
+ if (rdp_last && rdp != rdp_spawn)
+ rdp_last->nocb_next_follower = rdp;
+ if (rdp == rdp_spawn) {
+ rdp = rdp->nocb_next_follower;
+ } else {
+ rdp_last = rdp;
+ rdp = rdp->nocb_next_follower;
+ rdp_last->nocb_next_follower = NULL;
+ }
+ } while (rdp);
+ rdp_spawn->nocb_next_follower = rdp_old_leader;
+ }
+
+ /* Spawn the kthread for this CPU and RCU flavor. */
+ t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ "rcuo%c/%d", rsp->abbr, cpu);
+ BUG_ON(IS_ERR(t));
+ ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+}
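
A walk-through of the relinking above, for a hypothetical out-of-order
online sequence:

	/*
	 * Suppose the planned chain is 0 -> 1 -> 2 with CPU 0 as leader,
	 * but CPU 1 spawns first.  The loop points every ->nocb_leader at
	 * CPU 1 and unlinks CPU 1 from the old follower chain (leaving
	 * 0 -> 2); the final assignment then prepends the old leader,
	 * yielding 1 -> 0 -> 2 with CPU 1 as the new leader.
	 */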
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthreads, spawn them.
+ */
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+ struct rcu_state *rsp;
+
+ if (rcu_scheduler_fully_active)
+ for_each_rcu_flavor(rsp)
+ rcu_spawn_one_nocb_kthread(rsp, cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_all_nocb_kthreads(cpu);
+}
+
+/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Initialize leader-follower relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
+{
+ int cpu;
+ int ls = rcu_nocb_leader_stride;
+ int nl = 0; /* Next leader. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
+
+ if (!have_rcu_nocb_mask)
+ return;
+ if (ls == -1) {
+ ls = int_sqrt(nr_cpu_ids);
+ rcu_nocb_leader_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure and
+ * spawns one rcu_nocb_kthread().
+ */
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- t = kthread_run(rcu_nocb_kthread, rdp,
- "rcuo%c/%d", rsp->abbr, cpu);
- BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp->nocb_kthread) = t;
+ if (rdp->cpu >= nl) {
+ /* New leader, set up for followers & next leader. */
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_leader = rdp;
+ rdp_leader = rdp;
+ } else {
+ /* Another follower, link to previous leader. */
+ rdp->nocb_leader = rdp_leader;
+ rdp_prev->nocb_next_follower = rdp;
+ }
+ rdp_prev = rdp;
}
}
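
The stride arithmetic partitions CPUs into leader domains of roughly
sqrt(nr_cpu_ids) CPUs each; a standalone sketch for a hypothetical
16-CPU system:

	#include <stdio.h>

	int main(void)
	{
		int ls = 4;	/* int_sqrt(16) for a 16-CPU system. */
		int nl = 0;	/* ID of the next leader. */
		int cpu;

		for (cpu = 0; cpu < 16; cpu++) {
			if (cpu >= nl) {
				/* DIV_ROUND_UP(cpu + 1, ls) * ls */
				nl = ((cpu + 1 + ls - 1) / ls) * ls;
				printf("CPU %2d: leader\n", cpu);
			} else {
				printf("CPU %2d: follower\n", cpu);
			}
		}
		return 0;	/* Leaders end up at CPUs 0, 4, 8, and 12. */
	}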
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
static bool init_nocb_callback_list(struct rcu_data *rdp)
{
- if (rcu_nocb_mask == NULL ||
- !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
+ if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
+
+ /* If there are early-boot callbacks, move them to nocb lists. */
+ if (rdp->nxtlist) {
+ rdp->nocb_head = rdp->nxtlist;
+ rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
+ atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
+ atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
+ rdp->nxtlist = NULL;
+ rdp->qlen = 0;
+ rdp->qlen_lazy = 0;
+ }
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
return true;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ WARN_ON_ONCE(1); /* Should be dead code. */
+ return false;
+}
+
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2361,21 +2564,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags)
{
- return 0;
+ return false;
}
static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp,
unsigned long flags)
{
- return 0;
+ return false;
}
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
-static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
return false;
}
@@ -2384,7 +2587,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
-static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
{
}
@@ -2415,16 +2622,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-/*
- * Define RCU flavor that holds sysidle state. This needs to be the
- * most active flavor of RCU.
- */
-#ifdef CONFIG_PREEMPT_RCU
-static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
static int full_sysidle_state; /* Current system-idle state. */
#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
@@ -2438,9 +2635,14 @@ static int full_sysidle_state; /* Current system-idle state. */
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
unsigned long j;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
/* Adjust nesting, check for fully idle. */
if (irq) {
@@ -2505,8 +2707,14 @@ void rcu_sysidle_force_exit(void)
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
/* Adjust nesting, check for already non-idle. */
if (irq) {
rdtp->dynticks_idle_nesting++;
@@ -2552,7 +2760,8 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
/*
* Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts.
+ * does not count as idle. The caller must have disabled interrupts,
+ * and must be running on tick_do_timer_cpu.
*/
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
@@ -2561,16 +2770,20 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long j;
struct rcu_dynticks *rdtp = rdp->dynticks;
+ /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
+ if (!tick_nohz_full_enabled())
+ return;
+
/*
* If some other CPU has already reported non-idle, if this is
* not the flavor of RCU that tracks sysidle state, or if this
* is an offline or the timekeeping CPU, nothing to do.
*/
- if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+ if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
- if (rcu_gp_in_progress(rdp->rsp))
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+ /* Verify affinity of current kthread. */
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
/* Pick up current idle and NMI-nesting counter and check. */
cur = atomic_read(&rdtp->dynticks_idle);
@@ -2592,7 +2805,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
*/
static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
- return rsp == rcu_sysidle_state;
+ return rsp == rcu_state_p;
}
/*
@@ -2670,7 +2883,7 @@ static void rcu_sysidle_cancel(void)
static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
unsigned long maxj, bool gpkt)
{
- if (rsp != rcu_sysidle_state)
+ if (rsp != rcu_state_p)
return; /* Wrong flavor, ignore. */
if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
return; /* Running state machine from timekeeping CPU. */
@@ -2687,6 +2900,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
rcu_sysidle_report(rsp, isidle, maxj, true);
}
@@ -2713,7 +2930,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
/*
* Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts.
+ * The caller must have disabled interrupts. This is not intended to be
+ * called unless tick_nohz_full_enabled().
*/
bool rcu_sys_is_idle(void)
{
@@ -2739,13 +2957,12 @@ bool rcu_sys_is_idle(void)
/* Scan all the CPUs looking for nonidle CPUs. */
for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+ rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
if (!isidle)
break;
}
- rcu_sysidle_report(rcu_sysidle_state,
- isidle, maxj, false);
+ rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
rss = ACCESS_ONCE(full_sysidle_state);
}
@@ -2772,7 +2989,7 @@ bool rcu_sys_is_idle(void)
* provided by the memory allocator.
*/
if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_sysidle_state) &&
+ !rcu_gp_in_progress(rcu_state_p) &&
!rsh.inuse && xchg(&rsh.inuse, 1) == 0)
call_rcu(&rsh.rh, rcu_sysidle_cb);
return false;
@@ -2788,11 +3005,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
}
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
}
@@ -2843,12 +3060,31 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
*/
static void rcu_bind_gp_kthread(void)
{
-#ifdef CONFIG_NO_HZ_FULL
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+ int __maybe_unused cpu;
- if (cpu < 0 || cpu >= nr_cpu_ids)
+ if (!tick_nohz_full_enabled())
return;
- if (raw_smp_processor_id() != cpu)
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ cpu = tick_do_timer_cpu;
+ if (cpu >= 0 && cpu < nr_cpu_ids)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+}
+
+/* Record the current task on dyntick-idle entry. */
+static void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..f92361efd0f5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
#define RCU_TREE_NONCORE
#include "tree.h"
+DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->passed_quiesce, rdp->qs_pending);
+ rdp->passed_quiesce,
+ rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
@@ -279,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_puts(m, "\n");
level = rnp->level;
}
- seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit,
+ seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
+ rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
".G"[rnp->gp_tasks != NULL],
".E"[rnp->exp_tasks != NULL],
".T"[!list_empty(&rnp->blkd_tasks)],
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index bc7883570530..1f133350da01 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,8 @@
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/tick.h>
#define CREATE_TRACE_POINTS
@@ -60,6 +62,63 @@ MODULE_ALIAS("rcupdate");
module_param(rcu_expedited, int, 0);
+#ifndef CONFIG_TINY_RCU
+
+static atomic_t rcu_expedited_nesting =
+ ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+
+/*
+ * Should normal grace-period primitives be expedited? Intended for
+ * use within RCU. Note that this function takes the rcu_expedited
+ * sysfs/boot variable into account as well as the rcu_expedite_gp()
+ * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+ * returns false is a -really- bad idea.
+ */
+bool rcu_gp_is_expedited(void)
+{
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+/**
+ * rcu_expedite_gp - Expedite future RCU grace periods
+ *
+ * After a call to this function, future calls to synchronize_rcu() and
+ * friends act as if the corresponding synchronize_rcu_expedited() function
+ * had instead been called.
+ */
+void rcu_expedite_gp(void)
+{
+ atomic_inc(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp);
+
+/**
+ * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
+ *
+ * Undo a prior call to rcu_expedite_gp(). If all prior calls to
+ * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
+ * and if the rcu_expedited sysfs/boot parameter is not set, then all
+ * subsequent calls to synchronize_rcu() and friends will return to
+ * their normal non-expedited behavior.
+ */
+void rcu_unexpedite_gp(void)
+{
+ atomic_dec(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+/*
+ * Inform RCU of the end of the in-kernel boot sequence.
+ */
+void rcu_end_inkernel_boot(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
+ rcu_unexpedite_gp();
+}
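
A minimal usage sketch of the nesting API above (the caller and its
name are hypothetical):

	static void fast_update_path(void)
	{
		rcu_expedite_gp();	/* Expedite subsequent normal grace periods. */
		synchronize_rcu();	/* Now acts like synchronize_rcu_expedited(). */
		rcu_unexpedite_gp();	/* Normal behavior once nesting drops to zero. */
	}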
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -90,11 +149,8 @@ void __rcu_read_unlock(void)
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
-#ifdef CONFIG_PROVE_RCU_DELAY
- udelay(10); /* Make preemption more probable. */
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
@@ -140,6 +196,38 @@ int notrace debug_lockdep_rcu_enabled(void)
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
/**
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
+ * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
+ * this assumes we are in an RCU read-side critical section unless it can
+ * prove otherwise. This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
+ *
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that rcu_read_lock() and the matching rcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock() in process context if the matching rcu_read_lock()
+ * was invoked from within an irq handler.
+ *
+ * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * offline from an RCU perspective, so check for those as well.
+ */
+int rcu_read_lock_held(void)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ return lock_is_held(&rcu_lock_map);
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_held);
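
Typical debug-check usage (a sketch; struct foo and my_find() are
hypothetical):

	struct foo {
		struct list_head list;
		int key;
	};

	static struct foo *my_find(struct list_head *head, int key)
	{
		struct foo *p;

		WARN_ON_ONCE(!rcu_read_lock_held());	/* Caller must be in a reader. */
		list_for_each_entry_rcu(p, head, list)
			if (p->key == key)
				return p;
		return NULL;
	}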
+
+/**
* rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
* Check for bottom half being disabled, which covers both the
@@ -168,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_rcu() instance now that a
- * grace period has elapsed.
+/**
+ * wakeme_after_rcu() - Callback function to awaken a task after grace period
+ * @head: Pointer to rcu_head member within rcu_synchronize structure
+ *
+ * Awaken the corresponding task now that a grace period has elapsed.
*/
-static void wakeme_after_rcu(struct rcu_head *head)
+void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
@@ -275,7 +360,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -350,3 +435,397 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle. As such, grace periods can take one good
+ * long time. There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like. Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs. If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Track exiting tasks in order to allow them to be waited for. */
+DEFINE_SRCU(tasks_rcu_exit_srcu);
+
+/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+module_param(rcu_task_stall_timeout, int, 0644);
+
+static void rcu_spawn_tasks_kthread(void);
+
+/*
+ * Post an RCU-tasks callback. First call must be from process context
+ * after the scheduler is fully operational.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+ unsigned long flags;
+ bool needwake;
+
+ rhp->next = NULL;
+ rhp->func = func;
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ needwake = !rcu_tasks_cbs_head;
+ *rcu_tasks_cbs_tail = rhp;
+ rcu_tasks_cbs_tail = &rhp->next;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+ if (needwake) {
+ rcu_spawn_tasks_kthread();
+ wake_up(&rcu_tasks_cbs_wq);
+ }
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
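
A usage sketch mirroring the tracing-trampoline case that motivated
RCU-tasks (struct tramp is hypothetical):

	struct tramp {
		struct rcu_head rh;
		/* ... executable trampoline body ... */
	};

	static void tramp_free_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct tramp, rh));
	}

	static void retire_tramp(struct tramp *t)
	{
		/* Free only after no task can still be executing inside it. */
		call_rcu_tasks(&t->rh, tramp_free_cb);
	}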
+
+/**
+ * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-tasks
+ * grace period has elapsed, in other words, after all currently
+ * executing rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function
+ * preambles and profiling hooks. The synchronize_rcu_tasks() function
+ * is not (yet) intended for heavy use from multiple CPUs.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-tasks read-side critical section whose beginning
+ * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
+ * having an RCU-tasks read-side critical section that extends beyond
+ * the return from synchronize_rcu_tasks() is guaranteed to have executed
+ * a full memory barrier after the beginning of synchronize_rcu_tasks()
+ * and before the beginning of that RCU-tasks read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
+ * (but again only if the system has more than one CPU).
+ */
+void synchronize_rcu_tasks(void)
+{
+ /* Complain if the scheduler has not started. */
+ rcu_lockdep_assert(rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
+
+ /* Wait for the grace period. */
+ wait_rcu_gp(call_rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
+
+/**
+ * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to do so, for example, when there are no pending callbacks.
+ */
+void rcu_barrier_tasks(void)
+{
+ /* There is only one callback queue, so this is easy. ;-) */
+ synchronize_rcu_tasks();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+
+/* See if tasks are still holding out, complain if so. */
+static void check_holdout_task(struct task_struct *t,
+ bool needreport, bool *firstreport)
+{
+ int cpu;
+
+ if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+ !ACCESS_ONCE(t->on_rq) ||
+ (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+ !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ list_del_init(&t->rcu_tasks_holdout_list);
+ put_task_struct(t);
+ return;
+ }
+ if (!needreport)
+ return;
+ if (*firstreport) {
+ pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
+ *firstreport = false;
+ }
+ cpu = task_cpu(t);
+ pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
+ t, ".I"[is_idle_task(t)],
+ "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
+ t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
+ t->rcu_tasks_idle_cpu, cpu);
+ sched_show_task(t);
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+ unsigned long lastreport;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ LIST_HEAD(rcu_tasks_holdouts);
+
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current);
+
+ /*
+ * Each pass through the following loop makes one check for
+ * newly arrived callbacks, and, if there are some, waits for
+ * one RCU-tasks grace period and then invokes the callbacks.
+ * This loop is terminated by the system going down. ;-)
+ */
+ for (;;) {
+
+ /* Pick up any new callbacks. */
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ list = rcu_tasks_cbs_head;
+ rcu_tasks_cbs_head = NULL;
+ rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+ /* If there were none, wait a bit and start over. */
+ if (!list) {
+ wait_event_interruptible(rcu_tasks_cbs_wq,
+ rcu_tasks_cbs_head);
+ if (!rcu_tasks_cbs_head) {
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(HZ/10);
+ }
+ continue;
+ }
+
+ /*
+ * Wait for all pre-existing t->on_rq and t->nvcsw
+ * transitions to complete. Invoking synchronize_sched()
+ * suffices because all these transitions occur with
+ * interrupts disabled. Without this synchronize_sched(),
+ * a read-side critical section that started before the
+ * grace period might be incorrectly seen as having started
+ * after the grace period.
+ *
+ * This synchronize_sched() also dispenses with the
+ * need for a memory barrier on the first store to
+ * ->rcu_tasks_holdout, as it forces the store to happen
+ * after the beginning of the grace period.
+ */
+ synchronize_sched();
+
+ /*
+ * There were callbacks, so we need to wait for an
+ * RCU-tasks grace period. Start off by scanning
+ * the task list for tasks that are not already
+ * voluntarily blocked. Mark these tasks and make
+ * a list of them in rcu_tasks_holdouts.
+ */
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
+ if (t != current && ACCESS_ONCE(t->on_rq) &&
+ !is_idle_task(t)) {
+ get_task_struct(t);
+ t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+ ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ list_add(&t->rcu_tasks_holdout_list,
+ &rcu_tasks_holdouts);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Wait for tasks that are in the process of exiting.
+ * This does only part of the job, ensuring that all
+ * tasks that were previously exiting reach the point
+ * where they have disabled preemption, allowing the
+ * later synchronize_sched() to finish the job.
+ */
+ synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ /*
+ * Each pass through the following loop scans the list
+ * of holdout tasks, removing any that are no longer
+ * holdouts. When the list is empty, we are done.
+ */
+ lastreport = jiffies;
+ while (!list_empty(&rcu_tasks_holdouts)) {
+ bool firstreport;
+ bool needreport;
+ int rtst;
+ struct task_struct *t1;
+
+ schedule_timeout_interruptible(HZ);
+ rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ needreport = rtst > 0 &&
+ time_after(jiffies, lastreport + rtst);
+ if (needreport)
+ lastreport = jiffies;
+ firstreport = true;
+ WARN_ON(signal_pending(current));
+ list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
+ rcu_tasks_holdout_list) {
+ check_holdout_task(t, needreport, &firstreport);
+ cond_resched();
+ }
+ }
+
+ /*
+ * Because ->on_rq and ->nvcsw are not guaranteed
+ * to have full memory barriers prior to them in the
+ * schedule() path, memory reordering on other CPUs could
+ * cause their RCU-tasks read-side critical sections to
+ * extend past the end of the grace period. However,
+ * because these ->nvcsw updates are carried out with
+ * interrupts disabled, we can use synchronize_sched()
+ * to force the needed ordering on all such CPUs.
+ *
+ * This synchronize_sched() also confines all
+ * ->rcu_tasks_holdout accesses to be within the grace
+ * period, avoiding the need for memory barriers for
+ * ->rcu_tasks_holdout accesses.
+ *
+ * In addition, this synchronize_sched() waits for exiting
+ * tasks to complete their final preempt_disable() region
+ * of execution, cleaning up after the synchronize_srcu()
+ * above.
+ */
+ synchronize_sched();
+
+ /* Invoke the callbacks. */
+ while (list) {
+ next = list->next;
+ local_bh_disable();
+ list->func(list);
+ local_bh_enable();
+ list = next;
+ cond_resched();
+ }
+ schedule_timeout_uninterruptible(HZ/10);
+ }
+}
+
+/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
+static void rcu_spawn_tasks_kthread(void)
+{
+ static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
+ static struct task_struct *rcu_tasks_kthread_ptr;
+ struct task_struct *t;
+
+ if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ smp_mb(); /* Ensure caller sees full kthread. */
+ return;
+ }
+ mutex_lock(&rcu_tasks_kthread_mutex);
+ if (rcu_tasks_kthread_ptr) {
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+ return;
+ }
+ t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+ BUG_ON(IS_ERR(t));
+ smp_mb(); /* Ensure others see full kthread. */
+ ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+}
+
+#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Early boot self test parameters, one for each flavor
+ */
+static bool rcu_self_test;
+static bool rcu_self_test_bh;
+static bool rcu_self_test_sched;
+
+module_param(rcu_self_test, bool, 0444);
+module_param(rcu_self_test_bh, bool, 0444);
+module_param(rcu_self_test_sched, bool, 0444);
+
+static int rcu_self_test_counter;
+
+static void test_callback(struct rcu_head *r)
+{
+ rcu_self_test_counter++;
+ pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
+}
+
+static void early_boot_test_call_rcu(void)
+{
+ static struct rcu_head head;
+
+ call_rcu(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_bh(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_bh(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_sched(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_sched(&head, test_callback);
+}
+
+void rcu_early_boot_tests(void)
+{
+ pr_info("Running RCU self tests\n");
+
+ if (rcu_self_test)
+ early_boot_test_call_rcu();
+ if (rcu_self_test_bh)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
+}
+
+static int rcu_verify_early_boot_tests(void)
+{
+ int ret = 0;
+ int early_boot_test_counter = 0;
+
+ if (rcu_self_test) {
+ early_boot_test_counter++;
+ rcu_barrier();
+ }
+ if (rcu_self_test_bh) {
+ early_boot_test_counter++;
+ rcu_barrier_bh();
+ }
+ if (rcu_self_test_sched) {
+ early_boot_test_counter++;
+ rcu_barrier_sched();
+ }
+
+ if (rcu_self_test_counter != early_boot_test_counter) {
+ WARN_ON(1);
+ ret = -1;
+ }
+
+ return ret;
+}
+late_initcall(rcu_verify_early_boot_tests);
+#else
+void rcu_early_boot_tests(void) {}
+#endif /* CONFIG_PROVE_RCU */
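Assuming this code is built into the usual rcupdate object, the self tests would be selected on the kernel command line via the built-in module-parameter prefix, e.g.:

    rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1

Each selected flavor posts one callback early in boot; the late initcall then forces the corresponding grace period with rcu_barrier*() and warns if the callback count does not match the number of flavors tested.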
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..d20c85d9f8c0 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to restart the system.
+ */
+static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
+
+/**
+ * register_restart_handler - Register function to be called to reset
+ * the system
+ * @nb: Info about handler function to be called
+ * @nb->priority: Handler priority. Handlers should use the
+ * following guidelines when setting priorities:
+ * 0: Restart handler of last resort,
+ * with limited restart capabilities
+ * 128: Default restart handler; use if no other
+ * restart handler is expected to be available,
+ * and/or if restart functionality is
+ * sufficient to restart the entire system
+ * 255: Highest priority restart handler, will
+ * preempt all other restart handlers
+ *
+ * Registers a function to be called in order to restart the
+ * system.
+ *
+ * Registered functions will be called from machine_restart as the last
+ * step of the restart sequence (if the architecture-specific
+ * machine_restart function calls do_kernel_restart - see below
+ * for details).
+ * Registered functions are expected to restart the system immediately.
+ * If more than one function is registered, the restart handler priority
+ * selects which function will be called first.
+ *
+ * Restart handlers are expected to be registered from non-architecture
+ * code, typically from drivers. A typical use case would be a system
+ * where restart functionality is provided through a watchdog. Multiple
+ * restart handlers may exist; for example, one restart handler might
+ * restart the entire system, while another only restarts the CPU.
+ * In such cases, the restart handler which only restarts part of the
+ * hardware is expected to register with low priority to ensure that
+ * it only runs if no other means to restart the system is available.
+ *
+ * Currently always returns zero, as atomic_notifier_chain_register()
+ * always returns zero.
+ */
+int register_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_restart_handler);
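A sketch of a typical registrant, e.g. a SoC watchdog driver (the names and the commented-out register poke are invented for illustration):

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/reboot.h>

    static int wdt_restart_handler(struct notifier_block *nb,
                                   unsigned long mode, void *cmd)
    {
            /*
             * Kick the watchdog into an immediate reset, e.g.:
             * writel(WDT_FORCE_RESET, wdt_base + WDT_CTRL);
             */
            return NOTIFY_DONE;
    }

    static struct notifier_block wdt_restart_nb = {
            .notifier_call = wdt_restart_handler,
            .priority = 128,        /* default: can restart the whole system */
    };

    static int __init wdt_init(void)
    {
            return register_restart_handler(&wdt_restart_nb);
    }

The mode argument receives reboot_mode and cmd the optional command string, as passed by do_kernel_restart() below.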
+
+/**
+ * unregister_restart_handler - Unregister previously registered
+ * restart handler
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered restart handler function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_restart_handler);
+
+/**
+ * do_kernel_restart - Execute kernel restart handler call chain
+ *
+ * Calls functions registered with register_restart_handler.
+ *
+ * Expected to be called from machine_restart as last step of the restart
+ * sequence.
+ *
+ * Restarts the system immediately if a restart handler function has been
+ * registered. Otherwise does nothing.
+ */
+void do_kernel_restart(char *cmd)
+{
+ atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
+}
+
void migrate_to_reboot_cpu(void)
{
/* The boot cpu is always logical cpu 0 */
@@ -306,8 +387,9 @@ void ctrl_alt_del(void)
}
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+static const char reboot_cmd[] = "/sbin/reboot";
-static int __orderly_poweroff(bool force)
+static int run_cmd(const char *cmd)
{
char **argv;
static char *envp[] = {
@@ -316,8 +398,7 @@ static int __orderly_poweroff(bool force)
NULL
};
int ret;
-
- argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ argv = argv_split(GFP_KERNEL, cmd, NULL);
if (argv) {
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
argv_free(argv);
@@ -325,8 +406,33 @@ static int __orderly_poweroff(bool force)
ret = -ENOMEM;
}
+ return ret;
+}
+
+static int __orderly_reboot(void)
+{
+ int ret;
+
+ ret = run_cmd(reboot_cmd);
+
+ if (ret) {
+ pr_warn("Failed to start orderly reboot: forcing the issue\n");
+ emergency_sync();
+ kernel_restart(NULL);
+ }
+
+ return ret;
+}
+
+static int __orderly_poweroff(bool force)
+{
+ int ret;
+
+ ret = run_cmd(poweroff_cmd);
+
if (ret && force) {
pr_warn("Failed to start orderly shutdown: forcing the issue\n");
+
/*
* I guess this should try to kick off some daemon to sync and
* poweroff asap. Or not even bother syncing if we're doing an
@@ -355,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func);
* This may be called from any context to trigger a system shutdown.
* If the orderly shutdown fails, it will force an immediate shutdown.
*/
-int orderly_poweroff(bool force)
+void orderly_poweroff(bool force)
{
if (force) /* do not override the pending "true" */
poweroff_force = true;
schedule_work(&poweroff_work);
- return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
+static void reboot_work_func(struct work_struct *work)
+{
+ __orderly_reboot();
+}
+
+static DECLARE_WORK(reboot_work, reboot_work_func);
+
+/**
+ * orderly_reboot - Trigger an orderly system reboot
+ *
+ * This may be called from any context to trigger a system reboot.
+ * If the orderly reboot fails, it will force an immediate reboot.
+ */
+void orderly_reboot(void)
+{
+ schedule_work(&reboot_work);
+}
+EXPORT_SYMBOL_GPL(orderly_reboot);
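Both entry points only queue work, so they may be called from atomic context; for example, a hypothetical thermal or firmware-fault handler could do:

    if (temp >= critical_trip)
            orderly_poweroff(true);         /* fall back to forced poweroff */
    else if (firmware_requested_reset)
            orderly_reboot();               /* falls back to kernel_restart(NULL) */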
+
static int __init reboot_setup(char *str)
{
for (;;) {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * resource cgroups
- *
- * Copyright 2007 OpenVZ SWsoft Inc
- *
- * Author: Pavel Emelianov <xemul@openvz.org>
- *
- */
-
-#include <linux/types.h>
-#include <linux/parser.h>
-#include <linux/fs.h>
-#include <linux/res_counter.h>
-#include <linux/uaccess.h>
-#include <linux/mm.h>
-
-void res_counter_init(struct res_counter *counter, struct res_counter *parent)
-{
- spin_lock_init(&counter->lock);
- counter->limit = RES_COUNTER_MAX;
- counter->soft_limit = RES_COUNTER_MAX;
- counter->parent = parent;
-}
-
-static u64 res_counter_uncharge_locked(struct res_counter *counter,
- unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
-static int res_counter_charge_locked(struct res_counter *counter,
- unsigned long val, bool force)
-{
- int ret = 0;
-
- if (counter->usage + val > counter->limit) {
- counter->failcnt++;
- ret = -ENOMEM;
- if (!force)
- return ret;
- }
-
- counter->usage += val;
- if (counter->usage > counter->max_usage)
- counter->max_usage = counter->usage;
- return ret;
-}
-
-static int __res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at, bool force)
-{
- int ret, r;
- unsigned long flags;
- struct res_counter *c, *u;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val, force);
- spin_unlock(&c->lock);
- if (r < 0 && !ret) {
- ret = r;
- *limit_fail_at = c;
- if (!force)
- break;
- }
- }
-
- if (ret < 0 && !force) {
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, false);
-}
-
-int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, true);
-}
-
-u64 res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
-{
- unsigned long flags;
- struct res_counter *c;
- u64 ret = 0;
-
- local_irq_save(flags);
- for (c = counter; c != top; c = c->parent) {
- u64 r;
- spin_lock(&c->lock);
- r = res_counter_uncharge_locked(c, val);
- if (c == counter)
- ret = r;
- spin_unlock(&c->lock);
- }
- local_irq_restore(flags);
- return ret;
-}
-
-u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
-{
- return res_counter_uncharge_until(counter, NULL, val);
-}
-
-static inline unsigned long long *
-res_counter_member(struct res_counter *counter, int member)
-{
- switch (member) {
- case RES_USAGE:
- return &counter->usage;
- case RES_MAX_USAGE:
- return &counter->max_usage;
- case RES_LIMIT:
- return &counter->limit;
- case RES_FAILCNT:
- return &counter->failcnt;
- case RES_SOFT_LIMIT:
- return &counter->soft_limit;
- };
-
- BUG();
- return NULL;
-}
-
-ssize_t res_counter_read(struct res_counter *counter, int member,
- const char __user *userbuf, size_t nbytes, loff_t *pos,
- int (*read_strategy)(unsigned long long val, char *st_buf))
-{
- unsigned long long *val;
- char buf[64], *s;
-
- s = buf;
- val = res_counter_member(counter, member);
- if (read_strategy)
- s += read_strategy(*val, s);
- else
- s += sprintf(s, "%llu\n", *val);
- return simple_read_from_buffer((void __user *)userbuf, nbytes,
- pos, buf, s - buf);
-}
-
-#if BITS_PER_LONG == 32
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&counter->lock, flags);
- ret = *res_counter_member(counter, member);
- spin_unlock_irqrestore(&counter->lock, flags);
-
- return ret;
-}
-#else
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- return *res_counter_member(counter, member);
-}
-#endif
-
-int res_counter_memparse_write_strategy(const char *buf,
- unsigned long long *resp)
-{
- char *end;
- unsigned long long res;
-
- /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
- if (*buf == '-') {
- int rc = kstrtoull(buf + 1, 10, &res);
-
- if (rc)
- return rc;
- if (res != 1)
- return -EINVAL;
- *resp = RES_COUNTER_MAX;
- return 0;
- }
-
- res = memparse(buf, &end);
- if (*end != '\0')
- return -EINVAL;
-
- if (PAGE_ALIGN(res) >= res)
- res = PAGE_ALIGN(res);
- else
- res = RES_COUNTER_MAX;
-
- *resp = res;
-
- return 0;
-}
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..90552aab5f2d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/resource_ext.h>
#include <asm/io.h>
@@ -59,10 +60,12 @@ static DEFINE_RWLOCK(resource_lock);
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+static struct resource *next_resource(struct resource *p, bool sibling_only)
{
- struct resource *p = v;
- (*pos)++;
+ /* Caller wants to traverse through siblings only */
+ if (sibling_only)
+ return p->sibling;
+
if (p->child)
return p->child;
while (!p->sibling && p->parent)
@@ -70,6 +73,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
return p->sibling;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ return (void *)next_resource(p, false);
+}
+
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +332,19 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
- * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * Finds the lowest iomem resource existing within [res->start..res->end).
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
+ * This walks the whole tree, not just the first level children,
+ * unless first_level_children_only is true.
*/
-static int find_next_system_ram(struct resource *res, char *name)
+static int find_next_iomem_res(struct resource *res, char *name,
+ bool first_level_children_only)
{
resource_size_t start, end;
struct resource *p;
+ bool sibling_only = false;
BUG_ON(!res);
@@ -339,9 +352,12 @@ static int find_next_system_ram(struct resource *res, char *name)
end = res->end;
BUG_ON(start >= end);
+ if (first_level_children_only)
+ sibling_only = true;
+
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- /* system ram is just marked as IORESOURCE_MEM */
+
+ for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
@@ -353,6 +369,7 @@ static int find_next_system_ram(struct resource *res, char *name)
if ((p->end >= start) && (p->start < end))
break;
}
+
read_unlock(&resource_lock);
if (!p)
return -1;
@@ -365,6 +382,70 @@ static int find_next_system_ram(struct resource *res, char *name)
}
/*
+ * Walks through iomem resources and calls func() with matching resource
+ * ranges. This walks the whole tree, not just the first level children.
+ * All memory ranges that overlap [start, end] and also match flags and
+ * name are valid candidates.
+ *
+ * @name: name of resource
+ * @flags: resource flags
+ * @start: start addr
+ * @end: end addr
+ */
+int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
+ void *arg, int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = flags;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, name, false))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+/*
+ * This function calls the callback against all memory ranges of "System RAM",
+ * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
+ * It is only for "System RAM" and deals with full ranges, not PFNs;
+ * if resources are not PFN-aligned, converting to PFNs can truncate ranges.
+ */
+int walk_system_ram_res(u64 start, u64 end, void *arg,
+ int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, "System RAM", true))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
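For example, a hypothetical caller that sums the System RAM below 4 GiB would pass a callback like this:

    #include <linux/ioport.h>

    static int add_ram_bytes(u64 start, u64 end, void *arg)
    {
            u64 *bytes = arg;

            *bytes += end - start + 1;      /* the range is inclusive */
            return 0;                       /* non-zero would abort the walk */
    }

    static u64 ram_bytes_below_4g(void)
    {
            u64 bytes = 0;

            walk_system_ram_res(0, (1ULL << 32) - 1, &bytes, add_ram_bytes);
            return bytes;
    }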
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+
+/*
 * This function calls the callback against all memory ranges of "System RAM",
 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
* Now, this function is only for "System RAM".
@@ -382,7 +463,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (find_next_system_ram(&res, "System RAM") >= 0)) {
+ (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -411,6 +492,42 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+/*
+ * Search for a resource entry that fully contains the specified region.
+ * If found, return 1 if it is RAM, 0 if not.
+ * If not found, or if the region is not fully contained, return -1.
+ *
+ * Used by the ioremap functions to ensure the user is not remapping RAM; it is
+ * much faster than walking through the resource table page by page.
+ */
+int region_is_ram(resource_size_t start, unsigned long size)
+{
+ struct resource *p;
+ resource_size_t end = start + size - 1;
+ int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ const char *name = "System RAM";
+ int ret = -1;
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ if (end < p->start)
+ continue;
+
+ if (p->start <= start && end <= p->end) {
+ /* resource fully contains region */
+ if ((p->flags != flags) || strcmp(p->name, name))
+ ret = 0;
+ else
+ ret = 1;
+ break;
+ }
+ if (p->end < start)
+ break; /* not found */
+ }
+ read_unlock(&resource_lock);
+ return ret;
+}
+
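A sketch of the intended ioremap-side use, under the assumption that fallback_is_ram() stands in for the old page-by-page check (both names invented):

    static bool foo_may_ioremap(resource_size_t phys, unsigned long size)
    {
            int ram = region_is_ram(phys, size);

            if (ram == 1)
                    return false;   /* region is System RAM */
            if (ram == 0)
                    return true;    /* fully contained and not RAM */
            /* -1: region not fully described; take the slow path */
            return !fallback_is_ram(phys, size);
    }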
void __weak arch_remove_reservations(struct resource *avail)
{
}
@@ -917,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res)
*
* request_region creates a new busy region.
*
- * check_region returns non-zero if the area is already busy.
- *
* release_region releases a matching busy region.
*/
@@ -981,36 +1096,6 @@ struct resource * __request_region(struct resource *parent,
EXPORT_SYMBOL(__request_region);
/**
- * __check_region - check if a resource region is busy or free
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- *
- * Returns 0 if the region is free at the moment it is checked,
- * returns %-EBUSY if the region is busy.
- *
- * NOTE:
- * This function is deprecated because its use is racy.
- * Even if it returns 0, a subsequent call to request_region()
- * may fail because another driver etc. just allocated the region.
- * Do NOT use it. It will be removed from the kernel.
- */
-int __check_region(struct resource *parent, resource_size_t start,
- resource_size_t n)
-{
- struct resource * res;
-
- res = __request_region(parent, start, n, "check-region", 0);
- if (!res)
- return -EBUSY;
-
- release_resource(res);
- free_resource(res);
- return 0;
-}
-EXPORT_SYMBOL(__check_region);
-
-/**
* __release_region - release a previously reserved resource region
* @parent: parent resource descriptor
* @start: resource start address
@@ -1165,6 +1250,76 @@ int release_mem_region_adjustable(struct resource *parent,
/*
* Managed region resource
*/
+static void devm_resource_release(struct device *dev, void *ptr)
+{
+ struct resource **r = ptr;
+
+ release_resource(*r);
+}
+
+/**
+ * devm_request_resource() - request and reserve an I/O or memory resource
+ * @dev: device for which to request the resource
+ * @root: root of the resource tree from which to request the resource
+ * @new: descriptor of the resource to request
+ *
+ * This is a device-managed version of request_resource(). There is usually
+ * no need to release resources requested by this function explicitly since
+ * that will be taken care of when the device is unbound from its driver.
+ * If for some reason the resource needs to be released explicitly, because
+ * of ordering issues for example, drivers must call devm_release_resource()
+ * rather than the regular release_resource().
+ *
+ * When a conflict is detected between any existing resources and the newly
+ * requested resource, an error message will be printed.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int devm_request_resource(struct device *dev, struct resource *root,
+ struct resource *new)
+{
+ struct resource *conflict, **ptr;
+
+ ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ *ptr = new;
+
+ conflict = request_resource_conflict(root, new);
+ if (conflict) {
+ dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
+ new, conflict->name, conflict);
+ devres_free(ptr);
+ return -EBUSY;
+ }
+
+ devres_add(dev, ptr);
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_resource);
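A minimal probe-path sketch (the device, addresses, and names are invented):

    #include <linux/device.h>
    #include <linux/ioport.h>

    static struct resource foo_regs = {
            .name  = "foo-regs",
            .start = 0xfff00000,
            .end   = 0xfff00fff,
            .flags = IORESOURCE_MEM,
    };

    static int foo_probe(struct device *dev)
    {
            int err;

            err = devm_request_resource(dev, &iomem_resource, &foo_regs);
            if (err)
                    return err;     /* conflict was already logged via dev_err() */

            /* ... use the region; it is released when the device unbinds */
            return 0;
    }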
+
+static int devm_resource_match(struct device *dev, void *res, void *data)
+{
+ struct resource **ptr = res;
+
+ return *ptr == data;
+}
+
+/**
+ * devm_release_resource() - release a previously requested resource
+ * @dev: device for which to release the resource
+ * @new: descriptor of the resource to release
+ *
+ * Releases a resource previously requested using devm_request_resource().
+ */
+void devm_release_resource(struct device *dev, struct resource *new)
+{
+ WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
+ new));
+}
+EXPORT_SYMBOL(devm_release_resource);
+
struct region_devres {
struct resource *parent;
resource_size_t start;
@@ -1343,6 +1498,30 @@ int iomem_is_exclusive(u64 addr)
return err;
}
+struct resource_entry *resource_list_create_entry(struct resource *res,
+ size_t extra_size)
+{
+ struct resource_entry *entry;
+
+ entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
+ if (entry) {
+ INIT_LIST_HEAD(&entry->node);
+ entry->res = res ? res : &entry->__res;
+ }
+
+ return entry;
+}
+EXPORT_SYMBOL(resource_list_create_entry);
+
+void resource_list_free(struct list_head *head)
+{
+ struct resource_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, node)
+ resource_list_destroy_entry(entry);
+}
+EXPORT_SYMBOL(resource_list_free);
+
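Used together with the inline list helpers declared in resource_ext.h (resource_list_add_tail() and friends, assumed here), a caller can build and tear down a window list like so (values invented):

    #include <linux/ioport.h>
    #include <linux/resource_ext.h>

    static int build_windows(struct list_head *head)
    {
            struct resource_entry *entry;
            int i;

            for (i = 0; i < 2; i++) {
                    /* res == NULL: entry->res points at the embedded __res */
                    entry = resource_list_create_entry(NULL, 0);
                    if (!entry) {
                            resource_list_free(head);
                            return -ENOMEM;
                    }
                    entry->res->start = 0x100000 * i;
                    entry->res->end   = 0x100000 * i + 0xffff;
                    entry->res->flags = IORESOURCE_MEM;
                    resource_list_add_tail(entry, head);
            }
            return 0;
    }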
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = -pg
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
@@ -148,11 +144,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
goto out;
- t = p;
- do {
+ for_each_thread(p, t)
sched_move_task(t);
- } while_each_thread(p, t);
-
out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3ef6451e972e..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
static inline struct sched_clock_data *this_scd(void)
{
- return &__get_cpu_var(sched_clock_data);
+ return this_cpu_ptr(&sched_clock_data);
}
static inline struct sched_clock_data *cpu_sdc(int cpu)
@@ -420,3 +420,16 @@ u64 local_clock(void)
EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
+ * for IO (which traditionally means blkio only).
*/
void __sched wait_for_completion_io(struct completion *x)
{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
@@ -267,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
unsigned long flags;
int ret = 1;
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!READ_ONCE(x->done))
+ return 0;
+
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
@@ -287,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- unsigned long flags;
- int ret = 1;
+ if (!READ_ONCE(x->done))
+ return false;
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
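The freeing race those barrier comments guard against is easiest to see with an on-stack completion (schematic; start_async_op() is invented):

    #include <linux/completion.h>
    #include <linux/sched.h>

    void waiter(void)
    {
            DECLARE_COMPLETION_ONSTACK(done);

            start_async_op(&done);          /* will call complete(&done) */
            while (!completion_done(&done))
                    cpu_relax();
            /*
             * Returning now destroys 'done'. That is safe only because
             * completion_done() waits for complete() to drop ->wait.lock;
             * without the smp_rmb()/spin_unlock_wait() pair, the completer
             * could still be inside the spinlock in this dead frame.
             */
    }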
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc1638b33449..f9123a82cbb6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#ifdef smp_mb__before_atomic
-void __smp_mb__before_atomic(void)
-{
- smp_mb__before_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__before_atomic);
-#endif
-
-#ifdef smp_mb__after_atomic
-void __smp_mb__after_atomic(void)
-{
- smp_mb__after_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__after_atomic);
-#endif
-
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -135,10 +119,14 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update > 0)
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_skip_update & RQCF_ACT_SKIP)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
@@ -243,6 +231,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int i;
+ struct inode *inode;
if (cnt > 63)
cnt = 63;
@@ -253,7 +242,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ mutex_lock(&inode->i_mutex);
i = sched_feat_set(cmp);
+ mutex_unlock(&inode->i_mutex);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -313,59 +306,8 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
/*
* this_rq_lock - lock this runqueue and disable interrupts.
@@ -442,7 +384,15 @@ static void __hrtick_start(void *arg)
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+ ktime_t time;
+ s64 delta;
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense and can cause timer DoS.
+ */
+ delta = max_t(s64, delay, 10000LL);
+ time = ktime_add_ns(timer->base->get_time(), delta);
hrtimer_set_expires(timer, time);
@@ -485,6 +435,11 @@ static __init void init_hrtick(void)
*/
void hrtick_start(struct rq *rq, u64 delay)
{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -587,30 +542,31 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
/*
- * resched_task - mark a task 'to be rescheduled now'.
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
{
+ struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_held(&rq->lock);
- if (test_tsk_need_resched(p))
+ if (test_tsk_need_resched(curr))
return;
- cpu = task_cpu(p);
+ cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(p);
+ set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(p))
+ if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +579,7 @@ void resched_cpu(int cpu)
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
return;
- resched_task(cpu_curr(cpu));
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -684,10 +640,16 @@ static void wake_up_idle_cpu(int cpu)
static bool wake_up_full_nohz_cpu(int cpu)
{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
- smp_send_reschedule(cpu);
+ tick_nohz_full_kick_cpu(cpu);
return true;
}
@@ -730,18 +692,32 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
- struct rq *rq;
+ /*
+ * FIFO realtime policy runs the highest priority task. Other runnable
+ * tasks are of a lower priority. The scheduler tick does nothing.
+ */
+ if (current->policy == SCHED_FIFO)
+ return true;
- rq = this_rq();
+ /*
+ * Round-robin realtime tasks time slice with other tasks at the same
+ * realtime priority. Is this task the only one at this priority?
+ */
+ if (current->policy == SCHED_RR) {
+ struct sched_rt_entity *rt_se = &current->rt;
- /* Make sure rq->nr_running update is visible after the IPI */
- smp_rmb();
+ return rt_se->run_list.prev == rt_se->run_list.next;
+ }
- /* More than one running task need preemption */
- if (rq->nr_running > 1)
- return false;
+ /*
+ * More than one running task needs preemption.
+ * The nr_running update is assumed to be visible
+ * after the IPI is sent from the wakers.
+ */
+ if (this_rq()->nr_running > 1)
+ return false;
- return true;
+ return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -999,6 +975,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * Can drop rq->lock because sched_class::switched_from() methods may drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1006,6 +985,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possible rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1022,7 +1002,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
- resched_task(rq->curr);
+ resched_curr(rq);
break;
}
}
@@ -1032,8 +1012,15 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
- rq->skip_clock_update = 1;
+ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ rq_clock_skip_update(rq, true);
+}
+
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
}
#ifdef CONFIG_SMP
@@ -1045,7 +1032,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !p->on_rq);
#ifdef CONFIG_LOCKDEP
/*
@@ -1066,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ struct task_migration_notifier tmn;
+
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ tmn.task = p;
+ tmn.from_cpu = task_cpu(p);
+ tmn.to_cpu = new_cpu;
+
+ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
@@ -1077,7 +1072,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
src_rq = task_rq(p);
@@ -1203,7 +1198,7 @@ static int migration_cpu_stop(void *data);
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, on_rq;
+ int running, queued;
unsigned long ncsw;
struct rq *rq;
@@ -1241,7 +1236,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1273,7 +1268,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1398,7 +1393,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1467,7 +1463,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
@@ -1526,7 +1522,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
int ret = 0;
rq = __task_rq_lock(p);
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
@@ -1568,9 +1564,7 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list)
- && !tick_nohz_full_cpu(smp_processor_id())
- && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
return;
/*
@@ -1587,7 +1581,6 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
- tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -1612,6 +1605,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
}
}
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (is_idle_task(rq->curr))
+ smp_send_reschedule(cpu);
+ /* Else the CPU is not idle; do nothing here */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1734,7 +1751,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
@@ -1768,6 +1785,24 @@ int wake_up_state(struct task_struct *p, unsigned int state)
}
/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
+}
+
+/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
@@ -1783,6 +1818,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+ p->se.avg.decay_count = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -1790,11 +1828,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- p->dl.dl_runtime = p->dl.runtime = 0;
- p->dl.dl_deadline = p->dl.deadline = 0;
- p->dl.dl_period = 0;
- p->dl.flags = 0;
+ init_dl_task_timer(&p->dl);
+ __dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -1817,12 +1852,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
+ p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
- INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -1969,6 +2002,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -1977,6 +2012,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -1994,25 +2031,6 @@ static inline int dl_bw_cpus(int i)
}
#endif
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* We must be sure that accepting a new task (or allowing changing the
* parameters of an existing one) is consistent with the bandwidth
@@ -2020,6 +2038,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2087,7 +2108,7 @@ void wake_up_new_task(struct task_struct *p)
init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -2180,7 +2201,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
/**
* finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -2192,10 +2212,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
*/
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
+ struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
@@ -2235,6 +2261,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
tick_nohz_task_switch(current);
+ return rq;
}
#ifdef CONFIG_SMP
@@ -2269,29 +2296,22 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ struct rq *rq;
- /*
- * FIXME: do we need to worry about rq being invalidated by the
- * task_switch?
- */
+ /* finish_task_switch() drops rq->lock and enables preemption */
+ preempt_disable();
+ rq = finish_task_switch(prev);
post_schedule(rq);
-
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
- /* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
-#endif
+
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
*/
-static inline void
+static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -2325,21 +2345,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-#endif
context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
-
barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+
+ return finish_task_switch(prev);
}
/*
@@ -2358,6 +2371,18 @@ unsigned long nr_running(void)
return sum;
}
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+ return cpu_rq(smp_processor_id())->nr_running == 1;
+}
+EXPORT_SYMBOL(single_task_running);
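A representative (hypothetical) consumer is a polling loop that keeps spinning only while it has the CPU to itself:

    while (!event_ready()) {
            if (!single_task_running())
                    break;          /* contention: stop polling and sleep */
            cpu_relax();
    }

Note that the answer concerns the CPU the caller happens to be on, so it is inherently stale unless preemption is disabled or the caller is pinned.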
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -2385,6 +2410,13 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *this = this_rq();
+ *nr_waiters = atomic_read(&this->nr_iowait);
+ *load = this->cpu_load[0];
+}
+
#ifdef CONFIG_SMP
/*
@@ -2422,39 +2454,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- if (task_current(rq, p)) {
- update_rq_clock(rq);
- ns = rq_clock_task(rq) - p->se.exec_start;
- if ((s64)ns < 0)
- ns = 0;
- }
-
- return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns = 0;
-
- rq = task_rq_lock(p, &flags);
- ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2463,7 +2462,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 ns = 0;
+ u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
@@ -2474,13 +2473,24 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we race with it leaving cpu, we'll take a lock. So we're correct.
* If we race with it entering cpu, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
*/
- if (!p->on_cpu)
+ if (!p->on_cpu || !task_on_rq_queued(p))
return p->se.sum_exec_runtime;
#endif
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);
return ns;
@@ -2638,6 +2648,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
*/
static inline void schedule_debug(struct task_struct *prev)
{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+#endif
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path. Otherwise whine
@@ -2727,6 +2740,10 @@ again:
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
*/
static void __sched __schedule(void)
{
@@ -2735,11 +2752,10 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
-need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
prev = rq->curr;
schedule_debug(prev);
@@ -2755,6 +2771,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2779,36 +2797,27 @@ need_resched:
switch_count = &prev->nvcsw;
}
- if (prev->on_rq || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev))
update_rq_clock(rq);
next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->skip_clock_update = 0;
+ rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
++*switch_count;
- context_switch(rq, prev, next); /* unlocks the rq */
- /*
- * The context switch have flipped the stack from under us
- * and restored the local variables which were saved when
- * this task called schedule() in the past. prev == current
- * is still correct, but it can be moved to another cpu/rq.
- */
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
} else
raw_spin_unlock_irq(&rq->lock);
post_schedule(rq);
sched_preempt_enable_no_resched();
- if (need_resched())
- goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2828,7 +2837,9 @@ asmlinkage __visible void __sched schedule(void)
struct task_struct *tsk = current;
sched_submit_work(tsk);
- __schedule();
+ do {
+ __schedule();
+ } while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2840,10 +2851,14 @@ asmlinkage __visible void __sched schedule_user(void)
* or we have been woken up remotely but the IPI has not yet arrived,
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * too frequently to make sense yet.
*/
- user_exit();
+ enum ctx_state prev_state = exception_enter();
schedule();
- user_enter();
+ exception_exit(prev_state);
}
#endif
@@ -2859,6 +2874,21 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -2874,20 +2904,51 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
do {
__preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
/*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
*/
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
} while (need_resched());
}
-NOKPROBE_SYMBOL(preempt_schedule);
-EXPORT_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#endif /* CONFIG_PREEMPT */
/*
@@ -2944,7 +3005,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
@@ -2971,15 +3032,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
}
trace_sched_pi_setprio(p, prio);
- p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
/*
 * Boosting conditions are:
@@ -2991,8 +3051,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
- dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
@@ -3008,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
} else {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
}
@@ -3015,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
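The dequeue/put_prev ... set_curr/enqueue bracket seen here recurs throughout this series (set_user_nice(), __sched_setscheduler(), sched_setnuma(), sched_move_task() and normalize_task() all follow it). Stripped to its skeleton, the pattern is (a sketch, not a verbatim copy of any one call site; enqueue_flags stands in for whatever flags the site needs):

        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
                dequeue_task(rq, p, 0);
        if (running)
                put_prev_task(rq, p);

        /* ... change p's priority, policy or group here ... */

        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
                enqueue_task(rq, p, enqueue_flags);

Taking the task off the runqueue (and off the CPU) first means the class-specific data structures never see a task whose parameters change under them.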
@@ -3026,7 +3089,7 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, on_rq;
+ int old_prio, delta, queued;
unsigned long flags;
struct rq *rq;
@@ -3047,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
@@ -3057,14 +3120,14 @@ void set_user_nice(struct task_struct *p, long nice)
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_task(rq->curr);
+ resched_curr(rq);
}
out_unlock:
task_rq_unlock(rq, p, &flags);
@@ -3192,23 +3255,45 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time, so admission control is
+ * not on the safe side. It does, however, guarantee tasks will never
+ * consume more than promised.
+ */
}
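For the curious: the 0-lag point the comment keeps referring to can be written down. A hypothetical helper (the name dl_zerolag_time is made up here; the formula matches what later kernels use in task_non_contending()):

static inline u64 dl_zerolag_time(const struct sched_dl_entity *dl_se)
{
        /* t_0 = deadline - runtime * period / max_runtime */
        return dl_se->deadline -
               div64_u64(dl_se->runtime * dl_se->dl_period,
                         dl_se->dl_runtime);
}

Since the remaining runtime is non-negative, t_0 never lies after the current deadline, which is exactly why letting the new parameters wait for the current reservation period is pessimistic but safe.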
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
static void __setscheduler_params(struct task_struct *p,
const struct sched_attr *attr)
{
int policy = attr->sched_policy;
- if (policy == -1) /* setparam */
+ if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
@@ -3317,13 +3402,27 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+static bool dl_param_changed(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int retval, oldprio, oldpolicy = -1, queued, running;
int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
@@ -3445,7 +3544,7 @@ recheck:
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
- if (dl_policy(policy))
+ if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
@@ -3520,19 +3619,19 @@ change:
return 0;
}
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
@@ -3557,10 +3656,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
- /*
- * Fixup the legacy SCHED_RESET_ON_FORK hack
- */
- if (policy & SCHED_RESET_ON_FORK) {
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
@@ -3730,7 +3827,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
- return do_sched_setscheduler(pid, -1, param);
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/**
@@ -3958,14 +4055,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_unlock;
+ goto out_free_new_mask;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_unlock;
+ goto out_free_new_mask;
cpuset_cpus_allowed(p, cpus_allowed);
@@ -3978,13 +4075,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
* root_domain.
*/
#ifdef CONFIG_SMP
- if (task_has_dl_policy(p)) {
- const struct cpumask *span = task_rq(p)->rd->span;
-
- if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
retval = -EBUSY;
- goto out_unlock;
+ rcu_read_unlock();
+ goto out_free_new_mask;
}
+ rcu_read_unlock();
}
#endif
again:
@@ -4002,7 +4100,7 @@ again:
goto again;
}
}
-out_unlock:
+out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
@@ -4138,17 +4236,10 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-static void __cond_resched(void)
-{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
int __sched _cond_resched(void)
{
if (should_resched()) {
- __cond_resched();
+ preempt_schedule_common();
return 1;
}
return 0;
@@ -4173,7 +4264,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
- __cond_resched();
+ preempt_schedule_common();
else
cpu_relax();
ret = 1;
@@ -4189,7 +4280,7 @@ int __sched __cond_resched_softirq(void)
if (should_resched()) {
local_bh_enable();
- __cond_resched();
+ preempt_schedule_common();
local_bh_disable();
return 1;
}
@@ -4285,7 +4376,7 @@ again:
* fairness.
*/
if (preempt && rq != p_rq)
- resched_task(p_rq->curr);
+ resched_curr(p_rq);
}
out_unlock:
@@ -4304,36 +4395,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
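With the out-of-line io_schedule() gone, callers presumably go through a thin wrapper; a sketch of what this rework moves into <linux/sched.h> (reconstructed, not quoted from the patch):

static inline void io_schedule(void)
{
        io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}

Funnelling both entry points through io_schedule_timeout() is what lets the in_iowait save/restore and the blk_schedule_flush_plug() recursion guard above cover every caller.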
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -4444,9 +4528,10 @@ void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned state;
+ unsigned long state = p->state;
- state = p->state ? __ffs(p->state) + 1 : 0;
+ if (state)
+ state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
@@ -4463,8 +4548,10 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ ppid = 0;
rcu_read_lock();
- ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
task_pid_nr(p), ppid,
@@ -4486,7 +4573,7 @@ void show_state_filter(unsigned long state_filter)
" task PC stack pid father\n");
#endif
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
@@ -4494,7 +4581,7 @@ void show_state_filter(unsigned long state_filter)
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
touch_all_softlockup_watchdogs();
@@ -4549,7 +4636,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
- idle->on_rq = 1;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4569,10 +4656,115 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ if (!cpumask_weight(cur))
+ return ret;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
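A worked instance of the test above, as a stand-alone user-space sketch (assumptions: the 1 << 20 fixed-point scale mirrors to_ratio(), and the numbers are invented -- 1.5 CPUs' worth of deadline bandwidth reserved, shrinking to a single CPU):

#include <stdio.h>

int main(void)
{
        long long bw = 1 << 20;         /* per-CPU cap: 100% of one CPU */
        long long total_bw = 3LL << 19; /* 1.5 CPUs already reserved */
        int trial_cpus = 1;             /* the proposed, smaller cpuset */

        /* Same condition cpuset_cpumask_can_shrink() evaluates. */
        int ok = !(bw != -1 && bw * trial_cpus < total_bw);

        printf("can shrink: %d\n", ok); /* prints 0: shrink refused */
        return 0;
}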
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
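The admission test above hinges on __dl_overflow(). For context, the helper in this era's kernel/sched/sched.h amounts to the following (reproduced from memory; treat as a sketch):

static inline
bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
{
        return dl_b->bw != -1 &&
               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

That is, with admission control enabled (bw != -1), swapping old_bw for new_bw must keep the total within cpus times the per-CPU cap; task_can_attach() passes old_bw = 0 because the task's bandwidth is not yet accounted in the destination root_domain.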
#ifdef CONFIG_SMP
+/*
+ * move_queued_task - move a queued task to a new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
+ if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -4626,14 +4818,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (p->on_rq) {
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
- }
+ } else if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
out:
task_rq_unlock(rq, p, &flags);
@@ -4654,20 +4847,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- struct rq *rq_dest, *rq_src;
+ struct rq *rq;
int ret = 0;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
- rq_src = cpu_rq(src_cpu);
- rq_dest = cpu_rq(dest_cpu);
+ rq = cpu_rq(src_cpu);
raw_spin_lock(&p->pi_lock);
- double_rq_lock(rq_src, rq_dest);
+ raw_spin_lock(&rq->lock);
/* Already moved. */
if (task_cpu(p) != src_cpu)
goto done;
+
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
goto fail;
@@ -4676,16 +4869,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->on_rq) {
- dequeue_task(rq_src, p, 0);
- set_task_cpu(p, dest_cpu);
- enqueue_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p, 0);
- }
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
done:
ret = 1;
fail:
- double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -4717,22 +4906,22 @@ void sched_setnuma(struct task_struct *p, int nid)
{
struct rq *rq;
unsigned long flags;
- bool on_rq, running;
+ bool queued, running;
rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
- p->sched_class->put_prev_task(rq, p);
+ put_prev_task(rq, p);
p->numa_preferred_nid = nid;
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
@@ -4752,6 +4941,12 @@ static int migration_cpu_stop(void *data)
* be on another cpu but it doesn't matter.
*/
local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
local_irq_enable();
return 0;
@@ -5160,31 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- unsigned long flags;
- long cpu = (long)hcpu;
-
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active(cpu, false);
-
- /* explicitly allow suspend */
- if (!(action & CPU_TASKS_FROZEN)) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
- bool overflow;
- int cpus;
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- if (overflow)
- return notifier_from_errno(-EBUSY);
- }
+ set_cpu_active((long)hcpu, false);
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
}
-
- return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5232,9 +5409,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
- char str[256];
- cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5247,7 +5422,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n", str, sd->name);
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5266,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5292,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
@@ -5650,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
@@ -5720,7 +5881,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
- struct sched_domain *child;
+ struct sched_domain *sibling;
int i;
cpumask_clear(covered);
@@ -5731,10 +5892,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
- child = *per_cpu_ptr(sdd->sd, i);
+ sibling = *per_cpu_ptr(sdd->sd, i);
/* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5744,10 +5905,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
- if (child->child) {
- child = child->child;
- cpumask_copy(sg_span, sched_domain_span(child));
- } else
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
cpumask_set_cpu(i, sg_span);
cpumask_or(covered, covered, sg_span);
@@ -5762,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5985,7 +6144,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
@@ -6071,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6157,7 +6319,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
}
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
{
int i;
@@ -6172,6 +6334,56 @@ static bool find_numa_distance(int distance)
return false;
}
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ * - Otherwise (no such intermediary node exists), the system is a
+ * backplane topology.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
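The classification rules are easy to sanity-check against a toy distance table. A self-contained user-space sketch (everything here is invented: a 4-node ring where LOCAL = 10, one hop = 20, two hops = 30; the kernel's DIRECT test differs in detail, this one simply asks whether any distance exceeds one hop):

#include <stdio.h>

#define N 4

static const int dist[N][N] = {
        { 10, 20, 30, 20 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 20, 30, 20, 10 },
};

int main(void)
{
        int a, b, c, max = 0;

        for (a = 0; a < N; a++)
                for (b = 0; b < N; b++)
                        if (dist[a][b] > max)
                                max = dist[a][b];

        if (max <= 20) {        /* nothing beyond one hop */
                puts("NUMA_DIRECT");
                return 0;
        }

        for (a = 0; a < N; a++) {
                for (b = 0; b < N; b++) {
                        if (dist[a][b] < max)
                                continue;
                        /* a and b are maximally distant; look for a C. */
                        for (c = 0; c < N; c++) {
                                if (dist[a][c] < max && dist[b][c] < max) {
                                        puts("NUMA_GLUELESS_MESH");
                                        return 0;
                                }
                        }
                        puts("NUMA_BACKPLANE");
                        return 0;
                }
        }
        return 0;
}

For the ring above, nodes 0 and 2 are two hops apart but node 1 is one hop from both, so it prints NUMA_GLUELESS_MESH.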
@@ -6225,6 +6437,10 @@ static void sched_init_numa(void)
if (!sched_debug())
break;
}
+
+ if (!level)
+ return;
+
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
@@ -6304,6 +6520,9 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
}
static void sched_domains_numa_masks_set(int cpu)
@@ -6465,6 +6684,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
sd->child = child;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
}
set_domain_attribute(sd, attr);
@@ -6765,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
*/
case CPU_ONLINE:
- case CPU_DOWN_FAILED:
cpuset_update_active_cpus(true);
break;
default:
@@ -6777,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action) {
+ unsigned long flags;
+ long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
@@ -6865,9 +7119,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
- alloc_size += num_possible_cpus() * cpumask_size();
-#endif
if (alloc_size) {
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -6887,13 +7138,13 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
+ }
#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -6926,8 +7177,8 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
- init_rt_rq(&rq->rt, rq);
- init_dl_rq(&rq->dl, rq);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6967,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7005,6 +7256,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7014,11 +7270,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7042,6 +7293,24 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7061,6 +7330,9 @@ void __might_sleep(const char *file, int line, int preempt_offset)
in_atomic(), irqs_disabled(),
current->pid, current->comm);
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
@@ -7073,7 +7345,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
#endif
dump_stack();
}
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
#endif
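The WARN_ONCE added to __might_sleep() above targets a specific anti-pattern; a minimal sketch of the kind of caller it flags (done, some_mutex and do_something() are hypothetical):

        set_current_state(TASK_UNINTERRUPTIBLE);

        if (!READ_ONCE(done)) {
                /*
                 * mutex_lock() may block; when it wakes up again the
                 * task is TASK_RUNNING, silently destroying the state
                 * set above -- so the schedule() below may not sleep
                 * at all. This is what the new warning catches.
                 */
                mutex_lock(&some_mutex);
                do_something();
                mutex_unlock(&some_mutex);
        }
        schedule();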
#ifdef CONFIG_MAGIC_SYSRQ
@@ -7084,15 +7356,15 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
.sched_policy = SCHED_NORMAL,
};
int old_prio = p->prio;
- int on_rq;
+ int queued;
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
__setscheduler(rq, p, &attr);
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
- resched_task(rq->curr);
+ resched_curr(rq);
}
check_class_changed(rq, p, prev_class, old_prio);
@@ -7104,12 +7376,12 @@ void normalize_rt_tasks(void)
unsigned long flags;
struct rq *rq;
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, p) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
/*
* Only normalize user tasks:
*/
- if (!p->mm)
+ if (p->flags & PF_KTHREAD)
continue;
p->se.exec_start = 0;
@@ -7124,21 +7396,16 @@ void normalize_rt_tasks(void)
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (task_nice(p) < 0 && p->mm)
+ if (task_nice(p) < 0)
set_user_nice(p, 0);
continue;
}
- raw_spin_lock(&p->pi_lock);
- rq = __task_rq_lock(p);
-
+ rq = task_rq_lock(p, &flags);
normalize_task(rq, p);
-
- __task_rq_unlock(rq);
- raw_spin_unlock(&p->pi_lock);
- } while_each_thread(g, p);
-
- read_unlock_irqrestore(&tasklist_lock, flags);
+ task_rq_unlock(rq, p, &flags);
+ }
+ read_unlock(&tasklist_lock);
}
#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7278,36 +7545,40 @@ void sched_offline_group(struct task_group *tg)
void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
- int on_rq, running;
+ int queued, running;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->on_rq;
+ queued = task_on_rq_queued(tsk);
- if (on_rq)
+ if (queued)
dequeue_task(rq, tsk, 0);
if (unlikely(running))
- tsk->sched_class->put_prev_task(rq, tsk);
+ put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgrp_id,
- lockdep_is_held(&tsk->sighand->siglock)),
+ /*
+ * All callers are synchronized by task_rq_lock(); RCU read locking
+ * would be pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, on_rq);
+ tsk->sched_class->task_move_group(tsk, queued);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, tsk, &flags);
@@ -7325,10 +7596,16 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
- do_each_thread(g, p) {
- if (rt_task(p) && task_rq(p)->rt.tg == tg)
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
return 1;
- } while_each_thread(g, p);
+ }
return 0;
}
@@ -7417,6 +7694,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD; it would disallow the
+ * kernel creating (and/or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7473,9 +7761,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -7532,11 +7817,12 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
int cpu, ret = 0;
unsigned long flags;
@@ -7550,13 +7836,16 @@ static int sched_dl_global_constraints(void)
* solutions is welcome!
*/
for_each_possible_cpu(cpu) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
if (new_bw < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
if (ret)
break;
}
@@ -7567,6 +7856,7 @@ static int sched_dl_global_constraints(void)
static void sched_dl_do_global(void)
{
u64 new_bw = -1;
+ struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -7580,11 +7870,14 @@ static void sched_dl_do_global(void)
* FIXME: As above...
*/
for_each_possible_cpu(cpu) {
- struct dl_bw *dl_b = dl_bw_of(cpu);
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
dl_b->bw = new_bw;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
}
}
@@ -7625,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
+ ret = sched_dl_global_validate();
if (ret)
goto undo;
- ret = sched_dl_global_constraints();
+ ret = sched_rt_global_constraints();
if (ret)
goto undo;
@@ -7714,6 +8007,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -7803,6 +8101,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;
+ /*
+ * Prevent race between setting of cfs_rq->runtime_enabled and
+ * unthrottle_offline_cfs_rqs().
+ */
+ get_online_cpus();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -7828,7 +8131,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
}
raw_spin_unlock_irq(&cfs_b->lock);
- for_each_possible_cpu(i) {
+ for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
@@ -7844,6 +8147,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
+ put_online_cpus();
return ret;
}
@@ -7959,7 +8263,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
quota = normalize_cfs_quota(tg, d);
- parent_quota = parent_b->hierarchal_quota;
+ parent_quota = parent_b->hierarchical_quota;
/*
* ensure max(child_quota) <= parent_quota, inherit when no
@@ -7970,7 +8274,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
else if (parent_quota != RUNTIME_INF && quota > parent_quota)
return -EINVAL;
}
- cfs_b->hierarchal_quota = quota;
+ cfs_b->hierarchical_quota = quota;
return 0;
}
@@ -8080,10 +8384,11 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .base_cftypes = cpu_files,
+ .legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .base_cftypes = files,
+ .legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, cp->free_cpus,
- &p->cpus_allowed) && cpumask_and(later_mask,
- later_mask, cpu_active_mask)) {
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -188,6 +187,26 @@ out:
}
/*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
@@ -205,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
if (!cp->elements)
return -ENOMEM;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
kfree(cp->elements);
return -ENOMEM;
}
@@ -213,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
for_each_possible_cpu(i)
cp->elements[i].idx = IDX_INVALID;
- cpumask_setall(cp->free_cpus);
-
return 0;
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,10 +24,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
struct signal_struct *sig = tsk->signal;
cputime_t utime, stime;
struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
+ unsigned int seq, nextseq;
+ unsigned long flags;
rcu_read_lock();
- /* make sure we can trust tsk->thread_group list */
- if (!likely(pid_alive(tsk)))
- goto out;
-
- t = tsk;
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
do {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- } while_each_thread(tsk, t);
-out:
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
rcu_read_unlock();
}
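Stripped of the cputime specifics, the loop above is the stock read_seqbegin_or_lock() idiom: a lockless first pass, and a locked second pass only if a writer moved the sequence underneath us. The generic shape (a sketch; lock is a seqlock_t guarding the data being copied):

        int seq, nextseq = 0;

        do {
                seq = nextseq;
                read_seqbegin_or_lock(&lock, &seq);

                /* ... copy the protected data into locals ... */

                nextseq = 1;    /* if we retry, take the lock this time */
        } while (need_seqretry(&lock, seq));
        done_seqretry(&lock, seq);

The _irqsave/_irqrestore variants used in thread_group_cputime() are the same pattern with interrupts disabled across the locked pass.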
@@ -550,6 +555,23 @@ drop_precision:
}
/*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated the counter in the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+ cputime_t old;
+
+ while (new > (old = ACCESS_ONCE(*counter)))
+ cmpxchg_cputime(counter, old, new);
+}
+
+/*
* Adjust tick based cputime random precision against scheduler
* runtime accounting.
*/
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
utime = rtime - stime;
}
- /*
- * If the tick based count grows faster than the scheduler one,
- * the result of the scaling may go backward.
- * Let's enforce monotonicity.
- */
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
+ cputime_advance(&prev->stime, stime);
+ cputime_advance(&prev->utime, utime);
out:
*ut = prev->utime;
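cputime_advance() is an instance of a general lockless pattern: advance a shared counter monotonically with a compare-and-swap loop. A stand-alone C11 analogue (user-space sketch, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

/* Advance *counter to new_val, but never move it backwards. */
static void monotonic_advance(_Atomic unsigned long *counter,
                              unsigned long new_val)
{
        unsigned long old = atomic_load(counter);

        while (new_val > old) {
                /* On failure 'old' is refreshed with the current value. */
                if (atomic_compare_exchange_weak(counter, &old, new_val))
                        break;
        }
}

int main(void)
{
        _Atomic unsigned long c = 100;

        monotonic_advance(&c, 150);             /* moves forward */
        monotonic_advance(&c, 120);             /* no-op: would go back */
        printf("%lu\n", atomic_load(&c));       /* prints 150 */
        return 0;
}

The kernel version uses cmpxchg_cputime() instead of C11 atomics, but the invariant is the same: concurrent callers may race, yet the counter only ever moves forward.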
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
-/*
- * Must be called with siglock held.
- */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
dl_b->total_bw = 0;
}
-void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->rb_root = RB_ROOT;
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
rq->post_schedule = has_pushable_dl_tasks(rq);
}
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online cpu.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Failed to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, ENQUEUE_REPLENISH);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+}
+
#else
static inline
@@ -306,7 +352,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* the overrunning entity can't interfere with other entity in the system and
* can't make them miss their deadlines. Reasons why this kind of overruns
* could happen are, typically, a entity voluntarily trying to overcome its
- * runtime, or it just underestimated it during sched_setscheduler_ex().
+ * runtime, or it just underestimated it during sched_setattr().
*/
static void replenish_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -350,6 +396,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
}
/*
@@ -506,47 +557,76 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(p, &flags);
/*
- * We need to take care of a possible races here. In fact, the
- * task might have changed its scheduling policy to something
- * different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setattr()).
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different than SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all these cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
*/
- if (!dl_task(p) || dl_se->dl_new)
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
update_rq_clock(rq);
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- if (p->on_rq) {
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_task(rq->curr);
+
#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
- */
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
+ /*
+ * If we find that the rq the task was on is no longer
+ * available, we need to select a new rq.
+ */
+ if (unlikely(!rq->online)) {
+ dl_task_offline_migration(rq, p);
+ goto unlock;
+ }
#endif
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
}
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, p, &flags);
return HRTIMER_NORESTART;
}
@@ -555,11 +635,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -567,24 +642,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
static
int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
{
- int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
- int rorun = dl_se->runtime <= 0;
-
- if (!rorun && !dmiss)
- return 0;
-
- /*
- * If we are beyond our current deadline and we are still
- * executing, then we have already used some of the runtime of
- * the next instance. Thus, if we do not account that, we are
- * stealing bandwidth from the system at each deadline miss!
- */
- if (dmiss) {
- dl_se->runtime = rorun ? dl_se->runtime : 0;
- dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
- }
-
- return 1;
+ return (dl_se->runtime <= 0);
}
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -625,16 +683,15 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
- dl_se->dl_throttled = 1;
- else
+ if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
- resched_task(curr);
+ resched_curr(rq);
}
/*
@@ -823,10 +880,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
- replenish_dl_entity(dl_se, pi_se);
- else
+ if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
+ else if (flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
__enqueue_dl_entity(dl_se);
}
@@ -847,8 +904,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceeds its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -856,7 +924,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
*/
- if (p->dl.dl_throttled)
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -901,7 +969,14 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -914,7 +989,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (sd_flag != SD_BALANCE_WAKE)
goto out;
rq = cpu_rq(cpu);
@@ -964,7 +1039,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
- resched_task(rq->curr);
+ resched_curr(rq);
}
static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +1054,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -997,10 +1072,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SCHED_HRTICK
static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
- s64 delta = p->dl.dl_runtime - p->dl.runtime;
-
- if (delta > 10000)
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, p->dl.runtime);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
}
#endif
@@ -1030,7 +1106,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
- if (rq->stop && rq->stop->on_rq)
+ if (rq->stop && task_on_rq_queued(rq->stop))
return RETRY_TASK;
}
@@ -1055,10 +1131,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
/* Running task will never be pushed. */
dequeue_pushable_dl_task(rq, p);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
-#endif
set_post_schedule(rq);
@@ -1077,10 +1151,14 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
-#ifdef CONFIG_SCHED_HRTICK
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
-#endif
}
static void task_fork_dl(struct task_struct *p)
@@ -1100,6 +1178,7 @@ static void task_dead_dl(struct task_struct *p)
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
@@ -1124,10 +1203,8 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
- (p->nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
-
return 0;
}
@@ -1158,7 +1235,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
{
struct sched_domain *sd;
- struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
+ struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
int best_cpu, cpu = task_cpu(task);
@@ -1169,6 +1246,10 @@ static int find_later_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1;
+ /*
+ * We have to consider system topology and task affinity
+ * first, then we can look for a suitable cpu.
+ */
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
task, later_mask);
if (best_cpu == -1)
@@ -1257,7 +1338,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
- task_running(rq, task) || !task->on_rq)) {
+ task_running(rq, task) ||
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1296,7 +1378,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
return p;
@@ -1311,6 +1393,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ int ret = 0;
if (!rq->dl.overloaded)
return 0;
@@ -1333,7 +1416,7 @@ retry:
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1356,7 +1439,6 @@ retry:
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
*/
- dequeue_pushable_dl_task(rq, next_task);
goto out;
}
@@ -1372,15 +1454,16 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ ret = 1;
- resched_task(later_rq->curr);
+ resched_curr(later_rq);
double_unlock_balance(rq, later_rq);
out:
put_task_struct(next_task);
- return 1;
+ return ret;
}
static void push_dl_tasks(struct rq *rq)
@@ -1443,7 +1526,7 @@ static int pull_dl_task(struct rq *this_rq)
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* Then we pull iff p actually has an earlier
@@ -1486,7 +1569,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
- dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
}
@@ -1495,10 +1578,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq;
+ struct root_domain *src_rd;
int weight;
BUG_ON(!dl_task(p));
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporarily fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
/*
* Update only if the task is actually running (i.e.,
* it is on the rq AND it is not throttled).
@@ -1515,8 +1621,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
- rq = task_rq(p);
-
/*
* The process used to be able to migrate OR it can now migrate
*/
@@ -1540,6 +1644,7 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
}
@@ -1551,6 +1656,7 @@ static void rq_offline_dl(struct rq *rq)
dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
void init_sched_dl_class(void)
@@ -1564,20 +1670,48 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * Our caller must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ /* XXX we should retain the bw until 0-lag */
+ cancel_dl_timer(rq, p);
+ __dl_clear_params(p);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
@@ -1588,22 +1722,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
- /*
- * If p is throttled, don't consider the possibility
- * of preempting rq->curr, the check will be done right
- * after its runtime will get replenished.
- */
- if (unlikely(p->dl.dl_throttled))
- return;
-
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
- if (check_resched && task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
}
}
@@ -1614,7 +1745,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (p->on_rq || rq->curr == p) {
+ if (task_on_rq_queued(p) || rq->curr == p) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
@@ -1632,14 +1763,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/*
* Again, we don't know if p has an earlier
* or later deadline, so let's blindly set a
* (maybe not needed) rescheduling point.
*/
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else
switched_to_dl(rq, p);
@@ -1673,4 +1804,15 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
if (!se) {
struct sched_avg *avg = &cpu_rq(cpu)->avg;
P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ P(avg->avg_period);
return;
}
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
+ P(se->avg.running_avg_sum);
+ P(se->avg.avg_period);
P(se->avg.load_avg_contrib);
+ P(se->avg.utilization_avg_contrib);
P(se->avg.decay_count);
#endif
#undef PN
@@ -150,7 +152,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- unsigned long flags;
SEQ_printf(m,
"\nrunnable tasks:\n"
@@ -159,16 +160,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
"------------------------------------------------------"
"----------------------------------------------------\n");
- read_lock_irqsave(&tasklist_lock, flags);
-
- do_each_thread(g, p) {
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
- } while_each_thread(g, p);
-
- read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ rcu_read_unlock();
}
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -217,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
+ cfs_rq->utilization_load_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
@@ -264,6 +265,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
@@ -302,6 +309,7 @@ do { \
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
+ PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
@@ -332,10 +340,9 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
- rcu_read_lock();
print_rq(m, rq, cpu);
- rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
@@ -533,8 +540,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults_memory)
- nr_faults = p->numa_faults_memory[2*node + i];
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
@@ -633,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
+ P(se.avg.running_avg_sum);
+ P(se.avg.avg_period);
P(se.avg.load_avg_contrib);
+ P(se.avg.utilization_avg_contrib);
P(se.avg.decay_count);
#endif
P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
+#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
@@ -665,20 +666,22 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
/* Seed a new task's runnable averages so its load looks heavy at start */
void init_task_runnable_average(struct task_struct *p)
{
u32 slice;
- p->se.avg.decay_count = 0;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+ p->se.avg.avg_period = slice;
__update_task_entity_contrib(&p->se);
+ __update_task_entity_utilization(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
@@ -724,6 +727,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -826,11 +834,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -865,7 +874,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;
struct rcu_head rcu;
nodemask_t active_nodes;
@@ -893,18 +901,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
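The consolidated indexing is easier to see with concrete numbers. Below is a minimal standalone model of the flattened numa_faults layout; the enum ordering (NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF) and nr_node_ids = 4 are illustrative assumptions, not taken from this diff.

#include <stdio.h>

/* Assumed stat ordering; averaged stats first, per-scan buffers second. */
enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };
#define NR_NUMA_HINT_FAULT_TYPES 2      /* private = 0, shared = 1 */

static int nr_node_ids = 4;             /* hypothetical 4-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
        /* Averaged stats land in [0..15], the buffers in [16..31]. */
        printf("%d\n", task_faults_idx(NUMA_MEM, 2, 1));    /* 5  */
        printf("%d\n", task_faults_idx(NUMA_MEMBUF, 2, 1)); /* 21 */
        return 0;
}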
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -912,14 +926,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
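For the glueless-mesh case the linear distance scaling deserves a quick numeric check. A standalone sketch, assuming LOCAL_DISTANCE = 10 and sched_max_numa_distance = 40 (typical values, not taken from this diff):

#include <stdio.h>

#define LOCAL_DISTANCE          10      /* assumed */
#define MAX_NUMA_DISTANCE       40      /* assumed sched_max_numa_distance */

/* Scale a node's faults by how far away it is; nearer counts for more. */
static unsigned long scaled_faults(unsigned long faults, int dist)
{
        faults *= (MAX_NUMA_DISTANCE - dist);
        faults /= (MAX_NUMA_DISTANCE - LOCAL_DISTANCE);
        return faults;
}

int main(void)
{
        printf("%lu\n", scaled_faults(900, 10));  /* 900: full weight */
        printf("%lu\n", scaled_faults(900, 20));  /* 600: two thirds  */
        printf("%lu\n", scaled_faults(900, 30));  /* 300: one third   */
        return 0;
}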
/*
@@ -928,11 +1007,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
@@ -940,15 +1020,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1038,7 +1132,8 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int cpu, cpus = 0;
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,9 +1157,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
if (!cpus)
return;
- ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
- ns->task_capacity =
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ ns->task_capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
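The smt estimate above is dense, so here is a worked example. Assume 8 logical CPUs in an SMT-2 node, each reporting capacity 589 out of SCHED_CAPACITY_SCALE = 1024 (illustrative numbers only):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

int main(void)
{
        long cpus = 8, compute_capacity = 8 * 589;      /* = 4712 */
        long smt, capacity, task_capacity;

        /* smt := ceil(cpus / capacity-in-cores); lands on 2 for SMT-2 */
        smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity);
        capacity = cpus / smt;                          /* 4 cores */

        task_capacity = DIV_ROUND_CLOSEST(compute_capacity,
                                          SCHED_CAPACITY_SCALE); /* 5 */
        if (capacity < task_capacity)
                task_capacity = capacity;               /* min() -> 4 */

        printf("smt=%ld cores=%ld task_capacity=%ld\n",
               smt, capacity, task_capacity);           /* 2 4 4 */
        return 0;
}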
@@ -1077,6 +1175,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
@@ -1096,32 +1195,59 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
- long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
- long imb, old_imb;
+ long src_capacity, dst_capacity;
+ long orig_src_load;
+ long load_a, load_b;
+ long moved_load;
+ long imb;
+
+ /*
+ * The load is corrected for the CPU capacity available on each node.
+ *
+ * src_load dst_load
+ * ------------ vs ---------
+ * src_capacity dst_capacity
+ */
+ src_capacity = env->src_stats.compute_capacity;
+ dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- if (dst_load < src_load)
- swap(dst_load, src_load);
+ load_a = dst_load;
+ load_b = src_load;
+ if (load_a < load_b)
+ swap(load_a, load_b);
/* Is the difference below the threshold? */
- imb = dst_load * 100 - src_load * env->imbalance_pct;
+ imb = load_a * src_capacity * 100 -
+ load_b * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Compare it with the old imbalance.
+ * Allow a move that brings us closer to a balanced situation,
+ * without moving things past the point of balance.
*/
- if (orig_dst_load < orig_src_load)
- swap(orig_dst_load, orig_src_load);
+ orig_src_load = env->src_stats.load;
- old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+ /*
+ * In a task swap, there will be one load moving from src to dst,
+ * and another moving back. This is the net sum of both moves.
+ * A simple task move will always have a positive value.
+ * Allow the move if it brings the system closer to a balanced
+ * situation, without crossing over the balance point.
+ */
+ moved_load = orig_src_load - src_load;
- /* Would this change make things worse? */
- return (imb > old_imb);
+ if (moved_load > 0)
+ /* Moving src -> dst. Did we overshoot balance? */
+ return src_load * dst_capacity < dst_load * src_capacity;
+ else
+ /* Moving dst -> src. Did we overshoot balance? */
+ return dst_load * src_capacity < src_load * dst_capacity;
}
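The cross-multiplied comparison avoids divisions while still normalising each load by its node's capacity. A standalone check with assumed numbers shows why the capacity terms matter: the raw loads look imbalanced, but the bigger node is in fact the less loaded one.

#include <stdio.h>

int main(void)
{
        long src_load = 1000, dst_load = 1300;
        long src_capacity = 1024, dst_capacity = 2048;  /* dst node is bigger */
        int imbalance_pct = 125;
        long load_a = dst_load, load_b = src_load;      /* load_a = larger    */
        long naive, corrected;

        naive = load_a * 100 - load_b * imbalance_pct;
        corrected = load_a * src_capacity * 100 -
                    load_b * dst_capacity * imbalance_pct;

        printf("naive=%ld corrected=%ld\n", naive, corrected);
        /* naive = 5000 (> 0: "imbalanced"), corrected < 0: actually fine */
        return 0;
}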
/*
@@ -1136,15 +1262,33 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
- long orig_src_load, src_load;
- long orig_dst_load, dst_load;
+ long src_load, dst_load;
long load;
- long imp = (groupimp > 0) ? groupimp : taskimp;
+ long imp = env->p->numa_group ? groupimp : taskimp;
+ long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end up selecting ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
/*
* "imp" is the fault differential for the source task between the
@@ -1163,8 +1307,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1177,26 +1321,21 @@ static void task_numa_compare(struct task_numa_env *env,
* itself (not part of a group), use the task weight
* instead.
*/
- if (env->p->numa_group)
- imp = groupimp;
- else
- imp = taskimp;
-
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
- if (imp < env->best_imp)
+ if (imp <= env->best_imp && moveimp <= env->best_imp)
goto unlock;
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_free_capacity &&
+ if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
@@ -1204,20 +1343,34 @@ static void task_numa_compare(struct task_numa_env *env,
}
/* Balance doesn't matter much if we're running a task per cpu */
- if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ if (imp > env->best_imp && src_rq->nr_running == 1 &&
+ dst_rq->nr_running == 1)
goto assign;
/*
* In the overloaded case, try and keep the load balanced.
*/
balance:
- orig_dst_load = env->dst_stats.load;
- orig_src_load = env->src_stats.load;
-
- /* XXX missing capacity terms */
load = task_h_load(env->p);
- dst_load = orig_dst_load + load;
- src_load = orig_src_load - load;
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ if (moveimp > imp && moveimp > env->best_imp) {
+ /*
+ * If the improvement from just moving env->p is
+ * better than swapping tasks around, check if a move is
+ * possible. Store a slightly smaller score than moveimp,
+ * so an actually idle CPU will win.
+ */
+ if (!load_too_imbalanced(src_load, dst_load, env)) {
+ imp = moveimp - 1;
+ cur = NULL;
+ goto assign;
+ }
+ }
+
+ if (imp <= env->best_imp)
+ goto unlock;
if (cur) {
load = task_h_load(cur);
@@ -1225,10 +1378,16 @@ balance:
src_load += load;
}
- if (load_too_imbalanced(orig_src_load, orig_dst_load,
- src_load, dst_load, env))
+ if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
+ /*
+ * One idle CPU per node is evaluated for a task numa move.
+ * Call select_idle_sibling to maybe find a better one.
+ */
+ if (!cur)
+ env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
assign:
task_numa_assign(env, cur, imp);
unlock:
@@ -1266,7 +1425,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
@@ -1294,40 +1453,51 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has free capacity, try to use it. */
- if (env.dst_stats.has_free_capacity)
- task_numa_find_cpu(&env, taskimp, groupimp);
+ /* Try to find a spot on the preferred nid. */
+ task_numa_find_cpu(&env, taskimp, groupimp);
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
- /* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
- return -EAGAIN;
-
/*
* If the task is part of a workload that spans multiple NUMA nodes,
* and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1506,19 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
- sched_setnuma(p, env.dst_nid);
+ if (p->numa_group) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+ nid = env.dst_nid;
+
+ if (node_isset(nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
/*
* Reset the scan period if the task is being rescheduled on an
@@ -1365,7 +1546,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1415,12 +1596,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
*/
#define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
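With NUMA_PERIOD_SLOTS = 10, the new threshold of 7 translates directly into the "70% local" target: the scan period only backs off once at least 7 of every 10 hinting faults are local. A one-line check with assumed fault counts:

#include <stdio.h>

int main(void)
{
        int local = 140, remote = 60;   /* assumed per-window fault counts */
        int slots = (local * 10) / (local + remote);

        /* 7 of 10 slots are local => at the threshold: slow scanning down */
        printf("local slots: %d of 10\n", slots);       /* 7 */
        return 0;
}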
/*
* Increase the scan period (slow down scanning) if the majority of
@@ -1441,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no recorded hinting faults then either the task is
* completely idle or all activity is in areas that are not of interest
- * to automatic numa balancing. Scan slower
+ * to automatic numa balancing. Related to that, if there were failed
+ * migrations then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower
*/
- if (local + shared == 0) {
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
@@ -1477,7 +1660,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
@@ -1505,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ *period = p->se.avg.avg_period;
}
p->last_sum_exec_runtime = runtime;
@@ -1514,6 +1697,94 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group = NODE_MASK_NONE;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
+ nodes = max_group;
+ }
+ return nid;
+}
+
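The backplane search above is the subtlest part of this hunk, so here is a small standalone model of it. The 4-node distance table and fault counts are made up; the loop mirrors the structure of preferred_group_nid(): partition the candidate nodes into groups of mutual distance < dist, keep the group with the most faults, and recurse into it with a smaller distance until one node is left.

#include <stdio.h>

#define NNODES          4
#define LOCAL_DISTANCE  10

/* Hypothetical topology: {0,1} and {2,3} are close pairs, pairs are far. */
static const int dist_tbl[NNODES][NNODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};
static const unsigned long faults[NNODES] = { 10, 30, 25, 20 };

int main(void)
{
        unsigned int nodes = 0xf;       /* candidate mask, all nodes online */
        int nid = 0, dist;

        for (dist = 40; dist > LOCAL_DISTANCE; dist--) {
                unsigned long max_faults = 0;
                unsigned int max_group = 0, scan = nodes;
                int a, b;

                for (a = 0; a < NNODES; a++) {
                        unsigned long f = 0;
                        unsigned int this_group = 0;

                        if (!(scan & (1u << a)))
                                continue;       /* already grouped */
                        for (b = 0; b < NNODES; b++) {
                                if ((scan & (1u << b)) &&
                                    dist_tbl[a][b] < dist) {
                                        f += faults[b];
                                        this_group |= 1u << b;
                                        scan &= ~(1u << b);
                                }
                        }
                        if (f > max_faults) {
                                max_faults = f;
                                max_group = this_group;
                                nid = a;        /* winner at this distance */
                        }
                }
                if (max_faults)
                        nodes = max_group;      /* recurse into top group */
        }
        /* {2,3} (45 faults) beats {0,1} (40), then node 2 beats node 3. */
        printf("preferred nid: %d\n", nid);     /* 2 */
        return 0;
}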
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1541,18 +1812,23 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
@@ -1562,21 +1838,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx works as an index into the group's
+ * fault arrays too, because the NUMA_MEM region
+ * sits at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
@@ -1595,30 +1877,17 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
- /*
- * If the preferred task and group nids are different,
- * iterate over the nodes again to find the best place.
- */
- if (max_nid != max_group_nid) {
- unsigned long weight, max_weight = 0;
-
- for_each_online_node(nid) {
- weight = task_weight(p, nid) + group_weight(p, nid);
- if (weight > max_weight) {
- max_weight = weight;
- max_nid = nid;
- }
- }
- }
-
spin_unlock_irq(group_lock);
+ max_nid = preferred_group_nid(p, max_group_nid);
}
- /* Preferred node as the node with the most faults */
- if (max_faults && max_nid != p->numa_preferred_nid) {
- /* Update the preferred nid and migrate task if possible */
- sched_setnuma(p, max_nid);
- numa_migrate_preferred(p);
+ if (max_faults) {
+ /* Set the new preferred node */
+ if (max_nid != p->numa_preferred_nid)
+ sched_setnuma(p, max_nid);
+
+ if (task_node(p) != p->numa_preferred_nid)
+ numa_migrate_preferred(p);
}
}
@@ -1652,7 +1921,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1661,11 +1929,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1720,13 +1987,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;
@@ -1746,27 +2012,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
- rcu_assign_pointer(p->numa_group, NULL);
+ RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}
@@ -1788,29 +2050,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
- /* Do not worry about placement if exiting */
- if (p->state == TASK_DEAD)
- return;
-
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;
- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1849,9 +2097,11 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
@@ -1930,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma)) {
continue;
+ }
/*
* Shared library pages mapped by multiple processes are not
@@ -2195,8 +2447,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
/*
* As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
- * With a look-up table which covers k^n (n<PERIOD)
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
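A standalone sketch of that constant-time decay follows. It assumes PERIOD = 32 (so y^32 = 1/2) and builds the y^k table with libm at startup (link with -lm); the kernel of course ships a precomputed integer table instead.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define PERIOD 32
static uint32_t y_inv[PERIOD];          /* y^k in 0.32 fixed point */

static uint64_t decay(uint64_t val, uint64_t n)
{
        if (n > PERIOD * 63)
                return 0;                       /* fully decayed        */
        val >>= n / PERIOD;                     /* y^(32m) = 2^-m       */
        return (val * y_inv[n % PERIOD]) >> 32; /* y^(n%32) via table   */
}

int main(void)
{
        int k;

        y_inv[0] = 0xffffffffu;                 /* ~1.0 in fixed point  */
        for (k = 1; k < PERIOD; k++)            /* y = 2^(-1/32)        */
                y_inv[k] = (uint32_t)(pow(0.5, k / 32.0) * 4294967296.0);

        printf("%llu\n", (unsigned long long)decay(1024, 32)); /* 511 ~ 1024/2       */
        printf("%llu\n", (unsigned long long)decay(1024, 16)); /* 724 ~ 1024/sqrt(2) */
        return 0;
}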
@@ -2266,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
delta = now - sa->last_runnable_update;
/*
@@ -2294,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
@@ -2307,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta_w = 1024 - delta_w;
if (runnable)
sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ if (running)
+ sa->running_avg_sum += delta_w * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta_w;
delta -= delta_w;
@@ -2317,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
+ periods + 1);
+ sa->avg_period = decay_load(sa->avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ if (running)
+ sa->running_avg_sum += runnable_contrib * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable)
sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
+ if (running)
+ sa->running_avg_sum += delta * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta;
return decayed;
}
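The only new arithmetic in the accrual path above is the scale_freq factor on running_avg_sum. A quick standalone check, assuming arch_scale_freq_capacity() reports 512 (half of SCHED_CAPACITY_SCALE, i.e. the CPU is running at half its maximum frequency) and SCHED_CAPACITY_SHIFT = 10:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
        unsigned long scale_freq = 512;  /* assumed: half of max frequency */
        unsigned int delta_w = 100;      /* time accrued in this window    */

        /* runnable time counts in full, running time is frequency-scaled */
        unsigned int runnable = delta_w;
        unsigned int running = delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;

        printf("runnable +%u, running +%u\n", runnable, running); /* 100, 50 */
        return 0;
}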
@@ -2342,11 +2607,13 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
+ se->avg.decay_count = 0;
if (!decays)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.decay_count = 0;
+ se->avg.utilization_avg_contrib =
+ decay_load(se->avg.utilization_avg_contrib, decays);
return decays;
}
@@ -2361,6 +2628,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
+ if (!tg_contrib)
+ return;
+
if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic_long_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
@@ -2379,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
+ sa->avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2432,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+ runnable, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2450,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
+ contrib /= (se->avg.avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
@@ -2469,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
return se->avg.load_avg_contrib - old_contrib;
}
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.utilization_avg_contrib;
+
+ if (entity_is_task(se))
+ __update_task_entity_utilization(se);
+ else
+ se->avg.utilization_avg_contrib =
+ group_cfs_rq(se)->utilization_load_avg;
+
+ return se->avg.utilization_avg_contrib - old_contrib;
+}
+
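Correspondingly, utilization_avg_contrib is just the running fraction scaled to SCHED_LOAD_SCALE. A sketch with assumed sums: a task observed running about half of the time it has existed contributes roughly half of SCHED_LOAD_SCALE (1024 here, assuming no extra load resolution):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE 1024           /* assumed */

int main(void)
{
        uint32_t running_avg_sum = 20000;   /* assumed decayed sums */
        uint32_t avg_period = 40000;
        uint32_t contrib;

        contrib = (uint32_t)((uint64_t)running_avg_sum * SCHED_LOAD_SCALE /
                             (avg_period + 1));

        printf("utilization_avg_contrib = %u\n", contrib);  /* ~512 */
        return 0;
}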
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
@@ -2485,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
+ long contrib_delta, utilization_delta;
+ int cpu = cpu_of(rq_of(cfs_rq));
u64 now;
/*
@@ -2497,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
if (!update_cfs_rq)
return;
- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ cfs_rq->utilization_load_avg += utilization_delta;
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
}
/*
@@ -2583,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -2601,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -2899,7 +3201,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -2923,7 +3225,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static void
@@ -2938,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3063,7 +3366,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
return;
}
/*
@@ -3254,7 +3557,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static __always_inline
@@ -3360,7 +3663,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ /*
+ * Add to the _head_ of the list, so that an already-started
+ * distribute_cfs_runtime will not see us
+ */
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
if (!cfs_b->timer_active)
__start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3717,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* determine whether we need to wake up a potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_task(rq->curr);
+ resched_curr(rq);
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
- u64 runtime = remaining;
+ u64 runtime;
+ u64 starting_runtime = remaining;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3756,7 @@ next:
}
rcu_read_unlock();
- return remaining;
+ return starting_runtime - remaining;
}
/*
@@ -3494,22 +3802,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- /*
- * There are throttled entities so we must first use the new bandwidth
- * to unthrottle them before making it generally available. This
- * ensures that all existing debts will be paid before a new cfs_rq is
- * allowed to run.
- */
- runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
- cfs_b->runtime = 0;
/*
- * This check is repeated as we are holding onto the new bandwidth
- * while we unthrottle. This can potentially race with an unthrottled
- * group trying to acquire new bandwidth from the global pool.
+ * This check is repeated as we are holding onto the new bandwidth while
+ * we unthrottle. This can potentially race with an unthrottled group
+ * trying to acquire new bandwidth from the global pool. This can result
+ * in us over-using our runtime if it is all used during this loop, but
+ * only by limited amounts in that extreme case.
*/
- while (throttled && runtime > 0) {
+ while (throttled && cfs_b->runtime > 0) {
+ runtime = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3820,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
}
- /* return (any) remaining runtime */
- cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
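The reworked loop above re-reads cfs_b->runtime under the lock on each pass and charges back only what distribute_cfs_runtime() actually handed out, instead of taking the whole pool hostage up front. A toy model of that accounting; the 30-unit consumer is a stand-in for a throttled cfs_rq, not anything from this diff:

#include <stdio.h>

typedef unsigned long long u64;

static u64 min_u64(u64 a, u64 b) { return a < b ? a : b; }

/* Toy stand-in for distribute_cfs_runtime(): returns the amount used. */
static u64 distribute(u64 avail) { return min_u64(avail, 30); }

int main(void)
{
        u64 runtime, pool = 100;        /* cfs_b->runtime for this period */
        int throttled = 4;              /* throttled groups remaining     */

        while (throttled && pool > 0) {
                runtime = pool;         /* snapshot; lock dropped in real code */
                runtime = distribute(runtime);
                pool -= min_u64(runtime, pool); /* charge only what was used */
                throttled--;
        }
        printf("runtime left: %llu\n", pool);   /* 100-30-30-30-10 = 0 */
        return 0;
}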
@@ -3631,10 +3934,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
}
- if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- cfs_b->runtime = 0;
- }
+
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@@ -3645,7 +3947,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime = runtime;
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
raw_spin_unlock(&cfs_b->lock);
}
@@ -3771,10 +4073,27 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+ raw_spin_unlock(&cfs_b->lock);
+ }
+}
+
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
@@ -3788,6 +4107,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
+ /*
+ * An offline rq is schedulable till its cpu is completely disabled
+ * in take_cpu_down(), so we prevent new cfs throttling here.
+ */
+ cfs_rq->runtime_enabled = 0;
+
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -3831,6 +4156,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,17 +4180,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
return;
}
-
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- if (rq->curr != p)
- delta = max_t(s64, 10000LL, delta);
-
hrtick_start(rq, delta);
}
}
@@ -4049,10 +4367,15 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
@@ -4178,7 +4501,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * tg->shares) / W;
+ wl = (w * (long)tg->shares) / W;
else
wl = tg->shares;
@@ -4241,8 +4564,8 @@ static int wake_wide(struct task_struct *p)
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
+ s64 this_eff_load, prev_eff_load;
int idx, this_cpu, prev_cpu;
- unsigned long tl_per_task;
struct task_group *tg;
unsigned long weight;
int balanced;
@@ -4285,47 +4608,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load > 0) {
- s64 this_eff_load, prev_eff_load;
+ this_eff_load = 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
+ if (this_load > 0) {
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
- balanced = this_eff_load <= prev_eff_load;
- } else
- balanced = true;
-
- /*
- * If the currently running task will sleep within
- * a reasonable amount of time then attract this newly
- * woken task:
- */
- if (sync && balanced)
- return 1;
+ balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
- if (balanced ||
- (this_load <= load &&
- this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ if (!balanced)
+ return 0;
- return 1;
- }
- return 0;
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
}
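
A quick numeric check of the rewritten comparison (illustrative values, assuming the common sd->imbalance_pct of 125 and equal CPU capacities of 1024):

/*
 * this_eff_load base: 100 * 1024;  prev_eff_load base: (100 + 25/2) * 1024
 * this_load <= 0:  100 * 1024 <= 112 * 1024            -> always balanced
 * this_load  > 0:  both sides are further scaled by the effective loads,
 *                  leaving prev_cpu ~12% of slack before the wakeup stops
 *                  being treated as affine.
 */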
/*
@@ -4393,20 +4699,46 @@ static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
- int idlest = -1;
+ unsigned int min_exit_latency = UINT_MAX;
+ u64 latest_idle_timestamp = 0;
+ int least_loaded_cpu = this_cpu;
+ int shallowest_idle_cpu = -1;
int i;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- load = weighted_cpuload(i);
-
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- idlest = i;
+ if (idle_cpu(i)) {
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ rq->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If equal or no active idle state, then
+ * the most recently idled CPU might have
+ * a warmer cache.
+ */
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else if (shallowest_idle_cpu == -1) {
+ load = weighted_cpuload(i);
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
}
}
- return idlest;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
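
In short, the rewritten find_idlest_cpu() now prefers, in order:

/*
 * 1) an idle CPU whose idle state has the smallest exit_latency;
 * 2) among equally shallow (or state-less) idle CPUs, the one that went
 *    idle most recently, since its cache is likely warmer;
 * 3) failing any idle CPU, the least-loaded CPU, with ties broken in
 *    favour of this_cpu.
 */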
/*
@@ -4453,6 +4785,33 @@ next:
done:
return target;
}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be that of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * tasks (i.e. cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg, or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays within the range
+ * [0..cpu_capacity_orig] and cap it if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity.
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
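
A quick numeric check of the scaling above, assuming SCHED_LOAD_SCALE = 1024 (so SCHED_LOAD_SHIFT = 10):

/*
 * usage =  512, capacity_orig = 800  ->  (512 * 800) >> 10 = 400
 * usage = 1100, capacity_orig = 800  ->  clamped to 800 (full capacity)
 */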
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -4475,14 +4834,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
- want_affine = 1;
- new_cpu = prev_cpu;
- }
+ if (sd_flag & SD_BALANCE_WAKE)
+ want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -4669,7 +5022,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
 * unconditionally check_preempt_curr() after an enqueue (which may have
 * led to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -4723,7 +5076,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_task(curr);
+ resched_curr(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -4904,7 +5257,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
set_skip_buddy(se);
@@ -5077,28 +5430,18 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ struct list_head tasks;
};
/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
* Is this task likely cache-hot:
*/
-static int
-task_hot(struct task_struct *p, u64 now)
+static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
+ lockdep_assert_held(&env->src_rq->lock);
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -5108,7 +5451,7 @@ task_hot(struct task_struct *p, u64 now)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next ||
&p->se == cfs_rq_of(&p->se)->last))
return 1;
@@ -5118,7 +5461,7 @@ task_hot(struct task_struct *p, u64 now)
if (sysctl_sched_migration_cost == 0)
return 0;
- delta = now - p->se.exec_start;
+ delta = rq_clock_task(env->src_rq) - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
}
@@ -5130,7 +5473,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5169,7 +5512,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -5218,6 +5561,9 @@ static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -5272,28 +5618,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+ tsk_cache_hot = task_hot(p, env);
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
- if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
- if (tsk_cache_hot) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
- }
-#endif
- return 1;
- }
-
- if (!tsk_cache_hot ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-
+ if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-
return 1;
}
@@ -5302,47 +5636,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}
/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
*
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
*/
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
+ lockdep_assert_held(&env->src_rq->lock);
+
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
- move_task(p, env);
+ detach_task(p, env);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (the other is detach_tasks())
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
@@ -5373,14 +5723,16 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5400,13 +5752,58 @@ next:
}
/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
- return pulled;
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}
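
Taken together, these helpers replace the double-rq-locked move_task() path; a minimal sketch of the intended calling sequence (it mirrors the load_balance() hunk further down, names abbreviated):

	raw_spin_lock_irqsave(&busiest->lock, flags);
	nr = detach_tasks(&env);	/* tasks left TASK_ON_RQ_MIGRATING */
	raw_spin_unlock(&busiest->lock);

	if (nr)
		attach_tasks(&env);	/* takes env.dst_rq->lock internally */

	local_irq_restore(flags);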
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5525,6 +5922,13 @@ static unsigned long task_h_load(struct task_struct *p)
#endif
/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -5534,12 +5938,12 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
- int group_has_free_capacity;
+ enum group_type group_type;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5576,6 +5980,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
},
};
}
@@ -5608,35 +6014,23 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- return SCHED_CAPACITY_SCALE;
-}
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_capacity(sd, cpu);
-}
-
-static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
-{
- unsigned long weight = sd->span_weight;
- unsigned long smt_gain = sd->smt_gain;
-
- smt_gain /= weight;
-
- return smt_gain;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_capacity(sd, cpu);
+ return default_scale_cpu_capacity(sd, cpu);
}
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
@@ -5645,52 +6039,35 @@ static unsigned long scale_rt_capacity(int cpu)
*/
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
- delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
-
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ used = div_u64(avg, total);
- total >>= SCHED_CAPACITY_SHIFT;
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- return div_u64(available, total);
+ return 1;
}
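
Because sched_rt_avg_update() accumulates rt_avg pre-scaled by the capacity factor, div_u64(avg, total) lands directly in [0..SCHED_CAPACITY_SCALE]. A worked example with assumed numbers:

/*
 * RT/IRQ activity ran for half of the averaging period:
 *   avg  = (total / 2) * 1024  ->  used = div_u64(avg, total) = 512
 *   return 1024 - 512 = 512, i.e. CFS keeps half the capacity scale.
 */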
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long weight = sd->span_weight;
unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_smt_capacity(sd, cpu);
- else
- capacity *= default_scale_smt_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
- }
-
- sdg->sgc->capacity_orig = capacity;
-
if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
+ capacity *= arch_scale_cpu_capacity(sd, cpu);
else
- capacity *= default_scale_capacity(sd, cpu);
+ capacity *= default_scale_cpu_capacity(sd, cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
+
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -5705,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -5717,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5737,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This prevents capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -5760,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true if the capacity is reduced.
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
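
A numeric illustration (assumed values): with sd->imbalance_pct = 125 and cpu_capacity_orig = 1024, the check fires once cpu_capacity drops below 1024 * 100 / 125 = 819, i.e. once RT/IRQ pressure has taken roughly 20% of the original capacity:

/* 819 * 125 = 102375 < 1024 * 100 = 102400  ->  capacity is "reduced" */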
/*
@@ -5830,31 +6188,62 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state and to take into
+ * account the variance of the tasks' load, and we return true only if the
+ * available capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear, but it brings no
+ * benefit to the load balancer.
+ */
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
+
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equal to !group_has_capacity because a group
+ * with exactly the right number of tasks has no spare capacity left but is
+ * not overloaded, so both group_has_capacity and group_is_overloaded return
+ * false.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
+
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ return false;
+}
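
A small worked example of the two predicates (numbers assumed): take a 4-CPU group with group_capacity = 4096 and imbalance_pct = 125.

/*
 * group_has_capacity:  true while sum_nr_running < 4, or while
 *                      4096 * 100 > group_usage * 125 (group_usage < ~3276)
 * group_is_overloaded: requires sum_nr_running > 4 *and*
 *                      4096 * 100 < group_usage * 125
 * With exactly 4 tasks and a usage between the two thresholds, both
 * return false, as described above.
 */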
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_no_capacity)
+ return group_overloaded;
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+ if (sg_imbalanced(group))
+ return group_imbalanced;
- return capacity_factor;
+ return group_other;
}
/**
@@ -5864,10 +6253,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs)
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
{
unsigned long load;
int i;
@@ -5884,7 +6275,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
- sgs->sum_nr_running += rq->nr_running;
+ sgs->group_usage += get_cpu_usage(i);
+ sgs->sum_nr_running += rq->cfs.h_nr_running;
+
+ if (rq->nr_running > 1)
+ *overload = true;
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5903,11 +6299,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
- sgs->group_imb = sg_imbalanced(group);
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
-
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -5928,13 +6321,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->busiest_stat.avg_load)
- return false;
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_type > busiest->group_type)
return true;
- if (sgs->group_imb)
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/*
@@ -5942,8 +6341,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -5995,6 +6393,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+ bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -6015,24 +6414,28 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible when a large-weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6049,6 +6452,13 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ if (env->dst_rq->rd->overload != overload)
+ env->dst_rq->rd->overload = overload;
+ }
+
}
/**
@@ -6179,7 +6589,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (busiest->group_imb) {
+ if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6199,17 +6609,17 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return fix_small_imbalance(env, sds);
}
- if (!busiest->group_imb) {
- /*
- * Don't want to pull so many tasks that a group would go idle.
- * Except of course for the group_imb case, since then we might
- * have to drop below capacity to reach cpu-load equilibrium.
- */
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ /*
+ * If there aren't any idle cpus, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6272,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6288,16 +6699,16 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (busiest->group_imb)
+ if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
- * If the local group is more busy than the selected busiest group
+ * If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
@@ -6312,13 +6723,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (env->idle == CPU_IDLE) {
/*
- * This cpu is idle. If the busiest group load doesn't
- * have more tasks than the number of available cpu's and
- * there is no imbalance between this and busiest group
- * wrt to idle cpu's, it is balanced.
+ * This cpu is idle. If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest group
+ * wrt idle cpus, it is balanced. The imbalance becomes
+ * significant only if the diff is greater than 1, otherwise
+ * we might end up just moving the imbalance to another group.
*/
- if ((local->idle_cpus < busiest->idle_cpus) &&
- busiest->sum_nr_running <= busiest->group_weight)
+ if ((busiest->group_type != group_overloaded) &&
+ (local->idle_cpus <= (busiest->idle_cpus + 1)))
goto out_balanced;
} else {
/*
@@ -6351,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6380,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6390,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
@@ -6438,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched classes or IRQs, provided more capacity
+ * stays available on the dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
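
A numeric check of the new condition (values assumed): with imbalance_pct = 125, capacity_of(src_cpu) = 600 and capacity_of(dst_cpu) = 800:

/* 600 * 125 = 75000 < 800 * 100 = 80000  ->  active migration pays off */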
@@ -6490,7 +6914,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
.sd = sd,
@@ -6501,6 +6925,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};
/*
@@ -6536,6 +6961,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6545,28 +6973,33 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
- local_irq_restore(flags);
+ cur_ld_moved = detach_tasks(&env);
/*
- * some other cpu did the load balance for us.
+ * We've detached some tasks from busiest_rq. Every
+ * task is marked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we can be sure that nobody
+ * can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
*/
- if (cur_ld_moved && env.dst_cpu != smp_processor_id())
- resched_cpu(env.dst_cpu);
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
+ local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -6616,10 +7049,8 @@ more_balance:
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
- } else if (*group_imbalance)
- *group_imbalance = 0;
}
/* All tasks on this runqueue were pinned by CPU affinity */
@@ -6630,7 +7061,7 @@ more_balance:
env.loop_break = sched_nr_migrate_break;
goto redo;
}
- goto out_balanced;
+ goto out_all_pinned;
}
}
@@ -6695,7 +7126,7 @@ more_balance:
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
@@ -6704,6 +7135,23 @@ more_balance:
goto out;
out_balanced:
+ /*
+ * We reach balance although we may have faced some affinity
+ * constraints. Clear the imbalance flag if it was set.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
+out_all_pinned:
+ /*
+ * We reach balance because all tasks are pinned at this level so
+ * we can't migrate them. Leave the imbalance flag set so the parent
+ * level can try to migrate them.
+ */
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
@@ -6767,7 +7215,8 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -6864,6 +7313,7 @@ static int active_load_balance_cpu_stop(void *data)
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -6883,9 +7333,6 @@ static int active_load_balance_cpu_stop(void *data)
*/
BUG_ON(busiest_rq == target_rq);
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -6906,16 +7353,22 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
return 0;
}
@@ -7219,22 +7672,25 @@ end:
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At the parent of the LLC scheduler domain level, this cpu's scheduler
+ *   group has multiple busy cpus.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7248,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
if (rq->nr_running >= 2)
- goto need_kick;
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7295,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_rq, idle);
-
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
- * stopped.
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
}
/*
@@ -7325,6 +7791,8 @@ void trigger_load_balance(struct rq *rq)
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
+
+ update_runtime_enabled(rq);
}
static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7866,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_task(rq->curr);
+ resched_curr(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -7413,7 +7881,7 @@ static void task_fork_fair(struct task_struct *p)
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7423,7 +7891,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (rq->curr == p) {
if (p->prio > oldprio)
- resched_task(rq->curr);
+ resched_curr(rq);
} else
check_preempt_curr(rq, p, 0);
}
@@ -7438,11 +7906,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7469,15 +7937,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *se = &p->se;
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7486,7 +7954,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* if we can still preempt the current task.
*/
if (rq->curr == p)
- resched_task(rq->curr);
+ resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
}
@@ -7523,7 +7991,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
@@ -7542,7 +8010,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
@@ -7553,14 +8021,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
@@ -7783,6 +8251,8 @@ const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .update_curr = update_curr_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..91e33cd485f6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * When many CPUs lower their priorities at the same time while a
+ * single CPU has an RT task that can migrate and is waiting to run,
+ * they will all try to take that CPU's rq lock and can create heavy
+ * contention. Sending an IPI to that CPU and letting it push the RT
+ * task to where it should go may be a better scenario.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..deef1caa94c6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!tif_need_resched())
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -79,7 +81,8 @@ static void cpuidle_idle_call(void)
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- bool broadcast;
+ unsigned int broadcast;
+ bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -103,25 +106,37 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
+ if (cpuidle_not_available(drv, dev))
+ goto use_default;
+
/*
- * Ask the cpuidle framework to choose a convenient idle state.
- * Fall back to the default arch idle method on errors.
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in interrupts (if any). In that case bypass
+ * the cpuidle governor and go straight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
*/
- next_state = cpuidle_select(drv, dev);
- if (next_state < 0) {
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state >= 0) {
local_irq_enable();
- else
- arch_cpu_idle();
+ goto exit_idle;
+ }
- goto exit_idle;
+ reflect = false;
+ next_state = cpuidle_find_deepest_state(drv, dev);
+ } else {
+ reflect = true;
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ */
+ next_state = cpuidle_select(drv, dev);
}
-
+ /* Fall back to the default arch idle method on errors. */
+ if (next_state < 0)
+ goto use_default;
/*
* The idle task must be scheduled, it is pointless to
@@ -135,7 +150,7 @@ use_default:
goto exit_idle;
}
- broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+ broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
/*
* Tell the time framework to switch to a broadcast timer
@@ -143,11 +158,11 @@ use_default:
* is used from another cpu as a broadcast timer, this call may
* fail if it is not available
*/
- if (broadcast &&
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ if (broadcast && tick_broadcast_enter())
goto use_default;
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
+ /* Take note of the planned idle state. */
+ idle_set_state(this_rq(), &drv->states[next_state]);
/*
* Enter the idle state previously returned by the governor decision.
@@ -156,15 +171,17 @@ use_default:
*/
entered_state = cpuidle_enter(drv, dev, next_state);
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+ /* The cpu is no longer idle or about to enter idle. */
+ idle_set_state(this_rq(), NULL);
if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+ tick_broadcast_exit();
/*
* Give the governor an opportunity to reflect on the outcome
*/
- cpuidle_reflect(dev, entered_state);
+ if (reflect)
+ cpuidle_reflect(dev, entered_state);
exit_idle:
__current_set_polling();
@@ -177,8 +194,23 @@ exit_idle:
rcu_idle_exit();
start_critical_timings();
+ return;
+
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
+DEFINE_PER_CPU(bool, cpu_dead_idle);
+
/*
* Generic idle loop implementation
*
@@ -203,8 +235,13 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id()))
+ if (cpu_is_offline(smp_processor_id())) {
+ rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+ (void *)(long)smp_processor_id());
+ smp_mb(); /* all activity before dead. */
+ this_cpu_write(cpu_dead_idle, true);
arch_cpu_idle_dead();
+ }
local_irq_disable();
arch_cpu_idle_enter();
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
- resched_task(rq->idle);
+ resched_curr(rq);
}
static struct task_struct *
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
return 0;
}
+static void update_curr_idle(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
};
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
#include "sched.h"
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
/*
* Global load-average calculations
*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..575da76a3874 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
+#endif /* CONFIG_SMP */
/* We start in dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_rt_rq(rt_rq, cpu_rq(i));
+ init_rt_rq(rt_rq);
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
@@ -463,9 +475,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
- int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+ int cpu = cpu_of(rq);
rt_se = rt_rq->tg->rt_se[cpu];
@@ -476,7 +489,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
enqueue_rt_entity(rt_se, false);
if (rt_rq->highest_prio.curr < curr->prio)
- resched_task(curr);
+ resched_curr(rq);
}
}
@@ -566,7 +579,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
return;
enqueue_top_rt_rq(rt_rq);
- resched_task(rq->curr);
+ resched_curr(rq);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +753,9 @@ balanced:
rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ /* Make rt_rq available for pick_next_task() */
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -827,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
enqueue = 1;
/*
- * Force a clock update if the CPU was idle,
- * lest wakeup -> unthrottle time accumulate.
+ * When we're idle and a woken (rt) task is
+ * throttled, check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq->skip_clock_update = -1;
+ rq_clock_skip_update(rq, false);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -948,7 +967,7 @@ static void update_curr_rt(struct rq *rq)
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
- resched_task(curr);
+ resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
@@ -1297,9 +1316,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (p->nr_cpus_allowed == 1)
- goto out;
-
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1336,7 +1352,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
- if (target != -1)
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
rcu_read_unlock();
@@ -1347,23 +1368,29 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->nr_cpus_allowed == 1)
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
-
/*
* There appears to be other cpus that can accept
 * current and none to run 'p', so let's reschedule
* to try and push current away:
*/
requeue_task_rt(rq, p, 1);
- resched_task(rq->curr);
+ resched_curr(rq);
}
#endif /* CONFIG_SMP */
@@ -1374,7 +1401,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -1444,7 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
* means a dl or stop task can slip in, in which case we need
* to re-start task selection.
*/
- if (unlikely((rq->stop && rq->stop->on_rq) ||
+ if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1464,8 +1491,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
- if (p)
- dequeue_pushable_task(rq, p);
+ dequeue_pushable_task(rq, p);
set_post_schedule(rq);
@@ -1522,7 +1548,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
- struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
@@ -1608,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
lowest_rq = cpu_rq(cpu);
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
/* if the prio of this runqueue changed, try again */
if (double_lock_balance(rq, lowest_rq)) {
/*
@@ -1620,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task->on_rq)) {
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1654,7 +1690,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
return p;
@@ -1690,7 +1726,7 @@ retry:
* just reschedule current.
*/
if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1737,7 +1773,7 @@ retry:
activate_task(lowest_rq, next_task, 0);
ret = 1;
- resched_task(lowest_rq->curr);
+ resched_curr(lowest_rq);
double_unlock_balance(rq, lowest_rq);
@@ -1754,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
;
}
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+ /*
+	 * If the previous CPU is less than the rq's CPU, then it has
+	 * already passed the end of the mask and wrapped to the beginning.
+	 * We are done once the next CPU is greater than or equal to the rq's CPU.
+ */
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+
+ } else if (cpu >= nr_cpu_ids) {
+ /*
+ * We passed the end of the mask, start at the beginning.
+		 * If the result is greater than or equal to the rq's CPU,
+		 * then the loop is finished.
+ */
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+ }
+ rq->rt.push_cpu = cpu;
+
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
+}
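
The wrap-around in rto_next_cpu() is the subtle part: the scan starts just past rq->cpu, may fall off the end of rto_mask once, resumes from the beginning, and must stop before reaching rq->cpu a second time. Below is a minimal userspace model of that termination rule, with a plain array standing in for the cpumask (all names are hypothetical stand-ins, not kernel API):

#include <stdio.h>

static int mask_next(int prev, const int *mask, int nr)
{
	for (int c = prev + 1; c < nr; c++)
		if (mask[c])
			return c;
	return nr;			/* past the end, like cpumask_next() */
}

/* Same rule as rto_next_cpu(): never yield this_cpu, wrap at most
 * once, and stop once the scan would pass this_cpu again. */
static int model_next_cpu(int *push_cpu, int this_cpu,
			  const int *mask, int nr)
{
	int prev = *push_cpu;
	int cpu = mask_next(prev, mask, nr);

	if (prev < this_cpu) {			/* already wrapped once */
		if (cpu >= this_cpu)
			return nr;
	} else if (cpu >= nr) {			/* wrap now */
		cpu = mask_next(-1, mask, nr);
		if (cpu >= this_cpu)
			return nr;
	}
	*push_cpu = cpu;
	return cpu;
}

int main(void)
{
	int mask[8] = { [1] = 1, [3] = 1, [6] = 1 };
	int push_cpu = 3, cpu;		/* this_cpu == 3, starts at itself */

	while ((cpu = model_next_cpu(&push_cpu, 3, mask, 8)) < 8)
		printf("visit cpu %d\n", cpu);	/* prints 6, then 1 */
	return 0;
}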
+
+static int find_next_push_cpu(struct rq *rq)
+{
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
+ }
+
+ return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu;
+
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
+
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+ irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
+ int cpu;
+
+ this_cpu = rt_rq->push_cpu;
+
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ /* Pass the IPI to the next rt overloaded queue */
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
+ }
+
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
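
Taken together, tell_cpu_to_push() and try_to_push_tasks() implement one roving IPI per overloaded source rq rather than a broadcast: at most one irq_work circulates, and RT_PUSH_IPI_RESTART only asks the in-flight worker to begin a new lap. A deliberately simplified, runnable sketch of the push_flags handshake (it ignores push_lock granularity and the actual per-cpu queueing):

#include <assert.h>

#define EXECUTING 1	/* models RT_PUSH_IPI_EXECUTING */
#define RESTART   2	/* models RT_PUSH_IPI_RESTART */

static int push_flags;

/* Requesting side: returns 1 only when a fresh IPI must be queued. */
static int model_tell_cpu_to_push(void)
{
	if (push_flags & EXECUTING) {
		push_flags |= RESTART;	/* in-flight worker redoes the lap */
		return 0;		/* no second IPI is ever queued */
	}
	push_flags = EXECUTING;
	return 1;
}

/* Worker side: returns 1 while the IPI keeps circulating. */
static int model_ipi_step(int more_cpus)
{
	if (push_flags & RESTART)
		push_flags &= ~RESTART;	/* restart the lap from src cpu */
	else if (!more_cpus)
		push_flags = 0;		/* lap finished, go idle */
	return push_flags & EXECUTING;
}

int main(void)
{
	assert(model_tell_cpu_to_push() == 1);	/* first request: send IPI */
	assert(model_tell_cpu_to_push() == 0);	/* second: just flag RESTART */
	assert(model_ipi_step(0) == 1);		/* RESTART absorbed, new lap */
	assert(model_ipi_step(0) == 0);		/* worker finally retires */
	return 0;
}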
+
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1769,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return 0;
+ }
+#endif
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1805,7 +2006,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* There's a chance that p is higher in priority
@@ -1866,7 +2067,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
BUG_ON(!rt_task(p));
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
weight = cpumask_weight(new_mask);
@@ -1932,11 +2133,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!p->on_rq || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
if (pull_rt_task(rq))
- resched_task(rq->curr);
+ resched_curr(rq);
}
void __init init_sched_rt_class(void)
@@ -1966,7 +2167,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
@@ -1974,7 +2175,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
@@ -1985,7 +2186,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
if (rq->curr == p) {
@@ -2003,11 +2204,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* Only reschedule if p is still on the same runqueue.
*/
if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
if (oldprio < p->prio)
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else {
/*
@@ -2016,7 +2217,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* then reschedule.
*/
if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
@@ -2069,7 +2270,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {
requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ resched_curr(rq);
return;
}
}
@@ -2125,6 +2326,8 @@ const struct sched_class rt_sched_class = {
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..e0e129993958 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
@@ -14,6 +15,11 @@
#include "cpuacct.h"
struct rq;
+struct cpuidle_state;
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
extern __read_mostly int scheduler_running;
@@ -126,6 +132,9 @@ struct rt_bandwidth {
u64 rt_runtime;
struct hrtimer rt_period_timer;
};
+
+void __dl_clear_params(struct task_struct *p);
+
/*
* To keep the bandwidth of -deadline tasks and groups under control
* we need some place where:
@@ -168,6 +177,25 @@ struct dl_bw {
u64 bw, total_bw;
};
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
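
__dl_overflow() rejects an admission-control change whenever the new total would exceed bw * cpus, with bw == -1 meaning the cap is disabled. Here is a quick numeric check of the inequality in userspace (the <<20 fixed-point convention mirrors the kernel's to_ratio(); the example figures are invented):

#include <assert.h>
#include <stdint.h>

/* Same predicate as __dl_overflow(): nonzero means "reject". */
static int dl_overflow(int64_t bw, int cpus, uint64_t total_bw,
		       uint64_t old_bw, uint64_t new_bw)
{
	return bw != -1 && (uint64_t)bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
	int64_t cap = (95 << 20) / 100;		/* 95% of one CPU, <<20 */

	/* 3.5 CPUs of bandwidth admitted on a 4-CPU domain (cap 3.8):
	 * swapping a 0.2 task for a 0.4 one totals 3.7 -> accepted,
	 * adding the 0.4 task on top would total 3.9 -> rejected. */
	assert(!dl_overflow(cap, 4, (350 << 20) / 100,
			    (20 << 20) / 100, (40 << 20) / 100));
	assert(dl_overflow(cap, 4, (350 << 20) / 100,
			   0, (40 << 20) / 100));
	assert(!dl_overflow(-1, 4, ~0ULL >> 1, 0, 1 << 20)); /* no cap */
	return 0;
}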
+
extern struct mutex sched_domains_mutex;
#ifdef CONFIG_CGROUP_SCHED
@@ -184,7 +212,7 @@ struct cfs_bandwidth {
raw_spinlock_t lock;
ktime_t period;
u64 quota, runtime;
- s64 hierarchal_quota;
+ s64 hierarchical_quota;
u64 runtime_expires;
int idle, timer_active;
@@ -335,8 +363,14 @@ struct cfs_rq {
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
+ * runnable_load_avg is the sum of the load_avg_contrib of the
+ * sched_entities on the rq.
+ * blocked_load_avg is similar to runnable_load_avg except that it
+ * sums the load_avg_contrib of the blocked sched_entities on the rq.
+ * utilization_load_avg is the sum of the average running time of the
+ * sched_entities on the rq.
*/
- unsigned long runnable_load_avg, blocked_load_avg;
+ unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
atomic64_t decay_counter;
u64 last_decay;
atomic_long_t removed_load;
@@ -391,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -408,7 +447,13 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
#endif
+#endif /* CONFIG_SMP */
int rt_queued;
int rt_throttled;
@@ -477,6 +522,9 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
+	/* Indicates that some CPU has more than one runnable task */
+ bool overload;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -528,8 +576,6 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
- int skip_clock_update;
-
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -558,6 +604,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
+ unsigned int clock_skip_update;
u64 clock;
u64 clock_task;
@@ -568,6 +615,7 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
unsigned char idle_balance;
/* For active balancing */
@@ -633,6 +681,11 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
+
+#ifdef CONFIG_CPU_IDLE
+	/* Must be inspected within an RCU read-side critical section */
+ struct cpuidle_state *idle_state;
+#endif
};
static inline int cpu_of(struct rq *rq)
@@ -644,25 +697,62 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-DECLARE_PER_CPU(struct rq, runqueues);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() (&__get_cpu_var(runqueues))
+#define this_rq() this_cpu_ptr(&runqueues)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() (&__raw_get_cpu_var(runqueues))
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
static inline u64 rq_clock(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+ lockdep_assert_held(&rq->lock);
+ if (skip)
+ rq->clock_skip_update |= RQCF_REQ_SKIP;
+ else
+ rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -736,7 +826,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -884,20 +974,10 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct static_key *key)
-{
- return static_key_true(key); /* Not out of line branch. */
-}
-
-static __always_inline bool static_branch__false(struct static_key *key)
-{
- return static_key_false(key); /* Out of line branch. */
-}
-
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
- return static_branch__##enabled(key); \
+ return static_key_##enabled(key); \
}
#include "features.h"
@@ -949,6 +1029,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#endif
}
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
@@ -960,7 +1049,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -998,35 +1086,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
raw_spin_unlock_irq(&rq->lock);
}
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- */
- smp_wmb();
- prev->on_cpu = 0;
-#endif
- local_irq_enable();
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
/*
* wake flags
*/
@@ -1142,6 +1201,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+	 * cannot assume the switched_from/switched_to pair is serialized by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1150,6 +1214,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
+ void (*update_curr) (struct rq *rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
@@ -1187,6 +1253,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
#endif
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
@@ -1196,7 +1286,7 @@ extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void init_sched_dl_class(void);
-extern void resched_task(struct task_struct *p);
+extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1308,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
rq->nr_running = prev_nr + count;
-#ifdef CONFIG_NO_HZ_FULL
if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(rq->cpu)) {
- /* Order rq->nr_running write against the IPI */
- smp_wmb();
- smp_send_reschedule(rq->cpu);
+ /*
+ * Tick is needed if more than one task runs on a CPU.
+ * Send the target an IPI to kick it out of nohz mode.
+ *
+			 * We assume that the IPI implies a full memory barrier
+			 * and that the new value of rq->nr_running is visible
+			 * on the target when the IPI is received.
+ */
+ tick_nohz_full_kick_cpu(rq->cpu);
}
- }
#endif
+ }
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
@@ -1286,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
sched_avg_update(rq);
}
#else
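
The arch_scale_freq_capacity() hook above makes rt_avg frequency-invariant: the default returns SCHED_CAPACITY_SCALE (1024), and an arch running the CPU at half clock can return 512 so that RT time observed there counts half. A toy accumulation showing the arithmetic (illustrative figures only):

#include <assert.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t rt_avg = 0;

	/* 8ms of RT execution at full frequency... */
	rt_avg += 8000000ULL * SCHED_CAPACITY_SCALE;
	/* ...plus 8ms at half frequency, which contributes half. */
	rt_avg += 8000000ULL * (SCHED_CAPACITY_SCALE / 2);

	/* 12ms of full-speed-equivalent RT time in total */
	assert((rt_avg >> SCHED_CAPACITY_SHIFT) == 12000000ULL);
	return 0;
}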
@@ -1298,6 +1408,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
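Both lock helpers above follow one pattern: read task_rq(p) optimistically, take that rq's lock, then re-check that the task still belongs to the rq and is not mid-migration, spinning outside the lock otherwise. The skeleton of the retry loop, reduced to a runnable toy (single-threaded stand-ins for the spinlock and the migration flag):

#include <assert.h>

struct rq_m { int locked; };
struct task_m { struct rq_m *rq; int migrating; };

/* Shape of __task_rq_lock(): the rq read before the acquire may be
 * stale, so it is re-validated once the lock is actually held. */
static struct rq_m *model_task_rq_lock(struct task_m *p)
{
	for (;;) {
		struct rq_m *rq = p->rq;	/* racy read */
		rq->locked = 1;			/* raw_spin_lock(&rq->lock) */
		if (rq == p->rq && !p->migrating)
			return rq;		/* still ours: now stable */
		rq->locked = 0;			/* raw_spin_unlock() */
		/* real code: cpu_relax() while task_on_rq_migrating(p) */
	}
}

int main(void)
{
	struct rq_m rq = { 0 };
	struct task_m p = { &rq, 0 };

	assert(model_task_rq_lock(&p) == &rq && rq.locked);
	return 0;
}
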
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1482,10 +1668,11 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
- char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
- if (mask_str == NULL)
- return -ENOMEM;
if (v == (void *)1) {
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
}
- kfree(mask_str);
return 0;
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (!stop || !stop->on_rq)
+ if (!stop || !task_on_rq_queued(stop))
return NULL;
put_prev_task(rq, prev);
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
return 0;
}
+static void update_curr_stop(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/kthread.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
}
EXPORT_SYMBOL(autoremove_wake_function);
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply a write barrier and the users expect write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
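
The big comment above wait_woken() doubles as its usage contract; spelled out the way a caller would write it, it looks like the fragment below (a hypothetical wait site: wq, condition and the 100ms budget are stand-ins, not names from this patch):

	/* sleeping side */
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	long timeout = msecs_to_jiffies(100);

	add_wait_queue(&wq, &wait);
	while (!condition && timeout)
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
	remove_wait_queue(&wq, &wait);

	/* waking side: publish the condition, then wake */
	condition = true;
	wake_up(&wq);

Note that the call site needs no set_current_state()/barrier dance of its own: wait_woken() performs steps A and B itself, and WQ_FLAG_WOKEN closes the classic missed-wakeup window.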
+
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
@@ -319,14 +385,14 @@ EXPORT_SYMBOL(wake_bit_function);
*/
int __sched
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
int ret = 0;
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(q->key.flags);
+ ret = (*action)(&q->key);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
@@ -334,7 +400,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit);
int __sched out_of_line_wait_on_bit(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -343,9 +409,21 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit);
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ wait.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
do {
int ret;
@@ -353,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags))
continue;
- ret = action(q->key.flags);
+ ret = action(&q->key);
if (!ret)
continue;
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +443,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit_lock);
int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +580,45 @@ void wake_up_atomic_t(atomic_t *p)
__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
}
EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ io_schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
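
These bit_wait*() helpers are the stock actions for the wait_on_bit family; the timeout flavours pair with out_of_line_wait_on_bit_timeout() above, which plants the deadline in wait.key.timeout before sleeping. A sketch of a caller (the flags word and bit number are hypothetical):

	unsigned long word = 0;		/* some flags word with a wait bit */
	int err;

	/* wait up to five seconds for bit 0 of "word" to clear */
	err = out_of_line_wait_on_bit_timeout(&word, 0, bit_wait_timeout,
					      TASK_INTERRUPTIBLE, 5 * HZ);
	if (err == -EAGAIN)
		;	/* bit still set when the deadline passed */
	else if (err)
		;	/* a signal arrived (bit_wait_timeout returned 1) */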
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,18 @@
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
#ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
@@ -54,7 +57,7 @@
struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
- struct sk_filter *prog;
+ struct bpf_prog *prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -87,7 +90,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
* @filter: filter to verify
* @flen: length of filter
*
- * Takes a previously checked filter (by sk_chk_filter) and
+ * Takes a previously checked filter (by bpf_check_classic) and
* redirects all filter code that loads struct sk_buff data
* and related data through seccomp_bpf_load. It also
* enforces length and alignment checking of those loads.
@@ -170,53 +173,189 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
{
- struct seccomp_filter *f;
- struct seccomp_data sd;
+ struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
+ struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
- if (WARN_ON(current->seccomp.filter == NULL))
+ if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
- populate_seccomp_data(&sd);
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ smp_read_barrier_depends();
+
+ if (!sd) {
+ populate_seccomp_data(&sd_local);
+ sd = &sd_local;
+ }
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
- for (f = current->seccomp.filter; f; f = f->prev) {
- u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
+ for (; f; f = f->prev) {
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
return ret;
}
+#endif /* CONFIG_SECCOMP_FILTER */
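
The reduction loop in seccomp_run_filters() leans on the numeric layout of the return codes: the verdict lives in the high bits and harsher actions have smaller values (KILL 0x00000000 < TRAP 0x00030000 < ERRNO 0x00050000 < TRACE 0x7ff00000 < ALLOW 0x7fff0000 in the uapi header), so the strictest filter in the chain always wins. A userspace re-run of the same fold:

#include <assert.h>
#include <stdint.h>

#define SECCOMP_RET_ERRNO	0x00050000U
#define SECCOMP_RET_ALLOW	0x7fff0000U
#define SECCOMP_RET_ACTION	0x7fff0000U
#define SECCOMP_RET_DATA	0x0000ffffU

int main(void)
{
	uint32_t ret = SECCOMP_RET_ALLOW;
	uint32_t filters[] = { SECCOMP_RET_ALLOW,
			       SECCOMP_RET_ERRNO | 13 /* EACCES */ };

	/* same reduction as seccomp_run_filters(): lowest action wins */
	for (int i = 0; i < 2; i++)
		if ((filters[i] & SECCOMP_RET_ACTION) <
		    (ret & SECCOMP_RET_ACTION))
			ret = filters[i];

	assert((ret & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO);
	assert((ret & SECCOMP_RET_DATA) == 13);
	return 0;
}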
+
+static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
+{
+ assert_spin_locked(&current->sighand->siglock);
+
+ if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
+ return false;
+
+ return true;
+}
+
+static inline void seccomp_assign_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ assert_spin_locked(&task->sighand->siglock);
+
+ task->seccomp.mode = seccomp_mode;
+ /*
+ * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * filter) is set.
+ */
+ smp_mb__before_atomic();
+ set_tsk_thread_flag(task, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the parent is an ancestor of the child. */
+static int is_ancestor(struct seccomp_filter *parent,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (parent == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == parent)
+ return 1;
+ return 0;
+}
/**
- * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects sighand and cred_guard_mutex locks to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or did not have an ancestral
+ * seccomp filter.
+ */
+static inline pid_t seccomp_can_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate all threads being eligible for synchronization. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ pid_t failed;
+
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)))
+ continue;
+
+ /* Return the first thread that cannot be synchronized. */
+ failed = task_pid_vnr(thread);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (unlikely(WARN_ON(failed == 0)))
+ failed = -ESRCH;
+ return failed;
+ }
+
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects sighand and cred_guard_mutex locks to be held, and for
+ * seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static inline void seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Synchronize all threads. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ /* Skip current, since it needs no changes. */
+ if (thread == caller)
+ continue;
+
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ smp_store_release(&thread->seccomp.filter,
+ caller->seccomp.filter);
+ /*
+ * Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ }
+ }
+}
+
+/**
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
*
- * Returns 0 on success or an errno on failure.
+ * Returns filter on success or an ERR_PTR on failure.
*/
-static long seccomp_attach_filter(struct sock_fprog *fprog)
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *filter;
- unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
- unsigned long total_insns = fprog->len;
+ unsigned long fp_size;
struct sock_filter *fp;
int new_len;
long ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
- return -EINVAL;
-
- for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
- if (total_insns > MAX_INSNS_PER_PATH)
- return -ENOMEM;
+ return ERR_PTR(-EINVAL);
+ BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
+ fp_size = fprog->len * sizeof(struct sock_filter);
/*
* Installing a seccomp filter requires that the task has
@@ -224,14 +363,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
* This avoids scenarios where unprivileged tasks can affect the
* behavior of privileged children.
*/
- if (!current->no_new_privs &&
+ if (!task_no_new_privs(current) &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
- return -EACCES;
+ return ERR_PTR(-EACCES);
fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
if (!fp)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/* Copy the instructions from fprog. */
ret = -EFAULT;
@@ -239,7 +378,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
goto free_prog;
/* Check and rewrite the fprog via the skb checker */
- ret = sk_chk_filter(fp, fprog->len);
+ ret = bpf_check_classic(fp, fprog->len);
if (ret)
goto free_prog;
@@ -248,8 +387,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (ret)
goto free_prog;
- /* Convert 'sock_filter' insns to 'sock_filter_int' insns */
- ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
+ /* Convert 'sock_filter' insns to 'bpf_insn' insns */
+ ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
if (ret)
goto free_prog;
@@ -260,48 +399,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (!filter)
goto free_prog;
- filter->prog = kzalloc(sk_filter_size(new_len),
- GFP_KERNEL|__GFP_NOWARN);
+ filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
if (!filter->prog)
goto free_filter;
- ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
if (ret)
goto free_filter_prog;
- kfree(fp);
+ kfree(fp);
atomic_set(&filter->usage, 1);
filter->prog->len = new_len;
- sk_filter_select_runtime(filter->prog);
+ bpf_prog_select_runtime(filter->prog);
- /*
- * If there is an existing filter, make it the prev and don't drop its
- * task reference.
- */
- filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
- return 0;
+ return filter;
free_filter_prog:
- kfree(filter->prog);
+ __bpf_prog_free(filter->prog);
free_filter:
kfree(filter);
free_prog:
kfree(fp);
- return ret;
+ return ERR_PTR(ret);
}
/**
- * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
* @user_filter: pointer to the user data containing a sock_fprog.
*
 * Returns the prepared filter on success or an ERR_PTR on failure.
*/
-static long seccomp_attach_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
{
struct sock_fprog fprog;
- long ret = -EFAULT;
+ struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
if (is_compat_task()) {
@@ -314,9 +447,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)
#endif
if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
goto out;
- ret = seccomp_attach_filter(&fprog);
+ filter = seccomp_prepare_filter(&fprog);
out:
- return ret;
+ return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @flags: flags to change filter behavior
+ * @filter: seccomp filter to add to the current process
+ *
+ * Caller must be holding current->sighand->siglock lock.
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
+{
+ unsigned long total_insns;
+ struct seccomp_filter *walker;
+
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate resulting filter length. */
+ total_insns = filter->prog->len;
+ for (walker = current->seccomp.filter; walker; walker = walker->prev)
+ total_insns += walker->prog->len + 4; /* 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /* If thread sync has been requested, check that it is possible. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ int ret;
+
+ ret = seccomp_can_sync_threads();
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+
+ /* Now that the new filter is in place, synchronize to all threads. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ seccomp_sync_threads();
+
+ return 0;
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -329,6 +509,14 @@ void get_seccomp_filter(struct task_struct *tsk)
atomic_inc(&orig->usage);
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_free(filter->prog);
+ kfree(filter);
+ }
+}
+
/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
@@ -337,8 +525,7 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
- sk_filter_free(freeme->prog);
- kfree(freeme);
+ seccomp_filter_free(freeme);
}
}
@@ -380,92 +567,185 @@ static int mode1_syscalls_32[] = {
};
#endif
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
+{
+ int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ syscall_whitelist = mode1_syscalls_32;
+#endif
+ do {
+ if (*syscall_whitelist == this_syscall)
+ return;
+ } while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+ dump_stack();
+#endif
+ audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ do_exit(SIGKILL);
+}
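
The mode1_syscalls arrays walked above are tiny: strict mode historically admits only read, write, exit and sigreturn, and anything else lands in the do_exit(SIGKILL) path. A userspace illustration of the effect (hedged: the exact syscall glibc emits for open() varies by arch and libc version, but none of the candidates are on the whitelist):

#include <fcntl.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <linux/seccomp.h>

int main(void)
{
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
	write(1, "ok\n", 3);		/* whitelisted: succeeds */
	open("/etc/passwd", O_RDONLY);	/* not whitelisted: SIGKILL here */
	return 0;			/* never reached */
}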
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- int exit_sig = 0;
- int *syscall;
- u32 ret;
+
+ if (mode == 0)
+ return;
+ else if (mode == SECCOMP_MODE_STRICT)
+ __secure_computing_strict(this_syscall);
+ else
+ BUG();
+}
+#else
+int __secure_computing(void)
+{
+ u32 phase1_result = seccomp_phase1(NULL);
+
+ if (likely(phase1_result == SECCOMP_PHASE1_OK))
+ return 0;
+ else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+ return -1;
+ else
+ return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+ u32 filter_ret, action;
+ int data;
+
+ /*
+ * Make sure that any changes to mode from another thread have
+ * been seen after TIF_SECCOMP was seen.
+ */
+ rmb();
+
+ filter_ret = seccomp_run_filters(sd);
+ data = filter_ret & SECCOMP_RET_DATA;
+ action = filter_ret & SECCOMP_RET_ACTION;
+
+ switch (action) {
+ case SECCOMP_RET_ERRNO:
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+
+ case SECCOMP_RET_TRACE:
+ return filter_ret; /* Save the rest for phase 2. */
+
+ case SECCOMP_RET_ALLOW:
+ return SECCOMP_PHASE1_OK;
+
+ case SECCOMP_RET_KILL:
+ default:
+ audit_seccomp(this_syscall, SIGSYS, action);
+ do_exit(SIGSYS);
+ }
+
+ unreachable();
+
+skip:
+ audit_seccomp(this_syscall, 0, action);
+ return SECCOMP_PHASE1_SKIP;
+}
+#endif
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers. The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked. In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+ int mode = current->seccomp.mode;
+ int this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
switch (mode) {
case SECCOMP_MODE_STRICT:
- syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
- if (is_compat_task())
- syscall = mode1_syscalls_32;
-#endif
- do {
- if (*syscall == this_syscall)
- return 0;
- } while (*++syscall);
- exit_sig = SIGKILL;
- ret = SECCOMP_RET_KILL;
- break;
+ __secure_computing_strict(this_syscall); /* may call do_exit */
+ return SECCOMP_PHASE1_OK;
#ifdef CONFIG_SECCOMP_FILTER
- case SECCOMP_MODE_FILTER: {
- int data;
- struct pt_regs *regs = task_pt_regs(current);
- ret = seccomp_run_filters(this_syscall);
- data = ret & SECCOMP_RET_DATA;
- ret &= SECCOMP_RET_ACTION;
- switch (ret) {
- case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
- syscall_set_return_value(current, regs,
- -data, 0);
- goto skip;
- case SECCOMP_RET_TRAP:
- /* Show the handler the original registers. */
- syscall_rollback(current, regs);
- /* Let the filter pass back 16 bits of data. */
- seccomp_send_sigsys(this_syscall, data);
- goto skip;
- case SECCOMP_RET_TRACE:
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- goto skip;
- }
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- break;
- if (syscall_get_nr(current, regs) < 0)
- goto skip; /* Explicit request to skip. */
-
- return 0;
- case SECCOMP_RET_ALLOW:
- return 0;
- case SECCOMP_RET_KILL:
- default:
- break;
- }
- exit_sig = SIGSYS;
- break;
- }
+ case SECCOMP_MODE_FILTER:
+ return __seccomp_phase1_filter(this_syscall, sd);
#endif
default:
BUG();
}
+}
-#ifdef SECCOMP_DEBUG
- dump_stack();
-#endif
- audit_seccomp(this_syscall, exit_sig, ret);
- do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
- audit_seccomp(this_syscall, exit_sig, ret);
-#endif
- return -1;
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ u32 action = phase1_result & SECCOMP_RET_ACTION;
+ int data = phase1_result & SECCOMP_RET_DATA;
+
+ BUG_ON(action != SECCOMP_RET_TRACE);
+
+ audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
+ return -1;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ do_exit(SIGSYS);
+ if (syscall_get_nr(current, regs) < 0)
+ return -1; /* Explicit request to skip. */
+
+ return 0;
}
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
long prctl_get_seccomp(void)
{
@@ -473,47 +753,152 @@ long prctl_get_seccomp(void)
}
/**
- * prctl_set_seccomp: configures current->seccomp.mode
- * @seccomp_mode: requested mode to use
- * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
*
- * This function may be called repeatedly with a @seccomp_mode of
- * SECCOMP_MODE_FILTER to install additional filters. Every filter
- * successfully installed will be evaluated (in reverse order) for each system
- * call the task makes.
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_strict(void)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
+ long ret = -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ seccomp_assign_mode(current, seccomp_mode);
+ ret = 0;
+
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return ret;
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags: flags to change filter behavior
+ * @filter: struct sock_fprog containing filter
+ *
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
*
* Once current->seccomp.mode is non-zero, it may not be changed.
*
* Returns 0 on success or -EINVAL on failure.
*/
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+ struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
- if (current->seccomp.mode &&
- current->seccomp.mode != seccomp_mode)
+ /* Validate flags. */
+ if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+ return -EINVAL;
+
+ /* Prepare the new filter before holding any locks. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+
+ /*
+ * Make sure we cannot change seccomp or nnp state via TSYNC
+ * while another thread is in the middle of calling exec.
+ */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+ mutex_lock_killable(&current->signal->cred_guard_mutex))
+ goto out_free;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+ ret = seccomp_attach_filter(flags, prepared);
+ if (ret)
goto out;
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
+
+ seccomp_assign_mode(current, seccomp_mode);
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
+ seccomp_filter_free(prepared);
+ return ret;
+}
+#else
+static inline long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ return -EINVAL;
+}
+#endif
+
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs)
+{
+ switch (op) {
+ case SECCOMP_SET_MODE_STRICT:
+ if (flags != 0 || uargs != NULL)
+ return -EINVAL;
+ return seccomp_set_mode_strict();
+ case SECCOMP_SET_MODE_FILTER:
+ return seccomp_set_mode_filter(flags, uargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+ const char __user *, uargs)
+{
+ return do_seccomp(op, flags, uargs);
+}
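
This syscall entry point is what makes SECCOMP_FILTER_FLAG_TSYNC reachable: the legacy prctl path below always passes flags == 0. A minimal caller, assuming uapi headers that carry this series and a libc without a seccomp() wrapper (hence raw syscall(2)); the always-allow filter is for illustration only:

#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = { .len = 1, .filter = insns };

	/* required unless the caller holds CAP_SYS_ADMIN (see above) */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;

	/* attach to every thread; on a TSYNC conflict the return value
	 * is the tid of the thread that could not be synchronized */
	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		    SECCOMP_FILTER_FLAG_TSYNC, &prog))
		return 1;
	return 0;
}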
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+ unsigned int op;
+ char __user *uargs;
switch (seccomp_mode) {
case SECCOMP_MODE_STRICT:
- ret = 0;
-#ifdef TIF_NOTSC
- disable_TSC();
-#endif
+ op = SECCOMP_SET_MODE_STRICT;
+ /*
+		 * Setting strict mode through prctl has always ignored the
+		 * filter argument, so force it to NULL here to satisfy the
+		 * internal check in do_seccomp().
+ */
+ uargs = NULL;
break;
-#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER:
- ret = seccomp_attach_user_filter(filter);
- if (ret)
- goto out;
+ op = SECCOMP_SET_MODE_FILTER;
+ uargs = filter;
break;
-#endif
default:
- goto out;
+ return -EINVAL;
}
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
-out:
- return ret;
+ /* prctl interface doesn't have flags, so they are always zero. */
+ return do_seccomp(op, 0, uargs);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index a4077e90f19f..d51c5ddd855c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
struct sighand_struct *sighand;
for (;;) {
+ /*
+ * Disable interrupts early to avoid deadlocks.
+ * See rcu_read_unlock() comment header for details.
+ */
local_irq_save(*flags);
rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
@@ -1271,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
local_irq_restore(*flags);
break;
}
-
+ /*
+		 * This sighand may already have been freed and even reused, but
+ * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * initializes ->siglock: this slab can't go away, it has
+ * the same object type, ->siglock can't be reinitialized.
+ *
+ * We need to ensure that tsk->sighand is still the same
+ * after we take the lock, we can race with de_thread() or
+ * __exit_signal(). In the latter case the next iteration
+ * must see ->sighand == NULL.
+ */
spin_lock(&sighand->siglock);
if (likely(sighand == tsk->sighand)) {
rcu_read_unlock();
@@ -1327,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
int error = -ESRCH;
struct task_struct *p;
- rcu_read_lock();
-retry:
- p = pid_task(pid, PIDTYPE_PID);
- if (p) {
- error = group_send_sig_info(sig, info, p);
- if (unlikely(error == -ESRCH))
- /*
- * The task was unhashed in between, try again.
- * If it is dead, pid_task() will return NULL,
- * if we race with de_thread() it will find the
- * new leader.
- */
- goto retry;
- }
- rcu_read_unlock();
+ for (;;) {
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+ rcu_read_unlock();
+ if (likely(!p || error != -ESRCH))
+ return error;
- return error;
+ /*
+ * The task was unhashed in between, try again. If it
+ * is dead, pid_task() will return NULL, if we race with
+ * de_thread() it will find the new leader.
+ */
+ }
}
int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
@@ -2166,8 +2178,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
- struct pt_regs *regs, void *cookie)
+int get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2237,13 +2248,13 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, info);
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info);
+ signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
}
@@ -2251,13 +2262,13 @@ relock:
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
- trace_signal_deliver(signr, info, ka);
+ trace_signal_deliver(signr, &ksig->info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
- *return_ka = *ka;
+ ksig->ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
@@ -2307,7 +2318,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(info->si_signo))) {
+ if (likely(do_signal_stop(ksig->info.si_signo))) {
/* It released the siglock. */
goto relock;
}
@@ -2328,7 +2339,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(info->si_signo);
+ print_fatal_signal(ksig->info.si_signo);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2338,34 +2349,32 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info);
+ do_coredump(&ksig->info);
}
/*
* Death signals, no core dump.
*/
- do_group_exit(info->si_signo);
+ do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
- return signr;
+
+ ksig->sig = signr;
+ return ksig->sig > 0;
}
/**
* signal_delivered -
- * @sig: number of signal being delivered
- * @info: siginfo_t of signal being delivered
- * @ka: sigaction setting that chose the handler
- * @regs: user register state
+ * @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
- * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
* is always blocked, and the signal itself is blocked unless %SA_NODEFER
- * is set in @ka->sa.sa_flags. Tracing is notified.
+ * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
-void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs, int stepping)
+static void signal_delivered(struct ksignal *ksig, int stepping)
{
sigset_t blocked;
@@ -2375,11 +2384,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
simply clear the restore sigmask flag. */
clear_restore_sigmask();
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
+ sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
+ if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
- tracehook_signal_handler(sig, info, ka, regs, stepping);
+ tracehook_signal_handler(stepping);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2387,8 +2396,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
if (failed)
force_sigsegv(ksig->sig, current);
else
- signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
- signal_pt_regs(), stepping);
+ signal_delivered(ksig, stepping);
}
/*
@@ -2493,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
*/
SYSCALL_DEFINE0(restart_syscall)
{
- struct restart_block *restart = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
return restart->fn(restart);
}
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
+#ifdef SEGV_BNDERR
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
@@ -2980,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -3029,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
@@ -3538,7 +3546,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
return -ERESTARTNOHAND;
@@ -3551,7 +3559,7 @@ int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
set_restore_sigmask();
return -ERESTARTNOHAND;
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..07854477c164 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*/
+#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
@@ -12,12 +13,13 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <linux/sched.h>
#include "smpboot.h"
enum {
CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_WAIT = 0x02,
+ CSD_FLAG_SYNCHRONOUS = 0x02,
};
struct call_function_data {
@@ -105,7 +107,7 @@ void __init call_function_init(void)
*/
static void csd_lock_wait(struct call_single_data *csd)
{
- while (csd->flags & CSD_FLAG_LOCK)
+ while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
cpu_relax();
}
@@ -119,19 +121,17 @@ static void csd_lock(struct call_single_data *csd)
* to ->flags with any subsequent assignments to other
* fields of the specified call_single_data structure:
*/
- smp_mb();
+ smp_wmb();
}
static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
- smp_mb();
-
- csd->flags &= ~CSD_FLAG_LOCK;
+ smp_store_release(&csd->flags, 0);
}
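/*
 * A hedged sketch (generic structure and helper names, not the csd
 * code itself) of the acquire/release pairing used above: the
 * release store publishes every prior write to the structure before
 * the flag clears, and the acquire load guarantees that a waiter
 * observing the flag clear also observes those writes.
 */
struct locked_slot {
	unsigned int flags;		/* assumed to carry CSD_FLAG_LOCK */
};

static void slot_wait_sketch(struct locked_slot *s)
{
	while (smp_load_acquire(&s->flags) & CSD_FLAG_LOCK)
		cpu_relax();
}

static void slot_unlock_sketch(struct locked_slot *s)
{
	/* all prior writes to *s happen-before the flag is seen clear */
	smp_store_release(&s->flags, 0);
}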
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
@@ -142,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
* ->func, ->info, and ->flags set.
*/
static int generic_exec_single(int cpu, struct call_single_data *csd,
- smp_call_func_t func, void *info, int wait)
+ smp_call_func_t func, void *info)
{
- struct call_single_data csd_stack = { .flags = 0 };
- unsigned long flags;
-
-
if (cpu == smp_processor_id()) {
+ unsigned long flags;
+
+ /*
+ * We can unlock early even for the synchronous on-stack case,
+ * since we're doing this from the same CPU.
+ */
+ csd_unlock(csd);
local_irq_save(flags);
func(info);
local_irq_restore(flags);
@@ -156,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
}
- if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu))
+ if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ csd_unlock(csd);
return -ENXIO;
-
-
- if (!csd) {
- csd = &csd_stack;
- if (!wait)
- csd = &__get_cpu_var(csd_data);
}
- csd_lock(csd);
-
csd->func = func;
csd->info = info;
- if (wait)
- csd->flags |= CSD_FLAG_WAIT;
-
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
@@ -188,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
arch_send_call_function_single_ipi(cpu);
- if (wait)
- csd_lock_wait(csd);
-
return 0;
}
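/*
 * A hedged aside on the csd ownership rule implied above: an
 * on-stack csd is only safe for synchronous calls, since the
 * caller's stack frame must outlive the remote CPU's csd_unlock();
 * asynchronous callers therefore use the per-cpu csd_data slot
 * (or their own long-lived csd) instead.
 */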
@@ -228,7 +218,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
WARN_ON(!irqs_disabled());
- head = &__get_cpu_var(call_single_queue);
+ head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
@@ -248,9 +238,26 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
}
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
- csd->func(csd->info);
- csd_unlock(csd);
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ /*
+ * Do we wait until *after* the callback? Synchronous callers
+ * spin on CSD_FLAG_LOCK and may have the csd on their stack,
+ * so unlock only after the callback has run; async csds belong
+ * to the caller and may be reused as soon as they are
+ * unlocked, so unlock before calling func().
+ */
+ if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ func(info);
+ csd_unlock(csd);
+ } else {
+ csd_unlock(csd);
+ func(info);
+ }
}
+
+ /*
+ * Handle irq works queued remotely by irq_work_queue_on().
+ * Smp functions above are typically synchronous so they
+ * better run first since some other CPUs may be busy waiting
+ * for them.
+ */
+ irq_work_run();
}
/*
@@ -264,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
+ struct call_single_data *csd;
+ struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
int this_cpu;
int err;
@@ -282,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
- err = generic_exec_single(cpu, NULL, func, info, wait);
+ csd = &csd_stack;
+ if (!wait) {
+ csd = this_cpu_ptr(&csd_data);
+ csd_lock(csd);
+ }
+
+ err = generic_exec_single(cpu, csd, func, info);
+
+ if (wait)
+ csd_lock_wait(csd);
put_cpu();
@@ -311,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd)
int err = 0;
preempt_disable();
- err = generic_exec_single(cpu, csd, csd->func, csd->info, 0);
+
+ /* We could deadlock if we have to wait here with interrupts disabled! */
+ if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
+ csd_lock_wait(csd);
+
+ csd->flags = CSD_FLAG_LOCK;
+ smp_wmb();
+
+ err = generic_exec_single(cpu, csd, csd->func, csd->info);
preempt_enable();
return err;
@@ -410,7 +436,7 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- cfd = &__get_cpu_var(cfd_data);
+ cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -423,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask,
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock(csd);
+ if (wait)
+ csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
@@ -661,7 +689,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
- WARN_ON_ONCE(!ret);
+ WARN_ON_ONCE(ret);
}
preempt_enable();
}
@@ -690,3 +718,24 @@ void kick_all_cpus_sync(void)
smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
+
+/**
+ * wake_up_all_idle_cpus - break all cpus out of idle
+ *
+ * wake_up_all_idle_cpus() tries to break every CPU out of idle,
+ * including CPUs that are idle-polling; non-idle CPUs are left
+ * alone.
+ */
+void wake_up_all_idle_cpus(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+
+ wake_up_if_idle(cpu);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
@@ -110,7 +111,7 @@ static int smpboot_thread_fn(void *data)
set_current_state(TASK_INTERRUPTIBLE);
preempt_disable();
if (kthread_should_stop()) {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->cleanup)
ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +137,27 @@ static int smpboot_thread_fn(void *data)
/* Check for state change setup */
switch (td->status) {
case HP_THREAD_NONE:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->setup)
ht->setup(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
+
case HP_THREAD_PARKED:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->unpark)
ht->unpark(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
}
if (!ht->thread_should_run(td->cpu)) {
- preempt_enable();
+ preempt_enable_no_resched();
schedule();
} else {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
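/*
 * A hedged sketch (hypothetical helper) of the sleep/wake idiom
 * smpboot_thread_fn() follows above: mark the task INTERRUPTIBLE
 * before testing the condition, so a wakeup racing with the test
 * merely turns schedule() into a no-op instead of being lost.
 */
static void wait_for_work_sketch(bool (*should_run)(void))
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (should_run()) {
			__set_current_state(TASK_RUNNING);
			return;
		}
		schedule();
	}
}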
@@ -279,6 +281,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
+ get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -291,6 +294,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
@@ -311,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
put_online_cpus();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll the specified CPU's state, for example, when
+ * waiting for a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+ return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success. Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying. In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+ }
+
+ switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+ case CPU_POST_DEAD:
+
+ /* The CPU died properly, so just start it up again. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+
+ case CPU_DEAD_FROZEN:
+
+ /*
+ * Timeout during CPU death, so let caller know.
+ * The outgoing CPU completed its processing, but after
+ * cpu_wait_death() timed out and reported the error. The
+ * caller is free to proceed, in which case the state
+ * will be reset properly by cpu_set_state_online().
+ * Proceeding despite this -EBUSY return makes sense
+ * for systems where the outgoing CPUs take themselves
+ * offline, with no post-death manipulation required from
+ * a surviving CPU.
+ */
+ return -EBUSY;
+
+ case CPU_BROKEN:
+
+ /*
+ * The most likely reason we got here is that there was
+ * a timeout during CPU death, and the outgoing CPU never
+ * did complete its processing. This could happen on
+ * a virtualized system if the outgoing VCPU gets preempted
+ * for more than five seconds, and the user attempts to
+ * immediately online that same CPU. Trying again later
+ * might return -EBUSY above, hence -EAGAIN.
+ */
+ return -EAGAIN;
+
+ default:
+
+ /* Should not happen. Famous last words. */
+ return -EIO;
+ }
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+ (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+ int jf_left = seconds * HZ;
+ int oldstate;
+ bool ret = true;
+ int sleep_jf = 1;
+
+ might_sleep();
+
+ /* The outgoing CPU will normally get done quite quickly. */
+ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+ goto update_state;
+ udelay(5);
+
+ /* But if the outgoing CPU dawdles, wait increasingly long times. */
+ while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+ schedule_timeout_uninterruptible(sleep_jf);
+ jf_left -= sleep_jf;
+ if (jf_left <= 0)
+ break;
+ sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+ }
+update_state:
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate == CPU_DEAD) {
+ /* Outgoing CPU died normally, update state. */
+ smp_mb(); /* atomic_read() before update. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+ } else {
+ /* Outgoing CPU still hasn't died, set state accordingly. */
+ if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, CPU_BROKEN) != oldstate)
+ goto update_state;
+ ret = false;
+ }
+ return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death. Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out. This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+ int oldstate;
+ int newstate;
+ int cpu = smp_processor_id();
+
+ do {
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate != CPU_BROKEN)
+ newstate = CPU_DEAD;
+ else
+ newstate = CPU_DEAD_FROZEN;
+ } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, newstate) != oldstate);
+ return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
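/*
 * A hedged sketch (hypothetical helper name) of the lock-free state
 * transition cpu_report_death() performs above: read the current
 * state, compute its successor, and retry the cmpxchg until no
 * other CPU has changed the state underneath us.
 */
static int advance_hotplug_state_sketch(atomic_t *st)
{
	int oldstate, newstate;

	do {
		oldstate = atomic_read(st);
		newstate = (oldstate == CPU_BROKEN) ? CPU_DEAD_FROZEN : CPU_DEAD;
	} while (atomic_cmpxchg(st, oldstate, newstate) != oldstate);

	return newstate;
}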
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
- if (preempt_count() == cnt)
+ if (preempt_count() == cnt) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+#endif
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
}
EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -278,7 +282,7 @@ restart:
pending >>= softirq_bit;
}
- rcu_bh_qs(smp_processor_id());
+ rcu_bh_qs();
local_irq_disable();
pending = local_softirq_pending();
@@ -485,7 +489,7 @@ static void tasklet_action(struct softirq_action *a)
local_irq_disable();
list = __this_cpu_read(tasklet_vec.head);
__this_cpu_write(tasklet_vec.head, NULL);
- __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
local_irq_enable();
while (list) {
@@ -521,7 +525,7 @@ static void tasklet_hi_action(struct softirq_action *a)
local_irq_disable();
list = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, NULL);
- __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
+ __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
local_irq_enable();
while (list) {
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch(cpu);
local_irq_enable();
- cond_resched();
+ cond_resched_rcu_qs();
return;
}
local_irq_enable();
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
}
EXPORT_SYMBOL_GPL(print_stack_trace);
+int snprint_stack_trace(char *buf, size_t size,
+ struct stack_trace *trace, int spaces)
+{
+ int i;
+ unsigned long ip;
+ int generated;
+ int total = 0;
+
+ if (WARN_ON(!trace->entries))
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ ip = trace->entries[i];
+ generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
+ 1 + spaces, ' ', (void *) ip, (void *) ip);
+
+ total += generated;
+
+ /* Assume that generated isn't a negative number */
+ if (generated >= size) {
+ buf += size;
+ size = 0;
+ } else {
+ buf += generated;
+ size -= generated;
+ }
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(snprint_stack_trace);
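/*
 * A hedged usage sketch for snprint_stack_trace() above: like
 * snprintf(), the accumulated return value is the length that
 * would have been generated, so a caller can detect truncation
 * (helper name and buffer size here are illustrative).
 */
static void report_trace_sketch(struct stack_trace *trace)
{
	char buf[256];
	int need;

	need = snprint_stack_trace(buf, sizeof(buf), trace, 1);
	if (need >= sizeof(buf))
		pr_warn("stack trace truncated, %d bytes needed\n", need);
	else
		pr_info("%s", buf);
}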
+
/*
* Architectures that do not implement save_stack_trace_tsk or
* save_stack_trace_regs get this weak alias and a once-per-bootup warning
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..a4e372b798a5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
#include <asm/unistd.h>
#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a,b) (-EINVAL)
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a,b) (-EINVAL)
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a,b) (-EINVAL)
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a,b) (-EINVAL)
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b) (-EINVAL)
+# define GET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b) (-EINVAL)
+# define SET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a) (-EINVAL)
@@ -91,6 +91,18 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
+#ifndef MPX_ENABLE_MANAGEMENT
+# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef MPX_DISABLE_MANAGEMENT
+# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef GET_FP_MODE
+# define GET_FP_MODE(a) (-EINVAL)
+#endif
+#ifndef SET_FP_MODE
+# define SET_FP_MODE(a,b) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -182,39 +194,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p)
- error = set_one_prio(p, niceval, error);
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- error = set_one_prio(p, niceval, error);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
- error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* For find_user() */
- break;
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -244,47 +257,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
- goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- }
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* for find_user() */
- break;
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -306,11 +320,12 @@ out_unlock:
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
*/
+#ifdef CONFIG_MULTIUSER
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
struct user_namespace *ns = current_user_ns();
@@ -364,7 +379,7 @@ error:
}
/*
- * setgid() is implemented like SysV w/ SAVED_IDS
+ * setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
@@ -442,7 +457,7 @@ static int set_user(struct cred *new)
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
@@ -503,17 +518,17 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
- * setuid() is implemented like SysV with SAVED_IDS
- *
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
* Note that SAVED_ID's is deficient in that a setuid root program
- * like sendmail, for example, cannot set its uid to be a normal
+ * like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
- * regain them by swapping the real and effective uid.
+ * regain them by swapping the real and effective uid.
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
@@ -637,10 +652,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
euid = from_kuid_munged(cred->user_ns, cred->euid);
suid = from_kuid_munged(cred->user_ns, cred->suid);
- if (!(retval = put_user(ruid, ruidp)) &&
- !(retval = put_user(euid, euidp)))
- retval = put_user(suid, suidp);
-
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
return retval;
}
@@ -709,9 +726,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
egid = from_kgid_munged(cred->user_ns, cred->egid);
sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(rgid, rgidp)) &&
- !(retval = put_user(egid, egidp)))
- retval = put_user(sgid, sgidp);
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
return retval;
}
@@ -796,6 +816,7 @@ change_okay:
commit_creds(new);
return old_fsgid;
}
+#endif /* CONFIG_MULTIUSER */
/**
* sys_getpid - return the thread group id of the current process
@@ -862,11 +883,9 @@ void do_sys_times(struct tms *tms)
{
cputime_t tgutime, tgstime, cutime, cstime;
- spin_lock_irq(&current->sighand->siglock);
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- spin_unlock_irq(&current->sighand->siglock);
tms->tms_utime = cputime_to_clock_t(tgutime);
tms->tms_stime = cputime_to_clock_t(tgstime);
tms->tms_cutime = cputime_to_clock_t(cutime);
@@ -1091,6 +1110,7 @@ DECLARE_RWSEM(uts_sem);
/*
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
+ * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
*/
static int override_release(char __user *release, size_t len)
{
@@ -1110,7 +1130,7 @@ static int override_release(char __user *release, size_t len)
break;
rest++;
}
- v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
+ v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
copy = clamp_t(size_t, len, 1, sizeof(buf));
copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
ret = copy_to_user(release, buf, copy + 1);
@@ -1284,7 +1304,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
-
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
struct rlimit __user *, rlim)
{
@@ -1299,7 +1318,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
x.rlim_cur = 0x7FFFFFFF;
if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
- return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
#endif
@@ -1527,7 +1546,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
cputime_t tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
- memset((char *) r, 0, sizeof *r);
+ memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
@@ -1541,41 +1560,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
return;
switch (who) {
- case RUSAGE_BOTH:
- case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
-
- if (who == RUSAGE_CHILDREN)
- break;
-
- case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
- accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
break;
- default:
- BUG();
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
}
unlock_task_sighand(p, &flags);
@@ -1585,6 +1604,7 @@ out:
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
+
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
@@ -1596,6 +1616,7 @@ out:
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
+
k_getrusage(p, who, &r);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
@@ -1631,6 +1652,7 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
+ struct file *old_exe, *exe_file;
struct inode *inode;
int err;
@@ -1654,20 +1676,25 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- down_write(&mm->mmap_sem);
-
/*
* Forbid mm->exe_file change if old file still mapped.
*/
+ exe_file = get_mm_exe_file(mm);
err = -EBUSY;
- if (mm->exe_file) {
+ if (exe_file) {
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- if (vma->vm_file &&
- path_equal(&vma->vm_file->f_path,
- &mm->exe_file->f_path))
- goto exit_unlock;
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &exe_file->f_path))
+ goto exit_err;
+ }
+
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
}
/*
@@ -1678,29 +1705,220 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit_unlock;
+ goto exit;
err = 0;
- set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
-exit_unlock:
- up_write(&mm->mmap_sem);
-
+ /* set the new file, lockless */
+ get_file(exe.file);
+ old_exe = xchg(&mm->exe_file, exe.file);
+ if (old_exe)
+ fput(old_exe);
exit:
fdput(exe);
return err;
+exit_err:
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ goto exit;
+}
+
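/*
 * A hedged sketch (generic names) of the lockless reference swap
 * used for mm->exe_file above: take a reference for the slot
 * first, atomically publish the new pointer, then drop the
 * reference the old pointer held.
 */
static void swap_file_ref_sketch(struct file **slot, struct file *new)
{
	struct file *old;

	get_file(new);		/* reference now owned by the slot */
	old = xchg(slot, new);	/* atomic publish */
	if (old)
		fput(old);	/* release the slot's old reference */
}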
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here so be very careful
+ * in what is allowed for modification from userspace.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members are not somewhere outside
+ * the allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Nor should we allow overriding the limits if they are set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the rights to
+ * change /proc/pid/exe link: only local root should
+ * be allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* The last entry must be AT_NULL, as the specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+ down_read(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate that these members point to real,
+ * present VMAs, because the application may already have
+ * unmapped the corresponding VMAs and the kernel mostly uses
+ * these members for statistics output in procfs, except:
+ *
+ * - @start_brk/@brk are used in do_brk, but the kernel looks
+ * up the VMAs when updating these members, so anything wrong
+ * written here makes the kernel swear at the userspace
+ * program but won't lead to any problem in the kernel itself.
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless, so if
+ * someone reads this member in procfs while we're
+ * updating it, they may see partially updated results.
+ * This is a known and acceptable trade-off: we leave it
+ * as is rather than introduce additional locks here and
+ * make the kernel more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- unsigned long rlim = rlimit(RLIMIT_DATA);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int error;
- if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
return -EINVAL;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -1733,9 +1951,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (mm->brk - addr) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
goto out;
mm->start_brk = addr;
@@ -1745,9 +1962,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (addr - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
mm->brk = addr;
@@ -1990,12 +2206,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
- current->no_new_privs = 1;
+ task_set_no_new_privs(current);
break;
case PR_GET_NO_NEW_PRIVS:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- return current->no_new_privs ? 1 : 0;
+ return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
@@ -2011,6 +2227,22 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
me->mm->def_flags &= ~VM_NOHUGEPAGE;
up_write(&me->mm->mmap_sem);
break;
+ case PR_MPX_ENABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_ENABLE_MANAGEMENT(me);
+ break;
+ case PR_MPX_DISABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_DISABLE_MANAGEMENT(me);
+ break;
+ case PR_SET_FP_MODE:
+ error = SET_FP_MODE(me, arg2);
+ break;
+ case PR_GET_FP_MODE:
+ error = GET_FP_MODE(me);
+ break;
default:
error = -EINVAL;
break;
@@ -2023,6 +2255,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
{
int err = 0;
int cpu = raw_smp_processor_id();
+
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
@@ -2135,7 +2368,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
/* Check to see if any memory value is too large for 32-bit and scale
* down if needed
*/
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
int bitcount = 0;
while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..7995ef5868d8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_kexec_file_load);
cond_syscall(sys_init_module);
cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
@@ -155,6 +156,23 @@ cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
cond_syscall(sys_uselib);
+cond_syscall(sys_fadvise64);
+cond_syscall(sys_fadvise64_64);
+cond_syscall(sys_madvise);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
@@ -165,6 +183,8 @@ cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
@@ -197,6 +217,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
@@ -213,3 +234,12 @@ cond_syscall(compat_sys_open_by_handle_at);
/* compare kernel pointers */
cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..2082b1a88fb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -19,6 +19,7 @@
*/
#include <linux/module.h>
+#include <linux/aio.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
@@ -92,11 +93,9 @@
#include <linux/nmi.h>
#endif
-
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
-extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
extern int core_uses_pid;
@@ -387,7 +386,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "numa_balancing",
@@ -622,6 +622,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#ifdef CONFIG_KEXEC
{
@@ -701,10 +708,10 @@ static struct ctl_table kern_table[] = {
#endif
{
.procname = "threads-max",
- .data = &max_threads,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_max_threads,
},
{
.procname = "random",
@@ -838,7 +845,7 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_user_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
},
@@ -847,11 +854,33 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog_thresh,
.extra1 = &zero,
.extra2 = &sixty,
},
{
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = &zero,
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+ .extra2 = &one,
+#else
+ .extra2 = &zero,
+#endif
+ },
+ {
+ .procname = "soft_watchdog",
+ .data = &soft_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -871,15 +900,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_SMP */
- {
- .procname = "nmi_watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dowatchdog,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
@@ -1055,15 +1075,6 @@ static struct ctl_table kern_table[] = {
.child = key_sysctls,
},
#endif
-#ifdef CONFIG_RCU_TORTURE_TEST
- {
- .procname = "rcutorture_runnable",
- .data = &rcutorture_runnable,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
/*
* User-space scripts rely on the existence of this file
@@ -1112,6 +1123,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
@@ -1220,6 +1240,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "nr_pdflush_threads",
.mode = 0444 /* read-only */,
.proc_handler = pdflush_proc_obsolete,
@@ -1240,8 +1268,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
},
#ifdef CONFIG_NUMA
{
@@ -1250,8 +1276,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
},
#endif
{
@@ -1274,8 +1298,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
},
#endif
{
@@ -1311,6 +1333,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
+ {
+ .procname = "compact_unevictable_allowed",
+ .data = &sysctl_compact_unevictable_allowed,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_COMPACTION */
{
@@ -1463,13 +1494,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
- {
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
- .mode = 0644,
- .proc_handler = scan_unevictable_handler,
- },
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
@@ -1957,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- *valp = *negp ? -*lvalp : *lvalp;
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
} else {
int val = *valp;
if (val < 0) {
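/*
 * A hedged sketch of the range check added above: a 32-bit int
 * spans [-2147483648, 2147483647], so a negative magnitude may be
 * as large as (unsigned long)INT_MAX + 1 while a positive value
 * must not exceed INT_MAX (helper name here is illustrative).
 */
static int ulong_to_int_sketch(bool neg, unsigned long lval, int *out)
{
	if (neg) {
		if (lval > (unsigned long)INT_MAX + 1)
			return -EINVAL;
		*out = -lval;	/* -(INT_MAX + 1) wraps to INT_MIN */
	} else {
		if (lval > (unsigned long)INT_MAX)
			return -EINVAL;
		*out = lval;
	}
	return 0;
}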
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd9e7ad..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
{}
};
@@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
- { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
{ CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
{ CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
{ CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
{ CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
{ CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
{ CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
{}
};
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
PTR_ERR(key));
} else {
+ set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
pr_notice("Loaded X.509 cert '%s'\n",
key_ref_to_ptr(key)->description);
key_ref_put(key);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7cd65db..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
void *reply = genlmsg_data(genlhdr);
- int rc;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return rc;
- }
+ genlmsg_end(skb, reply);
return genlmsg_reply(skb, info);
}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
void *reply = genlmsg_data(genlhdr);
int rc, delcount = 0;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return;
- }
+ genlmsg_end(skb, reply);
rc = 0;
down_read(&listeners->sem);
@@ -459,7 +450,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, f.file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
@@ -638,7 +629,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
fill_tgid_exit(tsk);
}
- listeners = __this_cpu_ptr(&listener_array);
+ listeners = raw_cpu_ptr(&listener_array);
if (list_empty(&listeners->list))
return;
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
* the GNU General Public License for more details.
*/
+#define pr_fmt(fmt) "Kprobe smoke test: " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler\n");
+ pr_err("incorrect value in post_handler\n");
}
posth_val = preh_val + div_factor;
}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
ret = register_kprobe(&kp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobe returned %d\n", ret);
+ pr_err("register_kprobe returned %d\n", ret);
return ret;
}
@@ -68,14 +68,12 @@ static int test_kprobe(void)
unregister_kprobe(&kp);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler2\n");
+ pr_err("incorrect value in post_handler2\n");
}
posth_val = preh_val + div_factor;
}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
kp.flags = 0;
ret = register_kprobes(kps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobes returned %d\n", ret);
+ pr_err("register_kprobes returned %d\n", ret);
return ret;
}
@@ -130,14 +126,12 @@ static int test_kprobes(void)
ret = target(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -146,14 +140,12 @@ static int test_kprobes(void)
ret = target2(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler2 not called\n");
+ pr_err("kprobe pre_handler2 not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler2 not called\n");
+ pr_err("kprobe post_handler2 not called\n");
handler_errors++;
}
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
{
if (value != rand1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in jprobe handler\n");
+ pr_err("incorrect value in jprobe handler\n");
}
jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
ret = register_jprobe(&jp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobe returned %d\n", ret);
+ pr_err("register_jprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_jprobe(&jp);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
@@ -217,24 +206,21 @@ static int test_jprobes(void)
jp.kp.flags = 0;
ret = register_jprobes(jps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobes returned %d\n", ret);
+ pr_err("register_jprobes returned %d\n", ret);
return ret;
}
jph_val = 0;
ret = target(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
jph_val = 0;
ret = target2(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler2 not called\n");
+ pr_err("jprobe handler2 not called\n");
handler_errors++;
}
unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler\n");
+ pr_err("incorrect value in kretprobe handler\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
ret = register_kretprobe(&rp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_kretprobe(&rp);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler2\n");
+ pr_err("incorrect value in kretprobe handler2\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
rp.kp.flags = 0;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 not called\n");
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
rand1 = prandom_u32();
} while (rand1 <= div_factor);
- printk(KERN_INFO "Kprobe smoke test started\n");
+ pr_info("started\n");
num_tests++;
ret = test_kprobe();
if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
#endif /* CONFIG_KRETPROBES */
if (errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
- "%d tests failed\n", errors, num_tests);
+ pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
else if (handler_errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
- "running handlers\n", handler_errors);
+ pr_err("BUG: %d error(s) running handlers\n", handler_errors);
else
- printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+ pr_info("passed successfully\n");
return 0;
}
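The shortened pr_err()/pr_info() strings only keep their "Kprobe smoke test" prefix if a pr_fmt definition sits earlier in the file, outside these hunks; a sketch of the assumed companion change:

	/* Assumed definition near the top of test_kprobes.c, before any
	 * includes, so "Kprobe smoke test: " is still prepended to every
	 * pr_err()/pr_info() message below. */
	#define pr_fmt(fmt) "Kprobe smoke test: " fmt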
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f448513a45ed..579ce1b929af 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
config ARCH_CLOCKSOURCE_DATA
bool
+# Clocksources require validation of the clocksource against the last
+# cycle update - x86/TSC misfeature
+config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
config GENERIC_TIME_VSYSCALL_OLD
bool
-# ktime_t scalar 64bit nsec representation
-config KTIME_SCALAR
- bool
-
# Old style timekeeping
config ARCH_USES_GETTIMEOFFSET
bool
@@ -32,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
config GENERIC_CLOCKEVENTS
bool
-# Migration helper. Builds, but does not invoke
-config GENERIC_CLOCKEVENTS_BUILD
- bool
- default y
- depends on GENERIC_CLOCKEVENTS
-
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..01f0312419b3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,14 +1,31 @@
+obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
-obj-y += timeconv.o posix-clock.o alarmtimer.o
+obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
obj-y += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
+obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
+targets += timeconst.h
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
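The new rules generate timeconst.h at build time from CONFIG_HZ via bc(1); a sketch of the kind of header this produces (macro names and values are assumptions, shown for HZ=100):

	/* Automatically generated by kernel/time/timeconst.bc -- do not edit */
	#define HZ_TO_MSEC_NUM		10	/* 1000 / HZ     */
	#define HZ_TO_MSEC_DEN		1
	#define HZ_TO_USEC_NUM		10000	/* 1000000 / HZ  */
	#define HZ_TO_USEC_DEN		1
	/* plus MUL32/ADJ32/SHR32 constants so jiffies conversions can use a
	 * reciprocal multiply-and-shift instead of a runtime division */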
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fe75444ae7ec..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
return ret;
}
-
+EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
@@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
{
+ unsigned long flags;
struct k_itimer *ptr = container_of(alarm, struct k_itimer,
it.alarm.alarmtimer);
- if (posix_timer_event(ptr, 0) != 0)
- ptr->it_overrun++;
+ enum alarmtimer_restart result = ALARMTIMER_NORESTART;
+
+ spin_lock_irqsave(&ptr->it_lock, flags);
+ if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
+ if (posix_timer_event(ptr, 0) != 0)
+ ptr->it_overrun++;
+ }
/* Re-add periodic timers */
if (ptr->it.alarm.interval.tv64) {
ptr->it_overrun += alarm_forward(alarm, now,
ptr->it.alarm.interval);
- return ALARMTIMER_RESTART;
+ result = ALARMTIMER_RESTART;
}
- return ALARMTIMER_NORESTART;
+ spin_unlock_irqrestore(&ptr->it_lock, flags);
+
+ return result;
}
/**
@@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer)
* @new_timer: k_itimer pointer
* @cur_setting: itimerspec data to fill
*
- * Copies the itimerspec data out from the k_itimer
+ * Copies out the current itimerspec data
*/
static void alarm_timer_get(struct k_itimer *timr,
struct itimerspec *cur_setting)
{
- memset(cur_setting, 0, sizeof(struct itimerspec));
+ ktime_t relative_expiry_time =
+ alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
+
+ if (ktime_to_ns(relative_expiry_time) > 0) {
+ cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+ } else {
+ cur_setting->it_value.tv_sec = 0;
+ cur_setting->it_value.tv_nsec = 0;
+ }
- cur_setting->it_interval =
- ktime_to_timespec(timr->it.alarm.interval);
- cur_setting->it_value =
- ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
- return;
+ cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
}
/**
@@ -776,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..11dc22a6983b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
* Also omit the add if it would overflow the u64 boundary.
*/
if ((~0ULL - clc > rnd) &&
- (!ismax || evt->mult <= (1U << evt->shift)))
+ (!ismax || evt->mult <= (1ULL << evt->shift)))
clc += rnd;
do_div(clc, evt->mult);
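The 1U -> 1ULL change matters because evt->shift may be 32 or larger; a standalone illustration (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned int shift = 32;
		/* 1U << 32 is undefined behaviour when int is 32-bit; the
		 * u64 literal keeps the comparison in 64-bit math. */
		unsigned long long one = 1ULL << shift;
		printf("%llu\n", one);	/* 4294967296 */
		return 0;
	}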
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+static int __clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ /* Transition with legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* Legacy callback doesn't support new modes */
+ if (state > CLOCK_EVT_STATE_ONESHOT)
+ return -ENOSYS;
+ /*
+ * 'clock_event_state' and 'clock_event_mode' have 1-to-1
+ * mapping until *_ONESHOT, and so a simple cast will work.
+ */
+ dev->set_mode((enum clock_event_mode)state, dev);
+ dev->mode = (enum clock_event_mode)state;
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* Transition with new state-specific callbacks */
+ switch (state) {
+ case CLOCK_EVT_STATE_DETACHED:
+ /*
+ * This is an internal state, which is guaranteed to go from
+ * SHUTDOWN to DETACHED. No driver interaction required.
+ */
+ return 0;
+
+ case CLOCK_EVT_STATE_SHUTDOWN:
+ return dev->set_state_shutdown(dev);
+
+ case CLOCK_EVT_STATE_PERIODIC:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+ return -ENOSYS;
+ return dev->set_state_periodic(dev);
+
+ case CLOCK_EVT_STATE_ONESHOT:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return -ENOSYS;
+ return dev->set_state_oneshot(dev);
+
+ default:
+ return -ENOSYS;
+ }
+}
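For context, a hypothetical driver fragment showing the per-state callbacks this function dispatches to; all my_timer_* names are illustrative, not from this patch:

	static int my_timer_shutdown(struct clock_event_device *evt)
	{
		/* mask the timer interrupt, stop the counter */
		return 0;
	}

	static int my_timer_set_periodic(struct clock_event_device *evt)
	{
		/* program the reload register for HZ ticks */
		return 0;
	}

	static int my_timer_set_oneshot(struct clock_event_device *evt)
	{
		/* switch to match/compare mode */
		return 0;
	}

	static struct clock_event_device my_timer_clockevent = {
		.name			= "my-timer",
		.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
		.set_state_shutdown	= my_timer_shutdown,
		.set_state_periodic	= my_timer_set_periodic,
		.set_state_oneshot	= my_timer_set_oneshot,
		/* .set_next_event, .rating, .cpumask etc. omitted */
	};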
+
/**
- * clockevents_set_mode - set the operating mode of a clock event device
+ * clockevents_set_state - set the operating state of a clock event device
* @dev: device to modify
- * @mode: new mode
+ * @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_mode(struct clock_event_device *dev,
- enum clock_event_mode mode)
+void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->mode != mode) {
- dev->set_mode(mode, dev);
- dev->mode = mode;
+ if (dev->state != state) {
+ if (__clockevents_set_state(dev, state))
+ return;
+
+ dev->state = state;
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (state == CLOCK_EVT_STATE_ONESHOT) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
+/**
+ * clockevents_tick_resume - Resume the tick device before using it again
+ * @dev: device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+ int ret = 0;
+
+ if (dev->set_mode) {
+ dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+ dev->mode = CLOCK_EVT_MODE_RESUME;
+ } else if (dev->tick_resume) {
+ ret = dev->tick_resume(dev);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
/* Limit min_delta to a jiffie */
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
/* Shortcut for clockevent devices that can deal with ktime. */
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+ if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
continue;
if (!tick_check_replacement(newdev, dev))
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+ if (ced->state == CLOCK_EVT_STATE_DETACHED) {
list_del_init(&ced->list);
return 0;
}
@@ -371,7 +440,38 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
mutex_unlock(&clockevents_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_unbind);
+EXPORT_SYMBOL_GPL(clockevents_unbind_device);
+
+/* Sanity check of state transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+ /* Legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* We shouldn't be supporting new modes now */
+ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+ dev->set_state_shutdown || dev->tick_resume);
+
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* New state-specific callbacks */
+ if (!dev->set_state_shutdown)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !dev->set_state_periodic)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !dev->set_state_oneshot)
+ return -EINVAL;
+
+ return 0;
+}
/**
* clockevents_register_device - register a clock event device
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(clockevents_sanity_check(dev));
+
+ /* Initialize state to DETACHED */
+ dev->state = CLOCK_EVT_STATE_DETACHED;
+
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state == CLOCK_EVT_STATE_ONESHOT)
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+ return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
* @old: device to release (can be NULL)
* @new: device to request (can be NULL)
*
- * Called from the notifier chain. clockevents_lock is held already
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
*/
void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new)
{
- unsigned long flags;
-
- local_irq_save(flags);
/*
* Caller releases a clock event device. We queue it into the
* released list and do a notify add later.
*/
if (old) {
module_put(old->owner);
- clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
clockevents_shutdown(new);
}
- local_irq_restore(flags);
}
/**
@@ -541,74 +642,40 @@ void clockevents_resume(void)
dev->resume(dev);
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#ifdef CONFIG_HOTPLUG_CPU
/**
- * clockevents_notify - notification about relevant events
- * Returns 0 on success, any other value on error
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
-int clockevents_notify(unsigned long reason, void *arg)
+void tick_cleanup_dead_cpu(int cpu)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- switch (reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- tick_broadcast_on_off(reason, arg);
- break;
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
- case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- ret = tick_broadcast_oneshot_control(reason);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DYING:
- tick_handover_do_timer(arg);
- break;
-
- case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend();
- tick_suspend_broadcast();
- break;
-
- case CLOCK_EVT_NOTIFY_RESUME:
- tick_resume();
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DEAD:
- tick_shutdown_broadcast_oneshot(arg);
- tick_shutdown_broadcast(arg);
- tick_shutdown(arg);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ tick_shutdown_broadcast_oneshot(cpu);
+ tick_shutdown_broadcast(cpu);
+ tick_shutdown(cpu);
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ list_del(&dev->list);
+ /*
+ * Now check whether the CPU has left unused per cpu devices
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+ if (cpumask_test_cpu(cpu, dev->cpumask) &&
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
+ BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
list_del(&dev->list);
- /*
- * Now check whether the CPU has left unused per cpu devices
- */
- cpu = *((int *)arg);
- list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
- if (cpumask_test_cpu(cpu, dev->cpumask) &&
- cpumask_weight(dev->cpumask) == 1 &&
- !tick_is_broadcast_device(dev)) {
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- list_del(&dev->list);
- }
}
- break;
- default:
- break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
- return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
#ifdef CONFIG_SYSFS
struct bus_type clockevents_subsys = {
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
}
device_initcall(clockevents_init_sysfs);
#endif /* SYSFS */
-
-#endif /* GENERIC_CLOCK_EVENTS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..15facb1b9c60 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,82 +32,7 @@
#include <linux/kthread.h>
#include "tick-internal.h"
-
-void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
- u64 start_tstamp)
-{
- tc->cc = cc;
- tc->cycle_last = cc->read(cc);
- tc->nsec = start_tstamp;
-}
-EXPORT_SYMBOL_GPL(timecounter_init);
-
-/**
- * timecounter_read_delta - get nanoseconds since last call of this function
- * @tc: Pointer to time counter
- *
- * When the underlying cycle counter runs over, this will be handled
- * correctly as long as it does not run over more than once between
- * calls.
- *
- * The first call to this function for a new time counter initializes
- * the time tracking and returns an undefined result.
- */
-static u64 timecounter_read_delta(struct timecounter *tc)
-{
- cycle_t cycle_now, cycle_delta;
- u64 ns_offset;
-
- /* read cycle counter: */
- cycle_now = tc->cc->read(tc->cc);
-
- /* calculate the delta since the last timecounter_read_delta(): */
- cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
-
- /* update time stamp of timecounter_read_delta() call: */
- tc->cycle_last = cycle_now;
-
- return ns_offset;
-}
-
-u64 timecounter_read(struct timecounter *tc)
-{
- u64 nsec;
-
- /* increment time by nanoseconds since last call */
- nsec = timecounter_read_delta(tc);
- nsec += tc->nsec;
- tc->nsec = nsec;
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_read);
-
-u64 timecounter_cyc2time(struct timecounter *tc,
- cycle_t cycle_tstamp)
-{
- u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
- u64 nsec;
-
- /*
- * Instead of always treating cycle_tstamp as more recent
- * than tc->cycle_last, detect when it is too far in the
- * future and treat it as old time stamp instead.
- */
- if (cycle_delta > tc->cc->mask / 2) {
- cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
- nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
- } else {
- nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
- }
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+#include "timekeeping_internal.h"
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
@@ -217,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
schedule_work(&watchdog_work);
}
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
- printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
- cs->name, delta);
- __clocksource_unstable(cs);
-}
-
/**
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
@@ -249,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow;
+ cycle_t csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -282,11 +200,14 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
- wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
- watchdog->mult, watchdog->shift);
+ delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
+ wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
- cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
- cs->mask, cs->mult, cs->shift);
+ delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
cs->cs_last = csnow;
cs->wd_last = wdnow;
@@ -295,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- clocksource_unstable(cs, cs_nsec - wd_nsec);
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
continue;
}
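clocksource_delta() comes from the newly included timekeeping_internal.h; a sketch of its assumed shape, including the CLOCKSOURCE_VALIDATE_LAST_CYCLE variant added for the x86/TSC case:

	#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
	static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
	{
		cycle_t ret = (now - last) & mask;

		/* Clamp a read that went backwards (TSC misfeature) to 0
		 * instead of letting it wrap into a huge positive delta. */
		return (s64) ret > 0 ? ret : 0;
	}
	#else
	static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
	{
		return (now - last) & mask;
	}
	#endif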
@@ -543,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @shift: cycle to nanosecond divisor (power of two)
* @maxadj: maximum adjustment value to mult (~11%)
* @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc: maximum cycle value before potential overflow (does not include
+ * any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger than what the math used can handle without overflowing.
*/
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
- * cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
- * which is equivalent to the below.
- * max_cycles < (2^63)/(mult + maxadj)
- * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
- * max_cycles < 2^(63 - log2(mult + maxadj))
- * max_cycles < 1 << (63 - log2(mult + maxadj))
- * Please note that we add 1 to the result of the log2 to account for
- * any rounding errors, ensure the above inequality is satisfied and
- * no overflow will occur.
+ * cyc2ns() function without overflowing a 64-bit result.
*/
- max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+ max_cycles = ULLONG_MAX;
+ do_div(max_cycles, mult + maxadj);
/*
* The actual maximum number of cycles we can defer the clocksource is
@@ -573,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
max_cycles = min(max_cycles, mask);
max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+ /* return the max_cycles value as well if requested */
+ if (max_cyc)
+ *max_cyc = max_cycles;
+
+ /* Return 50% of the actual maximum, so we can detect bad values */
+ max_nsecs >>= 1;
+
return max_nsecs;
}
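A worked example of the margin math under assumed numbers (a 32-bit counter at roughly 1 GHz, so mult = 2^22, shift = 22, and maxadj about 11% of mult):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t mult = 1 << 22, shift = 22, maxadj = (1 << 22) / 9;
		uint64_t mask = 0xffffffffULL;

		uint64_t max_cycles = UINT64_MAX / (mult + maxadj);
		if (max_cycles > mask)
			max_cycles = mask;	/* counter width is the real limit */

		/* worst-case (slowest) conversion uses mult - maxadj */
		uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

		printf("hardware range: ~%llu ns\n",
		       (unsigned long long)max_nsecs);	/* ~3.8 s */
		printf("reported (50%%): ~%llu ns\n",
		       (unsigned long long)(max_nsecs >> 1));	/* ~1.9 s */
		return 0;
	}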
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs: Pointer to clocksource to be updated
*
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
- u64 max_nsecs;
-
- max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
- cs->mask);
- /*
- * To ensure that the clocksource does not wrap whilst we are idle,
- * limit the time the clocksource can be deferred by 12.5%. Please
- * note a margin of 12.5% is used because this can be computed with
- * a shift, versus say 10% which would require division.
- */
- return max_nsecs - (max_nsecs >> 3);
+ cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+ cs->maxadj, cs->mask,
+ &cs->max_cycles);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -722,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs)
}
/**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used to update clocksource with new freq
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
@@ -730,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs)
* This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
*/
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
+
/*
- * Calc the maximum number of seconds which we can run before
- * wrapping around. For clocksources which have a mask > 32bit
- * we need to limit the max sleep time to have a good
- * conversion precision. 10 minutes is still a reasonable
- * amount. That results in a shift value of 24 for a
- * clocksource with mask >= 40bit and f >= 4GHz. That maps to
- * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
- * margin as we do in clocksource_max_deferment()
+ * Default clocksources are *special* and self-define their mult/shift.
+ * But you're not special, so you should specify a freq value.
*/
- sec = (cs->mask - (cs->mask >> 3));
- do_div(sec, freq);
- do_div(sec, scale);
- if (!sec)
- sec = 1;
- else if (sec > 600 && cs->mask > UINT_MAX)
- sec = 600;
-
- clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
- NSEC_PER_SEC / scale, sec * scale);
-
+ if (freq) {
+ /*
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32-bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP.
+ */
+ sec = cs->mask;
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
+ clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+ NSEC_PER_SEC / scale, sec * scale);
+ }
/*
- * for clocksources that have large mults, to avoid overflow.
- * Since mult may be adjusted by ntp, add an safety extra margin
- *
+ * Ensure clocksources that have large 'mult' values don't overflow
+ * when adjusted.
*/
cs->maxadj = clocksource_max_adjustment(cs);
- while ((cs->mult + cs->maxadj < cs->mult)
- || (cs->mult - cs->maxadj > cs->mult)) {
+ while (freq && ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult))) {
cs->mult >>= 1;
cs->shift--;
cs->maxadj = clocksource_max_adjustment(cs);
}
- cs->max_idle_ns = clocksource_max_deferment(cs);
+ /*
+ * Only warn for *special* clocksources that self-define
+ * their mult/shift values and don't specify a freq.
+ */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ clocksource_update_max_deferment(cs);
+
+ pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@@ -788,9 +728,9 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
- __clocksource_updatefreq_scale(cs, scale, freq);
+ __clocksource_update_freq_scale(cs, scale, freq);
- /* Add clocksource to the clcoksource list */
+ /* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
@@ -800,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs: clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
- /* calculate max adjustment for given mult/shift */
- cs->maxadj = clocksource_max_adjustment(cs);
- WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
- "Clocksource %s might overflow on 11%% adjustment\n",
- cs->name);
-
- /* calculate max idle time permitted for this clocksource */
- cs->max_idle_ns = clocksource_max_deferment(cs);
-
- mutex_lock(&clocksource_mutex);
- clocksource_enqueue(cs);
- clocksource_enqueue_watchdog(cs);
- clocksource_select();
- mutex_unlock(&clocksource_mutex);
- return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
list_del(&cs->list);
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c
index 3ab28993f6e0..76d4bd962b19 100644
--- a/kernel/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,6 +54,8 @@
#include <trace/events/timer.h>
+#include "tick-internal.h"
+
/*
* The timer bases:
*
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
*/
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
- ktime_t xtim, mono, boot;
- struct timespec xts, tom, slp;
- s32 tai_offset;
+ ktime_t xtim, mono, boot, tai;
+ ktime_t off_real, off_boot, off_tai;
- get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
- tai_offset = timekeeping_get_tai_offset();
+ mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
+ boot = ktime_add(mono, off_boot);
+ xtim = ktime_add(mono, off_real);
+ tai = ktime_add(mono, off_tai);
- xtim = timespec_to_ktime(xts);
- mono = ktime_add(xtim, timespec_to_ktime(tom));
- boot = ktime_add(mono, timespec_to_ktime(slp));
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
- base->clock_base[HRTIMER_BASE_TAI].softirq_time =
- ktime_add(xtim, ktime_set(tai_offset, 0));
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
}
/*
@@ -264,64 +263,10 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* too large for inlining:
*/
#if BITS_PER_LONG < 64
-# ifndef CONFIG_KTIME_SCALAR
-/**
- * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- * @kt: addend
- * @nsec: the scalar nsec value to add
- *
- * Returns the sum of kt and nsec in ktime_t format
- */
-ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
-{
- ktime_t tmp;
-
- if (likely(nsec < NSEC_PER_SEC)) {
- tmp.tv64 = nsec;
- } else {
- unsigned long rem = do_div(nsec, NSEC_PER_SEC);
-
- /* Make sure nsec fits into long */
- if (unlikely(nsec > KTIME_SEC_MAX))
- return (ktime_t){ .tv64 = KTIME_MAX };
-
- tmp = ktime_set((long)nsec, rem);
- }
-
- return ktime_add(kt, tmp);
-}
-
-EXPORT_SYMBOL_GPL(ktime_add_ns);
-
-/**
- * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
- * @kt: minuend
- * @nsec: the scalar nsec value to subtract
- *
- * Returns the subtraction of @nsec from @kt in ktime_t format
- */
-ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
-{
- ktime_t tmp;
-
- if (likely(nsec < NSEC_PER_SEC)) {
- tmp.tv64 = nsec;
- } else {
- unsigned long rem = do_div(nsec, NSEC_PER_SEC);
-
- tmp = ktime_set((long)nsec, rem);
- }
-
- return ktime_sub(kt, tmp);
-}
-
-EXPORT_SYMBOL_GPL(ktime_sub_ns);
-# endif /* !CONFIG_KTIME_SCALAR */
-
/*
* Divide a ktime value by a nanosecond value
*/
-u64 ktime_divns(const ktime_t kt, s64 div)
+u64 __ktime_divns(const ktime_t kt, s64 div)
{
u64 dclc;
int sft = 0;
@@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
return dclc;
}
+EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
/*
@@ -494,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
+ int i;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct timerqueue_node *next;
+ struct hrtimer *timer;
+
+ next = timerqueue_getnext(&base->active);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ }
+ /*
+ * clock_was_set() might have changed base->offset of any of
+ * the clock bases so the result might be negative. Fix it up
+ * to prevent a false positive in clockevents_program_event().
+ */
+ if (expires_next.tv64 < 0)
+ expires_next.tv64 = 0;
+ return expires_next;
+}
+#endif
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -542,32 +519,7 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- int i;
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t expires, expires_next;
-
- expires_next.tv64 = KTIME_MAX;
-
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
- timer = container_of(next, struct hrtimer, node);
-
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- /*
- * clock_was_set() has changed base->offset so the
- * result might be negative. Fix it up to prevent a
- * false positive in clockevents_program_event()
- */
- if (expires.tv64 < 0)
- expires.tv64 = 0;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
- }
+ ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -602,12 +554,17 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
+ * Note that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
+ * and no expiry check happens. The timer gets enqueued into the rbtree. The
+ * reprogramming and expiry check is done in the hrtimer_interrupt or in the
+ * softirq.
+ *
* Called with interrupts disabled and base->cpu_base.lock held
*/
static int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
int res;
@@ -636,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return 0;
/*
+ * When the target cpu of the timer is currently executing
+ * hrtimer_interrupt(), then we do not touch the clock event
+ * device. hrtimer_interrupt() will reevaluate all clock bases
+ * before reprogramming the device.
+ */
+ if (cpu_base->in_hrtirq)
+ return 0;
+
+ /*
* If a hang was detected in the last timer interrupt then we
* do not schedule a timer which is earlier than the expiry
* which we enforced in the hang detection. We want the system
@@ -662,25 +628,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
base->hres_active = 0;
}
-/*
- * When High resolution timers are active, try to reprogram. Note, that in case
- * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
- * check happens. The timer gets enqueued into the rbtree. The reprogramming
- * and expiry check is done in the hrtimer_interrupt or in the softirq.
- */
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
-}
-
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
+ return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
}
/*
@@ -690,7 +644,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
*/
static void retrigger_next_event(void *arg)
{
- struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (!hrtimer_hres_active())
return;
@@ -755,8 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -964,7 +918,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
*/
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
- reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
+ reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
/*
* We must preserve the CALLBACK state flag here,
* otherwise we could move the timer base in
@@ -1013,14 +967,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
leftmost = enqueue_hrtimer(timer, new_base);
- /*
- * Only allow reprogramming if the new base is on this CPU.
- * (it might still be on another CPU if the timer was pending)
- *
- * XXX send_remote_softirq() ?
- */
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
- && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (!leftmost) {
+ unlock_hrtimer_base(timer, &flags);
+ return ret;
+ }
+
+ if (!hrtimer_is_hres_active(timer)) {
+ /*
+ * Kick to reschedule the next tick to handle the new timer
+ * on dynticks target.
+ */
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
+ hrtimer_reprogram(timer, new_base)) {
+ /*
+ * Only allow reprogramming if the new base is on this CPU.
+ * (it might still be on another CPU if the timer was pending)
+ *
+ * XXX send_remote_softirq() ?
+ */
if (wakeup) {
/*
* We need to drop cpu_base->lock to avoid a
@@ -1153,30 +1118,15 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
*/
ktime_t hrtimer_get_next_event(void)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t mindelta = { .tv64 = KTIME_MAX };
unsigned long flags;
- int i;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active()) {
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- delta.tv64 = hrtimer_get_expires_tv64(timer);
- delta = ktime_sub(delta, base->get_time());
- if (delta.tv64 < mindelta.tv64)
- mindelta.tv64 = delta.tv64;
- }
- }
+ if (!hrtimer_hres_active())
+ mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
+ ktime_get());
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1194,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
memset(timer, 0, sizeof(struct hrtimer));
- cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
clock_id = CLOCK_MONOTONIC;
@@ -1237,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
struct hrtimer_cpu_base *cpu_base;
int base = hrtimer_clockid_to_base(which_clock);
- cpu_base = &__raw_get_cpu_var(hrtimer_bases);
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
return 0;
@@ -1292,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
*/
void hrtimer_interrupt(struct clock_event_device *dev)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;
@@ -1303,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
raw_spin_lock(&cpu_base->lock);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- expires_next.tv64 = KTIME_MAX;
+ cpu_base->in_hrtirq = 1;
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1341,28 +1291,20 @@ retry:
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
-
- if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (expires.tv64 < 0)
- expires.tv64 = KTIME_MAX;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
break;
- }
__run_hrtimer(timer, &basenow);
}
}
-
+ /* Reevaluate the clock bases for the next expiry */
+ expires_next = __hrtimer_get_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
+ cpu_base->in_hrtirq = 0;
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
@@ -1426,7 +1368,7 @@ static void __hrtimer_peek_ahead_timers(void)
if (!hrtimer_hres_active())
return;
- td = &__get_cpu_var(tick_cpu_device);
+ td = this_cpu_ptr(&tick_cpu_device);
if (td && td->evtdev)
hrtimer_interrupt(td->evtdev);
}
@@ -1490,7 +1432,7 @@ void hrtimer_run_pending(void)
void hrtimer_run_queues(void)
{
struct timerqueue_node *node;
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base;
int index, gettime = 1;
@@ -1641,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
@@ -1680,6 +1622,7 @@ static void init_hrtimers_cpu(int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
}
@@ -1728,7 +1671,7 @@ static void migrate_hrtimers(int scpu)
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = &__get_cpu_var(hrtimer_bases);
+ new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1764,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
- break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- {
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
migrate_hrtimers(scpu);
break;
- }
#endif
default:
@@ -1825,7 +1761,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
*/
if (!expires) {
schedule();
- __set_current_state(TASK_RUNNING);
return -EINTR;
}
diff --git a/kernel/itimer.c b/kernel/time/itimer.c
index 8d262b467573..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/time/itimer.c
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..347fecf86a3f 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
#include <linux/init.h>
-#include "tick-internal.h"
+#include "timekeeping.h"
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
static int __init init_jiffies_clocksource(void)
{
- return clocksource_register(&clocksource_jiffies);
+ return __clocksource_register(&clocksource_jiffies);
}
core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
- clocksource_register(&refined_jiffies);
+ __clocksource_register(&refined_jiffies);
return 0;
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 33db43a39515..7a681003001c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/rtc.h>
-#include "tick-internal.h"
#include "ntp_internal.h"
/*
@@ -459,6 +458,16 @@ out:
return leap;
}
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+ struct timespec now;
+
+ now = timespec64_to_timespec(now64);
+ return update_persistent_clock(now);
+}
+#endif
+
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
@@ -466,7 +475,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
static void sync_cmos_clock(struct work_struct *work)
{
- struct timespec now, next;
+ struct timespec64 now;
+ struct timespec next;
int fail = 1;
/*
@@ -485,16 +495,17 @@ static void sync_cmos_clock(struct work_struct *work)
return;
}
- getnstimeofday(&now);
+ getnstimeofday64(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec adjust = now;
+ struct timespec64 adjust = now;
fail = -ENODEV;
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock(adjust);
+ fail = update_persistent_clock64(adjust);
#endif
+
#ifdef CONFIG_RTC_SYSTOHC
if (fail == -ENODEV)
fail = rtc_set_ntp_time(adjust);
@@ -531,7 +542,7 @@ void ntp_notify_cmos_timer(void) { }
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
@@ -554,7 +565,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
static inline void process_adjtimex_modes(struct timex *txc,
- struct timespec *ts,
+ struct timespec64 *ts,
s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
@@ -632,6 +643,17 @@ int ntp_validate_timex(struct timex *txc)
if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
return -EPERM;
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
return 0;
}
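The division-based bounds check works because the divisions themselves cannot overflow; a minimal standalone version of the same guard (assuming PPM_SCALE > 0, as in the kernel):

	#include <limits.h>
	#include <stdbool.h>

	static bool freq_scale_would_overflow(long long freq, long long ppm_scale)
	{
		/* freq * ppm_scale overflows iff freq lies outside
		 * [LLONG_MIN / ppm_scale, LLONG_MAX / ppm_scale]. */
		return freq < LLONG_MIN / ppm_scale ||
		       freq > LLONG_MAX / ppm_scale;
	}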
@@ -640,7 +662,7 @@ int ntp_validate_timex(struct timex *txc)
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
+int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
{
int result;
@@ -684,7 +706,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
/* fill PPS status fields */
pps_fill_timex(txc);
- txc->time.tv_sec = ts->tv_sec;
+ txc->time.tv_sec = (time_t)ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
-extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
+extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec *, const struct timespec *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..0075da74abf0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
if (same_thread_group(tsk, current))
err = cpu_clock_sample(which_clock, tsk, &rtn);
} else {
- unsigned long flags;
- struct sighand_struct *sighand;
-
- /*
- * while_each_thread() is not yet entirely RCU safe,
- * keep locking the group while sampling process
- * clock for now.
- */
- sighand = lock_task_sighand(tsk, &flags);
- if (!sighand)
- return err;
-
if (tsk == current || thread_group_leader(tsk))
err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
- unlock_task_sighand(tsk, &flags);
}
if (!err)
@@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
*sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- *sample = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
@@ -1348,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
struct timespec *rqtp, struct timespec __user *rmtp)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ struct restart_block *restart_block = &current->restart_block;
struct itimerspec it;
int error;
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c
index 424c2d4265c9..31ea01f42e1f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,8 @@
#include <linux/export.h>
#include <linux/hashtable.h>
+#include "timekeeping.h"
+
/*
* Management arrays for POSIX timers. Timers are now kept in static hash table
* with 512 entries.
@@ -634,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
goto out;
}
} else {
+ memset(&event.sigev_value, 0, sizeof(event.sigev_value));
event.sigev_notify = SIGEV_SIGNAL;
event.sigev_signo = SIGALRM;
event.sigev_value.sival_int = new_timer->it_id;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
/*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support for extending low-level
+ * hardware time counters to full 64-bit ns values.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
-struct clock_data {
- ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns: sched_clock() value at last update
+ * @epoch_cyc: Clock cycle value at last update.
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multiplier for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
+ u64 sched_clock_mask;
+ u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
- bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_t seq;
+ struct clock_read_data read_data[2];
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
};
static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = {
- .mult = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
static u64 notrace jiffy_sched_clock_read(void)
{
/*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+ .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read, },
+ .actual_read_sched_clock = jiffy_sched_clock_read,
+};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 cyc;
+ u64 cyc, res;
unsigned long seq;
-
- if (cd.suspended)
- return cd.epoch_ns;
+ struct clock_read_data *rd;
do {
- seq = raw_read_seqcount_begin(&cd.seq);
- epoch_cyc = cd.epoch_cyc;
- epoch_ns = cd.epoch_ns;
+ seq = raw_read_seqcount(&cd.seq);
+ rd = cd.read_data + (seq & 1);
+
+ cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+ rd->sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock();
- cyc = (cyc - epoch_cyc) & sched_clock_mask;
- return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+ return res;
+}
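
The conversion itself is plain fixed-point scaling, ns = (cyc * mult) >> shift, with mult/2^shift approximating NSEC_PER_SEC/rate. A stand-alone sketch with hand-picked values for a hypothetical 24 MHz counter (the kernel derives mult/shift via clocks_calc_mult_shift() instead):

#include <stdint.h>
#include <stdio.h>

/* The same scaled-math conversion as cyc_to_ns() */
static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
        return (cyc * mult) >> shift;
}

int main(void)
{
        /* Hypothetical 24 MHz counter: one cycle is 1e9/24e6 ~= 41.667 ns.
         * With shift = 24, mult = round(41.667 * 2^24) = 699050667;
         * values picked by hand here, not by clocks_calc_mult_shift(). */
        uint32_t mult = 699050667, shift = 24;

        /* 24000000 cycles = one second; prints 1000000000 (ns) */
        printf("%llu\n",
               (unsigned long long)cyc_to_ns(24000000, mult, shift));
        return 0;
}
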
+
+/*
+ * Update the data required to read the clock.
+ *
+ * sched_clock() will never observe mismatched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible, the system reverts to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ /* steer readers towards the odd copy */
+ raw_write_seqcount_latch(&cd.seq);
+
+ /* now it's safe for us to update the normal (even) copy */
+ cd.read_data[0] = *rd;
+
+ /* switch readers back to the even copy */
+ raw_write_seqcount_latch(&cd.seq);
}
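
The latch can be modeled in user space; the sketch below is only an approximation (C11 atomics stand in for the kernel's seqcount primitives and barriers), but it shows why a reader that re-checks the sequence never sees a half-written copy:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct read_data { uint64_t epoch_ns; uint64_t epoch_cyc; };

static _Atomic unsigned int seq;
static struct read_data data[2];

/* Writer: publish the odd copy, flip readers to it, rewrite the even
 * copy, flip readers back -- mirroring update_clock_read_data(). */
static void latch_update(const struct read_data *new)
{
        data[1] = *new;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* -> odd  */
        data[0] = *new;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* -> even */
}

/* Reader: pick the copy the sequence points at; retry if it moved. */
static struct read_data latch_read(void)
{
        struct read_data rd;
        unsigned int s;

        do {
                s = atomic_load_explicit(&seq, memory_order_acquire);
                rd = data[s & 1];
        } while (atomic_load_explicit(&seq, memory_order_acquire) != s);

        return rd;
}

int main(void)
{
        struct read_data d = { .epoch_ns = 1000, .epoch_cyc = 42 };

        latch_update(&d);
        d = latch_read();
        printf("epoch_ns=%llu epoch_cyc=%llu\n",
               (unsigned long long)d.epoch_ns,
               (unsigned long long)d.epoch_cyc);
        return 0;
}
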
/*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
*/
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
{
- unsigned long flags;
u64 cyc;
u64 ns;
+ struct clock_read_data rd;
+
+ rd = cd.read_data[0];
+
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+ rd.epoch_ns = ns;
+ rd.epoch_cyc = cyc;
- cyc = read_sched_clock();
- ns = cd.epoch_ns +
- cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
-
- raw_local_irq_save(flags);
- raw_write_seqcount_begin(&cd.seq);
- cd.epoch_ns = ns;
- cd.epoch_cyc = cyc;
- raw_write_seqcount_end(&cd.seq);
- raw_local_irq_restore(flags);
+ update_clock_read_data(&rd);
}
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
update_sched_clock();
hrtimer_forward_now(hrt, cd.wrap_kt);
+
return HRTIMER_RESTART;
}
-void __init sched_clock_register(u64 (*read)(void), int bits,
- unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- ktime_t new_wrap_kt;
unsigned long r;
char r_unit;
+ struct clock_read_data rd;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
- /* calculate the mult/shift to convert counter ticks to ns. */
+ /* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits);
+ cd.rate = rate;
+
+ /* Calculate how many nanosecs until we risk wrapping */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+ cd.wrap_kt = ns_to_ktime(wrap);
- /* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
- new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0];
- /* update epoch for new counter and update epoch_ns from old counter*/
+ /* Update epoch for new counter and update 'epoch_ns' from old counter */
new_epoch = read();
- cyc = read_sched_clock();
- ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq);
- read_sched_clock = read;
- sched_clock_mask = new_mask;
- cd.rate = rate;
- cd.wrap_kt = new_wrap_kt;
- cd.mult = new_mult;
- cd.shift = new_shift;
- cd.epoch_cyc = new_epoch;
- cd.epoch_ns = ns;
- raw_write_seqcount_end(&cd.seq);
+ rd.read_sched_clock = read;
+ rd.sched_clock_mask = new_mask;
+ rd.mult = new_mult;
+ rd.shift = new_shift;
+ rd.epoch_cyc = new_epoch;
+ rd.epoch_ns = ns;
+
+ update_clock_read_data(&rd);
r = rate;
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
- } else if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else
- r_unit = ' ';
-
- /* calculate the ns resolution of this counter */
+ } else {
+ if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else {
+ r_unit = ' ';
+ }
+ }
+
+ /* Calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, new_mult, new_shift);
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- /* Enable IRQ time accounting if we have a fast enough sched_clock */
+ /* Enable IRQ time accounting if we have a fast enough sched_clock() */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
void __init sched_clock_postinit(void)
{
/*
- * If no sched_clock function has been provided at that point,
+ * If no sched_clock() function has been provided at that point,
* make it the final one.
*/
- if (read_sched_clock == jiffy_sched_clock_read)
+ if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+ unsigned long seq = raw_read_seqcount(&cd.seq);
+
+ return cd.read_data[seq & 1].epoch_cyc;
+}
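
In other words, the frozen read collapses the delta term to zero; a stand-alone rendering of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t epoch_cyc = 12345, epoch_ns = 500000000;
        uint64_t mask = ~0ULL;

        /* A suspended read hands back epoch_cyc itself, so the masked
         * delta is zero and every call reports the frozen epoch_ns. */
        uint64_t now = epoch_cyc;
        uint64_t ns = epoch_ns + ((now - epoch_cyc) & mask);

        printf("%llu\n", (unsigned long long)ns);       /* 500000000 */
        return 0;
}
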
+
static int sched_clock_suspend(void)
{
+ struct clock_read_data *rd = &cd.read_data[0];
+
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
- cd.suspended = true;
+ rd->read_sched_clock = suspended_sched_clock_read;
+
return 0;
}
static void sched_clock_resume(void)
{
- cd.epoch_cyc = read_sched_clock();
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
- cd.suspended = false;
+ rd->read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
};
static int __init sched_clock_syscore_init(void)
{
register_syscore_ops(&sched_clock_ops);
+
return 0;
}
device_initcall(sched_clock_syscore_init);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/test_udelay.c
@@ -0,0 +1,168 @@
+/*
+ * udelay() test kernel module
+ *
+ * Test is executed by writing to and reading from /sys/kernel/debug/udelay_test
+ * Tests are configured by writing: USECS ITERATIONS
+ * Tests are executed by reading from the same file.
+ * Specifying usecs of 0 or negative values will run multiple tests.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#define DEFAULT_ITERATIONS 100
+
+#define DEBUGFS_FILENAME "udelay_test"
+
+static DEFINE_MUTEX(udelay_test_lock);
+static struct dentry *udelay_test_debugfs_file;
+static int udelay_test_usecs;
+static int udelay_test_iterations = DEFAULT_ITERATIONS;
+
+static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
+{
+ int min = 0, max = 0, fail_count = 0;
+ uint64_t sum = 0;
+ uint64_t avg;
+ int i;
+ /* Allow udelay to be up to 0.5% fast */
+ int allowed_error_ns = usecs * 5;
+
+ for (i = 0; i < iters; ++i) {
+ struct timespec ts1, ts2;
+ int time_passed;
+
+ ktime_get_ts(&ts1);
+ udelay(usecs);
+ ktime_get_ts(&ts2);
+ time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+
+ if (i == 0 || time_passed < min)
+ min = time_passed;
+ if (i == 0 || time_passed > max)
+ max = time_passed;
+ if ((time_passed + allowed_error_ns) / 1000 < usecs)
+ ++fail_count;
+ WARN_ON(time_passed < 0);
+ sum += time_passed;
+ }
+
+ avg = sum;
+ do_div(avg, iters);
+ seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
+ usecs, iters, usecs * 1000,
+ (usecs * 1000) - allowed_error_ns, min, avg, max);
+ if (fail_count)
+ seq_printf(s, " FAIL=%d", fail_count);
+ seq_puts(s, "\n");
+
+ return 0;
+}
+
+static int udelay_test_show(struct seq_file *s, void *v)
+{
+ int usecs;
+ int iters;
+ int ret = 0;
+
+ mutex_lock(&udelay_test_lock);
+ usecs = udelay_test_usecs;
+ iters = udelay_test_iterations;
+ mutex_unlock(&udelay_test_lock);
+
+ if (usecs > 0 && iters > 0) {
+ return udelay_test_single(s, usecs, iters);
+ } else if (usecs == 0) {
+ struct timespec ts;
+
+ ktime_get_ts(&ts);
+ seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
+ loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+ seq_puts(s, "usage:\n");
+ seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
+ seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
+ }
+
+ return ret;
+}
+
+static int udelay_test_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, udelay_test_show, inode->i_private);
+}
+
+static ssize_t udelay_test_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ char lbuf[32];
+ int ret;
+ int usecs;
+ int iters;
+
+ if (count >= sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+ lbuf[count] = '\0';
+
+ ret = sscanf(lbuf, "%d %d", &usecs, &iters);
+ if (ret < 1)
+ return -EINVAL;
+ else if (ret < 2)
+ iters = DEFAULT_ITERATIONS;
+
+ mutex_lock(&udelay_test_lock);
+ udelay_test_usecs = usecs;
+ udelay_test_iterations = iters;
+ mutex_unlock(&udelay_test_lock);
+
+ return count;
+}
+
+static const struct file_operations udelay_test_debugfs_ops = {
+ .owner = THIS_MODULE,
+ .open = udelay_test_open,
+ .read = seq_read,
+ .write = udelay_test_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init udelay_test_init(void)
+{
+ mutex_lock(&udelay_test_lock);
+ udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
+ S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ mutex_unlock(&udelay_test_lock);
+
+ return 0;
+}
+
+module_init(udelay_test_init);
+
+static void __exit udelay_test_exit(void)
+{
+ mutex_lock(&udelay_test_lock);
+ debugfs_remove(udelay_test_debugfs_file);
+ mutex_unlock(&udelay_test_lock);
+}
+
+module_exit(udelay_test_exit);
+
+MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
+MODULE_LICENSE("GPL");
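
Assuming debugfs is mounted at /sys/kernel/debug and the module is loaded, the file can be exercised from user space (as root, since it is created with mode S_IRUSR); a minimal, hypothetical driver for it:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* "100 1000" configures 100 usecs x 1000 iterations */
        const char *path = "/sys/kernel/debug/udelay_test";
        char buf[256];
        ssize_t n;
        int fd;

        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, "100 1000", 8) < 0)
                return 1;
        close(fd);

        fd = open(path, O_RDONLY);      /* reading runs the test */
        if (fd < 0)
                return 1;
        while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);
        }
        close(fd);
        return 0;
}
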
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5c697c..6aac4beedbbe 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
+ int bc_moved;
/*
* We try to cancel the timer first. If the callback is in
* flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* restart the timer because we are in the callback, but we
* can set the expiry time and let the callback return
* HRTIMER_RESTART.
+ *
+ * Since we are in the idle loop at this point and because
+ * hrtimer_{start/cancel} functions call into tracing,
+ * calls to these functions must be bound within RCU_NONIDLE.
*/
- if (hrtimer_try_to_cancel(&bctimer) >= 0) {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
+ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
+ 0);
+ if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
} else if (bc->bound_on == smp_processor_id()) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 64c5990fd500..7e8ca4f448a8 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
static cpumask_var_t tick_broadcast_on;
static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
-static int tick_broadcast_force;
+static int tick_broadcast_forced;
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
#else
static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
#endif
/*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* The device is in periodic mode. No reprogramming necessary:
*/
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
goto unlock;
/*
@@ -324,49 +326,54 @@ unlock:
raw_spin_unlock(&tick_broadcast_lock);
}
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode: The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-static void tick_do_broadcast_on_off(unsigned long *reason)
+void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
int cpu, bc_stopped;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
- bc = tick_broadcast_device.evtdev;
/*
* Is the device not affected by the powerstate?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
if (!tick_device_is_functional(dev))
- goto out;
+ return;
+ raw_spin_lock(&tick_broadcast_lock);
+ cpu = smp_processor_id();
+ bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
- switch (*reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ switch (mode) {
+ case TICK_BROADCAST_FORCE:
+ tick_broadcast_forced = 1;
+ /* fall through */
+ case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
- if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
- tick_broadcast_force = 1;
break;
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (tick_broadcast_force)
+
+ case TICK_BROADCAST_OFF:
+ if (tick_broadcast_forced)
break;
cpumask_clear_cpu(cpu, tick_broadcast_on);
if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
else
tick_broadcast_setup_oneshot(bc);
}
-out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop.
- */
-void tick_broadcast_on_off(unsigned long reason, int *oncpu)
-{
- if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
- printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
- "offline CPU #%d\n", *oncpu);
- else
- tick_do_broadcast_on_off(&reason);
+ raw_spin_unlock(&tick_broadcast_lock);
}
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
/*
* Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
dev->event_handler = tick_handle_periodic_broadcast;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Remove a CPU from broadcasting
*/
-void tick_shutdown_broadcast(unsigned int *cpup)
+void tick_shutdown_broadcast(unsigned int cpu)
{
struct clock_event_device *bc;
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
void tick_suspend_broadcast(void)
{
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+ return false;
+ else
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
- int broadcast = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
if (bc) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+ clockevents_tick_resume(bc);
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
- broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
if (!cpumask_empty(tick_broadcast_mask))
- broadcast = tick_resume_broadcast_oneshot(bc);
+ tick_resume_broadcast_oneshot(bc);
break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
- return broadcast;
}
-
#ifdef CONFIG_TICK_ONESHOT
static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
{
int ret;
- if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
ret = clockevents_program_event(bc, expires, force);
if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
return ret;
}
-int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- return 0;
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -554,7 +558,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
void tick_check_oneshot_broadcast_this_cpu(void)
{
if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
/*
* We might be in the middle of switching over from
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_mode(td->evtdev,
- CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(td->evtdev,
+ CLOCK_EVT_STATE_ONESHOT);
}
}
}
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-static void broadcast_move_bc(int deadcpu)
-{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
-
- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state where affected devices might stop.
* Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-int tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
- ktime_t now;
int cpu, ret = 0;
+ ktime_t now;
/*
* Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
* We are called with preemption disabled from the depth of the
* idle code, so we can't be moved away.
*/
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
+ raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
- if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (state == TICK_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
}
}
out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this CPU: */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
/*
* Remove a dead CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
{
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
- broadcast_move_bc(cpu);
-
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
/*
* Check whether the broadcast device is in one-shot mode
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..3ae6afa1eb98 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state != CLOCK_EVT_STATE_ONESHOT)
return;
for (;;) {
/*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td,
void tick_install_replacement(struct clock_event_device *newdev)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
int cpu = smp_processor_id();
clockevents_exchange_device(td->evtdev, newdev);
@@ -332,14 +332,16 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled.
+ * Called with interrupts disabled. No locking required. If
+ * tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
-void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(void)
{
- if (*cpup == tick_do_timer_cpu) {
+ if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
* access the hardware device itself.
* We just set the mode and remove it from the lists.
*/
-void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int cpu)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
+ dev->state = CLOCK_EVT_STATE_DETACHED;
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
}
+#endif
-void tick_suspend(void)
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local CPU for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
clockevents_shutdown(td->evtdev);
}
-void tick_resume(void)
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
- int broadcast = tick_resume_broadcast();
-
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ bool broadcast = tick_resume_check_broadcast();
+ clockevents_tick_resume(td->evtdev);
if (!broadcast) {
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(td->evtdev, 0);
@@ -395,9 +412,87 @@ void tick_resume(void)
}
/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend() with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+ tick_suspend_local();
+ tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume() with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+ tick_resume_broadcast();
+ tick_resume_local();
+}
+
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping. Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ tick_freeze_depth++;
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_suspend();
+ else
+ tick_suspend_local();
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping. Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with tick_freeze().
+ * Interrupts must not be enabled after the preceding tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_resume();
+ else
+ tick_resume_local();
+
+ tick_freeze_depth--;
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
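
The depth counter is the whole trick: the last CPU in suspends timekeeping, and, because interrupts stay disabled in between, the counter is still at num_online_cpus() when the first CPU comes back out. A deterministic single-threaded model of that accounting (locking omitted, CPUs replaced by loop iterations):

#include <stdio.h>

#define NCPUS 4

static unsigned int tick_freeze_depth;

static void model_freeze(unsigned int cpu)
{
        if (++tick_freeze_depth == NCPUS)
                printf("cpu%u: last one in, suspend timekeeping\n", cpu);
        else
                printf("cpu%u: suspend local tick\n", cpu);
}

static void model_unfreeze(unsigned int cpu)
{
        if (tick_freeze_depth == NCPUS)
                printf("cpu%u: first one out, resume timekeeping\n", cpu);
        else
                printf("cpu%u: resume local tick\n", cpu);
        tick_freeze_depth--;
}

int main(void)
{
        unsigned int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                model_freeze(cpu);
        for (cpu = 0; cpu < NCPUS; cpu++)
                model_unfreeze(cpu);
        return 0;
}
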
+
+/**
* tick_init - initialize the tick control
*/
void __init tick_init(void)
{
tick_broadcast_init();
+ tick_nohz_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7ab92b19965a..b64fdd8054c5 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,14 +4,13 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
-extern seqlock_t jiffies_lock;
+#include "timekeeping.h"
+#include "tick-sched.h"
-#define CS_NAME_LEN 32
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
-
-#define TICK_DO_TIMER_NONE -1
-#define TICK_DO_TIMER_BOOT -2
+# define TICK_DO_TIMER_NONE -1
+# define TICK_DO_TIMER_BOOT -2
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
@@ -21,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_handover_do_timer(int *cpup);
-extern void tick_shutdown(unsigned int *cpup);
+extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev);
extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
-extern void clockevents_shutdown(struct clock_event_device *dev);
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new);
+extern void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+ ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
-/*
- * NO_HZ / high resolution timer shared code
- */
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
#ifdef CONFIG_TICK_ONESHOT
extern void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -44,120 +94,46 @@ extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
-extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
-# else /* BROADCAST */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
-static inline bool tick_broadcast_oneshot_available(void) { return true; }
-# endif /* !BROADCAST */
-
-#else /* !ONESHOT */
-static inline
-void tick_setup_oneshot(struct clock_event_device *newdev,
- void (*handler)(struct clock_event_device *),
- ktime_t nextevt)
-{
- BUG();
-}
-static inline void tick_resume_oneshot(void)
-{
- BUG();
-}
-static inline int tick_program_event(ktime_t expires, int force)
-{
- return 0;
-}
-static inline void tick_oneshot_notify(void) { }
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
-static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
-{
- return 0;
-}
-static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline bool tick_broadcast_oneshot_available(void) { return false; }
-#endif /* !TICK_ONESHOT */
-
-/*
- * Broadcasting support
- */
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
-extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
-extern void tick_shutdown_broadcast(unsigned int *cpup);
-extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
-extern void tick_broadcast_init(void);
-extern void
-tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
-int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
-
-#else /* !BROADCAST */
-
-static inline void tick_install_broadcast_device(struct clock_event_device *dev)
-{
-}
-
-static inline int tick_is_broadcast_device(struct clock_event_device *dev)
-{
- return 0;
-}
-static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
- int cpu)
-{
- return 0;
-}
-static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
-static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
-static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
-static inline void tick_broadcast_init(void) { }
-static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
- u32 freq) { return -ENODEV; }
-
-/*
- * Set the periodic handler in non broadcast mode
- */
-static inline void tick_set_periodic_handler(struct clock_event_device *dev,
- int broadcast)
-{
- dev->event_handler = tick_handle_periodic;
-}
-#endif /* !BROADCAST */
-
-/*
- * Check, if the device is functional or a dummy for broadcast
- */
-static inline int tick_device_is_functional(struct clock_event_device *dev)
-{
- return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
-}
-
-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
+
+/* NO_HZ_FULL internal */
+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_init(void);
+# else
+static inline void tick_nohz_init(void) { }
#endif
-
-extern void do_timer(unsigned long ticks);
-extern void update_wall_time(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a33..67a64b1670bf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
- struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..914259128145 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
/*
* Per cpu nohz control structure
*/
-DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static bool can_stop_full_tick(void)
@@ -204,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
*/
void __tick_nohz_full_check(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (tick_nohz_full_cpu(smp_processor_id())) {
if (ts->tick_stopped && !is_idle_task(current)) {
@@ -224,13 +225,29 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
};
/*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick this CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
*/
void tick_nohz_full_kick(void)
{
- if (tick_nohz_full_cpu(smp_processor_id()))
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ return;
+
+ irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
+}
+
+/*
+ * Kick the CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_cpu(int cpu)
+{
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
static void nohz_full_kick_ipi(void *info)
@@ -278,19 +295,12 @@ out:
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
- int cpu;
-
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
-
- cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
- }
tick_nohz_full_running = true;
return 1;
@@ -316,25 +326,17 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-/*
- * Worst case string length in chunks of CPU range seems 2 steps
- * separations: 0,2,4,6,...
- * This is NR_CPUS + sizeof('\0')
- */
-static char __initdata nohz_full_buf[NR_CPUS + 1];
-
static int tick_nohz_init_all(void)
{
int err = -1;
#ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
+ WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
cpumask_setall(tick_nohz_full_mask);
- cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
tick_nohz_full_running = true;
#endif
return err;
@@ -349,12 +351,43 @@ void __init tick_nohz_init(void)
return;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ cpumask_clear(tick_nohz_full_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ /*
+ * Full dynticks uses irq work to drive the tick rescheduling on safe
+ * locking contexts. But then we need irq work to raise its own
+ * interrupts to avoid circular dependency on the tick.
+ */
+ if (!arch_irq_work_has_interrupt()) {
+ pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
+ "support irq work self-IPIs\n");
+ cpumask_clear(tick_nohz_full_mask);
+ cpumask_copy(housekeeping_mask, cpu_possible_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
+
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
cpu_notifier(tick_nohz_cpu_down_callback, 0);
- cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
- pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+ pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
+ cpumask_pr_args(tick_nohz_full_mask));
}
#endif
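
The housekeeping set is simply the complement of the nohz_full set within the possible CPUs; modeling cpumask_andnot() with a single 64-bit word (hypothetical CPU numbering):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t possible = 0xff;       /* CPUs 0-7              */
        uint64_t nohz_full = 0x0e;      /* CPUs 1-3 (nohz_full=) */

        /* cpumask_andnot(): possible & ~nohz_full */
        uint64_t housekeeping = possible & ~nohz_full;

        printf("housekeeping = 0x%llx\n",
               (unsigned long long)housekeeping);  /* 0xf1: CPUs 0,4-7 */
        return 0;
}
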
@@ -383,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
@@ -533,7 +571,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
ktime_t last_update, expires, ret = { .tv64 = 0 };
unsigned long rcu_delta_jiffies;
- struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 time_delta;
time_delta = timekeeping_max_deferment();
@@ -545,8 +583,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
last_jiffies = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
- arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
+ if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ arch_needs_cpu() || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
@@ -801,13 +839,12 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
- ts = &__get_cpu_var(tick_cpu_sched);
+ ts = this_cpu_ptr(&tick_cpu_sched);
ts->inidle = 1;
__tick_nohz_idle_enter(ts);
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -819,7 +856,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
*/
void tick_nohz_irq_exit(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
__tick_nohz_idle_enter(ts);
@@ -834,7 +871,7 @@ void tick_nohz_irq_exit(void)
*/
ktime_t tick_nohz_get_sleep_length(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->sleep_length;
}
@@ -912,7 +949,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
*/
void tick_nohz_idle_exit(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
local_irq_disable();
@@ -934,7 +971,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
@@ -947,7 +983,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
*/
static void tick_nohz_handler(struct clock_event_device *dev)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
@@ -956,6 +992,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -967,7 +1007,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
*/
static void tick_nohz_switch_to_nohz(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t next;
if (!tick_nohz_enabled)
@@ -1029,7 +1069,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
static inline void tick_nohz_irq_enter(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
if (!ts->idle_active && !ts->tick_stopped)
@@ -1083,6 +1123,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
if (regs)
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
@@ -1103,7 +1147,7 @@ early_param("skew_tick", skew_tick);
*/
void tick_setup_sched_timer(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
/*
@@ -1172,7 +1216,7 @@ void tick_clock_notify(void)
*/
void tick_oneshot_notify(void)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
set_bit(0, &ts->check_clocks);
}
@@ -1187,7 +1231,7 @@ void tick_oneshot_notify(void)
*/
int tick_check_oneshot_change(int allow_nohz)
{
- struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (!test_and_clear_bit(0, &ts->check_clocks))
return 0;
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000000..28b5da3e1a17
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+ TICKDEV_MODE_PERIODIC,
+ TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+ struct clock_event_device *evtdev;
+ enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+ NOHZ_MODE_INACTIVE,
+ NOHZ_MODE_LOWRES,
+ NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+ struct hrtimer sched_timer;
+ unsigned long check_clocks;
+ enum tick_nohz_mode nohz_mode;
+ ktime_t last_tick;
+ int inidle;
+ int tick_stopped;
+ unsigned long idle_jiffies;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+ int idle_active;
+ ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+ ktime_t sleep_length;
+ unsigned long last_jiffies;
+ unsigned long next_jiffies;
+ ktime_t idle_expires;
+ int do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#endif
diff --git a/kernel/time.c b/kernel/time/time.c
index 7c7964c33ae7..2c85b7724af4 100644
--- a/kernel/time.c
+++ b/kernel/time/time.c
@@ -42,6 +42,7 @@
#include <asm/unistd.h>
#include "timeconst.h"
+#include "timekeeping.h"
/*
* The timezone where the local system is located. Used as a default by some
@@ -195,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
if (tv) {
if (copy_from_user(&user_tv, tv, sizeof(*tv)))
return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
@@ -303,7 +308,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+/*
+ * mktime64 - Converts date to seconds.
+ * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
*
@@ -313,15 +320,10 @@ EXPORT_SYMBOL(timespec_trunc);
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines where long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
*/
-unsigned long
-mktime(const unsigned int year0, const unsigned int mon0,
- const unsigned int day, const unsigned int hour,
- const unsigned int min, const unsigned int sec)
+time64_t mktime64(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
{
unsigned int mon = mon0, year = year0;
@@ -331,15 +333,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
year -= 1;
}
- return ((((unsigned long)
+ return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
)*24 + hour /* now have hours */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
-
-EXPORT_SYMBOL(mktime);
+EXPORT_SYMBOL(mktime64);
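
The arithmetic is easy to sanity-check outside the kernel; this replica (types simplified, not the exported function) reproduces the month shift that puts February last and verifies a known epoch value:

#include <stdio.h>

/* User-space replica of the mktime64() arithmetic above */
static unsigned long long my_mktime64(unsigned int year, unsigned int mon,
                                      unsigned int day, unsigned int hour,
                                      unsigned int min, unsigned int sec)
{
        /* Shift Jan/Feb to months 11/12 of the previous year, so the
         * leap day lands at the end of the counting year. */
        if (0 >= (int)(mon -= 2)) {
                mon += 12;
                year -= 1;
        }

        return ((((unsigned long long)
                  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
                 )*24 + hour
                )*60 + min
               )*60 + sec;
}

int main(void)
{
        /* 2000-01-01 00:00:00 UTC is 946684800 seconds after the epoch */
        printf("%llu\n", my_mktime64(2000, 1, 1, 0, 0, 0));
        return 0;
}
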
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
+#if BITS_PER_LONG == 32
+/**
+ * set_normalized_timespec64 - set timespec64 sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec64 variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative!
+ */
+void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ asm("" : "+rm"(nsec));
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+EXPORT_SYMBOL(set_normalized_timespec64);
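/*
 * Editorial usage sketch of the normalization contract (illustrative
 * only, not part of the patch):
 */
static void normalization_example(void)
{
	struct timespec64 ts;

	set_normalized_timespec64(&ts, 5, -1);
	/* ts == { .tv_sec = 4, .tv_nsec = 999999999 } */

	set_normalized_timespec64(&ts, 0, 2 * (s64)NSEC_PER_SEC + 7);
	/* ts == { .tv_sec = 2, .tv_nsec = 7 } */
}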
+
+/**
+ * ns_to_timespec64 - Convert nanoseconds to timespec64
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec64 representation of the nsec parameter.
+ */
+struct timespec64 ns_to_timespec64(const s64 nsec)
+{
+ struct timespec64 ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec64) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+EXPORT_SYMBOL(ns_to_timespec64);
+#endif
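/*
 * Editorial sketch of the matching round-toward-minus-infinity behaviour
 * of ns_to_timespec64() (illustrative only):
 */
static void ns_conversion_example(void)
{
	struct timespec64 a, b;

	/* Negative nanoseconds borrow from tv_sec, keeping tv_nsec >= 0. */
	a = ns_to_timespec64(-1);
	/* a == { .tv_sec = -1, .tv_nsec = 999999999 } */

	b = ns_to_timespec64((s64)NSEC_PER_SEC + 1);
	/* b == { .tv_sec = 1, .tv_nsec = 1 } */
}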
/*
* When we convert to jiffies then we interpret incoming values
* the following way:
@@ -496,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundaries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
+ * Note that due to the small error in the multiplier here, this
+ * rounding is incorrect for sufficiently large values of tv_nsec, but
+ * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
+ * OK.
*
* Rather, we just shift the bits off the right.
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
*/
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- unsigned long sec = value->tv_sec;
- long nsec = value->tv_nsec + TICK_NSEC - 1;
+ nsec = nsec + TICK_NSEC - 1;
if (sec >= MAX_SEC_IN_JIFFIES){
sec = MAX_SEC_IN_JIFFIES;
@@ -517,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value)
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
+
+unsigned long
+timespec_to_jiffies(const struct timespec *value)
+{
+ return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+
EXPORT_SYMBOL(timespec_to_jiffies);
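/*
 * Editorial illustration of the round-up behaviour (assuming HZ == 1000,
 * i.e. TICK_NSEC == 1000000; not part of the patch):
 */
static void jiffies_rounding_example(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1 };

	/* Any nonzero fraction of a tick rounds up to a whole jiffy. */
	pr_info("1 ns -> %lu jiffies\n", timespec_to_jiffies(&ts)); /* 1 */
}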
void
@@ -533,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
}
EXPORT_SYMBOL(jiffies_to_timespec);
-/* Same for "timeval"
+/*
+ * We could use a similar algorithm to timespec_to_jiffies (with a
+ * different multiplier for usec instead of nsec). But this has a
+ * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
+ * usec value, since it's not necessarily integral.
*
- * Well, almost. The problem here is that the real system resolution is
- * in nanoseconds and the value being converted is in micro seconds.
- * Also for some machines (those that use HZ = 1024, in-particular),
- * there is a LARGE error in the tick size in microseconds.
-
- * The solution we use is to do the rounding AFTER we convert the
- * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
- * Instruction wise, this should cost only an additional add with carry
- * instruction above the way it was done above.
+ * We could instead round in the intermediate scaled representation
+ * (i.e. in units of 1/2^(large scale) jiffies) but that's also
+ * perilous: the scaling introduces a small positive error, which
+ * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
+ * units to the intermediate before shifting) leads to accidental
+ * overflow and overestimates.
+ *
+ * At the cost of one additional multiplication by a constant, just
+ * use the timespec implementation.
*/
unsigned long
timeval_to_jiffies(const struct timeval *value)
{
- unsigned long sec = value->tv_sec;
- long usec = value->tv_usec;
-
- if (sec >= MAX_SEC_IN_JIFFIES){
- sec = MAX_SEC_IN_JIFFIES;
- usec = 0;
- }
- return (((u64)sec * SEC_CONVERSION) +
- (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
- (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+ return __timespec_to_jiffies(value->tv_sec,
+ value->tv_usec * NSEC_PER_USEC);
}
EXPORT_SYMBOL(timeval_to_jiffies);
@@ -676,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n)
return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
+EXPORT_SYMBOL(nsecs_to_jiffies64);
/**
* nsecs_to_jiffies - Convert nsecs in u64 to jiffies
@@ -694,6 +764,7 @@ unsigned long nsecs_to_jiffies(u64 n)
{
return (unsigned long)nsecs_to_jiffies64(n);
}
+EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
/*
* Add two timespec values and do a safety check for overflow.
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..511bdf2cafda 100644
--- a/kernel/timeconst.bc
+++ b/kernel/time/timeconst.bc
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
+/*
+ * linux/kernel/time/timecounter.c
+ *
+ * based on code that migrated away from
+ * linux/kernel/time/clocksource.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/export.h>
+#include <linux/timecounter.h>
+
+void timecounter_init(struct timecounter *tc,
+ const struct cyclecounter *cc,
+ u64 start_tstamp)
+{
+ tc->cc = cc;
+ tc->cycle_last = cc->read(cc);
+ tc->nsec = start_tstamp;
+ tc->mask = (1ULL << cc->shift) - 1;
+ tc->frac = 0;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+ cycle_t cycle_now, cycle_delta;
+ u64 ns_offset;
+
+ /* read cycle counter: */
+ cycle_now = tc->cc->read(tc->cc);
+
+ /* calculate the delta since the last timecounter_read_delta(): */
+ cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
+ tc->mask, &tc->frac);
+
+ /* update time stamp of timecounter_read_delta() call: */
+ tc->cycle_last = cycle_now;
+
+ return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+ u64 nsec;
+
+ /* increment time by nanoseconds since last call */
+ nsec = timecounter_read_delta(tc);
+ nsec += tc->nsec;
+ tc->nsec = nsec;
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+/*
+ * This is like cyclecounter_cyc2ns(), but it is used for computing a
+ * time previous to the time stored in the cycle counter.
+ */
+static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
+ cycle_t cycles, u64 mask, u64 frac)
+{
+ u64 ns = (u64) cycles;
+
+ ns = ((ns * cc->mult) - frac) >> cc->shift;
+
+ return ns;
+}
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+ cycle_t cycle_tstamp)
+{
+ u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+ u64 nsec = tc->nsec, frac = tc->frac;
+
+ /*
+ * Instead of always treating cycle_tstamp as more recent
+ * than tc->cycle_last, detect when it is too far in the
+ * future and treat it as old time stamp instead.
+ */
+ if (delta > tc->cc->mask / 2) {
+ delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+ nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
+ } else {
+ nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
+ }
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
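/*
 * Editorial sketch of a typical consumer: a driver wrapping a free-running
 * hardware counter. The my_* names and the register layout are
 * hypothetical, not part of this patch.
 */
#include <linux/io.h>
#include <linux/timecounter.h>
#include <linux/timekeeping.h>

static void __iomem *my_regs;	/* hypothetical MMIO base, mapped at probe */

static cycle_t my_cc_read(const struct cyclecounter *cc)
{
	return readl(my_regs);	/* free-running 32-bit counter register */
}

static const struct cyclecounter my_cc = {
	.read	= my_cc_read,
	.mask	= CYCLECOUNTER_MASK(32),
	.mult	= 1 << 21,	/* 1 GHz counter: cyc * mult >> shift == ns */
	.shift	= 21,
};

static struct timecounter my_tc;

static void my_time_init(void)
{
	/* Start the 64-bit nanosecond clock at the current wall time. */
	timecounter_init(&my_tc, &my_cc, ktime_get_real_ns());
}

static u64 my_get_ns(void)
{
	/* Correct as long as this runs at least once per counter wrap. */
	return timecounter_read(&my_tc);
}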
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 32d8d6aaedb8..946acb72179f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,63 +32,200 @@
#define TK_MIRROR (1 << 1)
#define TK_CLOCK_WAS_SET (1 << 2)
-static struct timekeeper timekeeper;
+/*
+ * The most important data for readout fits into a single 64 byte
+ * cache line.
+ */
+static struct {
+ seqcount_t seq;
+ struct timekeeper timekeeper;
+} tk_core ____cacheline_aligned;
+
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
-static seqcount_t timekeeper_seq;
static struct timekeeper shadow_timekeeper;
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq: Sequence counter for protecting updates. The lowest bit
+ * is the index for the tk_read_base array
+ * @base: tk_read_base array. Access is indexed by the lowest bit of
+ * @seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+ seqcount_t seq;
+ struct tk_read_base base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_raw ____cacheline_aligned;
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-/* Flag for if there is a persistent clock on this platform */
-bool __read_mostly persistent_clock_exist = false;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
- tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+ while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+ tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
}
-static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
+static inline struct timespec64 tk_xtime(struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+ return ts;
+}
+
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
+ tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
-static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+ tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
-static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
- struct timespec tmp;
+ struct timespec64 tmp;
/*
* Verify consistency of: offset_real = -wall_to_monotonic
* before modifying anything
*/
- set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
+ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
-tk->wall_to_monotonic.tv_nsec);
- WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
+ WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
tk->wall_to_monotonic = wtm;
- set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
- tk->offs_real = timespec_to_ktime(tmp);
+ set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+ tk->offs_real = timespec64_to_ktime(tmp);
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
-static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
+static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
+{
+ tk->offs_boot = ktime_add(tk->offs_boot, delta);
+}
+
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
+/*
+ * These simple flag variables are managed
+ * without locks, which is racy, but ok since
+ * we don't really care about being super
+ * precise about how many events were seen,
+ * just that a problem was observed.
+ */
+static int timekeeping_underflow_seen;
+static int timekeeping_overflow_seen;
+
+/* last_warning is only modified under the timekeeping lock */
+static long timekeeping_last_warning;
+
+static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+
+ const char *name = tk->tkr_mono.clock->name;
+
+ if (offset > max_cycles) {
+ printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
+ offset, name, max_cycles);
+ printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
+ } else {
+ if (offset > (max_cycles >> 1)) {
+ printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+ offset, name, max_cycles >> 1);
+ printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
+ }
+ }
+
+ if (timekeeping_underflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_underflow_seen = 0;
+ }
+
+ if (timekeeping_overflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_overflow_seen = 0;
+ }
+}
+
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
{
- /* Verify consistency before modifying */
- WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
+ cycle_t now, last, mask, max, delta;
+ unsigned int seq;
+
+ /*
+ * Since we're called holding a seqlock, the data may shift
+ * under us while we're doing the calculation. This can cause
+ * false positives, since we'd note a problem but throw the
+ * results away. So nest another seqlock here to atomically
+ * grab the points we are checking with.
+ */
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ now = tkr->read(tkr->clock);
+ last = tkr->cycle_last;
+ mask = tkr->mask;
+ max = tkr->clock->max_cycles;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ delta = clocksource_delta(now, last, mask);
+
+ /*
+ * Try to catch underflows by checking if we are seeing small
+ * mask-relative negative values.
+ */
+ if (unlikely((~delta & mask) < (mask >> 3))) {
+ timekeeping_underflow_seen = 1;
+ delta = 0;
+ }
- tk->total_sleep_time = t;
- tk->offs_boot = timespec_to_ktime(t);
+ /* Cap delta to max_cycles to avoid mult overflows */
+ if (unlikely(delta > max)) {
+ timekeeping_overflow_seen = 1;
+ delta = tkr->clock->max_cycles;
+ }
+
+ return delta;
+}
+#else
+static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
}
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t cycle_now, delta;
+
+ /* read clocksource */
+ cycle_now = tkr->read(tkr->clock);
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+ return delta;
+}
+#endif
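/*
 * Editorial illustration of the underflow heuristic above, for a
 * clocksource that momentarily appears to step backwards (numbers are
 * made up; not part of the patch):
 */
static void underflow_heuristic_example(void)
{
	u64 mask = CLOCKSOURCE_MASK(32);
	u64 now = 100, last = 105;		/* counter "went back" by 5 */
	u64 delta = (now - last) & mask;	/* 0xfffffffb: huge, suspicious */

	/* ~delta & mask == 4, far below mask >> 3: flagged as underflow. */
	WARN_ON((~delta & mask) >= (mask >> 3));
}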
/**
* tk_setup_internals - Set up internals to use clocksource clock.
@@ -107,9 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->clock;
- tk->clock = clock;
- tk->cycle_last = clock->cycle_last = clock->read(clock);
+ old_clock = tk->tkr_mono.clock;
+ tk->tkr_mono.clock = clock;
+ tk->tkr_mono.read = clock->read;
+ tk->tkr_mono.mask = clock->mask;
+ tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+ tk->tkr_raw.clock = clock;
+ tk->tkr_raw.read = clock->read;
+ tk->tkr_raw.mask = clock->mask;
+ tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -133,77 +277,236 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->xtime_nsec >>= -shift_change;
+ tk->tkr_mono.xtime_nsec >>= -shift_change;
else
- tk->xtime_nsec <<= shift_change;
+ tk->tkr_mono.xtime_nsec <<= shift_change;
}
- tk->shift = clock->shift;
+ tk->tkr_raw.xtime_nsec = 0;
+
+ tk->tkr_mono.shift = clock->shift;
+ tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->mult = clock->mult;
+ tk->tkr_mono.mult = clock->mult;
+ tk->tkr_raw.mult = clock->mult;
+ tk->ntp_err_mult = 0;
}
/* Timekeeper helper functions. */
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-u32 (*arch_gettimeoffset)(void);
-
-u32 get_arch_timeoffset(void)
-{
- if (likely(arch_gettimeoffset))
- return arch_gettimeoffset();
- return 0;
-}
+static u32 default_arch_gettimeoffset(void) { return 0; }
+u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
#else
-static inline u32 get_arch_timeoffset(void) { return 0; }
+static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_get_ns(struct timekeeper *tk)
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ cycle_t delta;
s64 nsec;
- /* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ delta = timekeeping_get_delta(tkr);
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- nsec = cycle_delta * tk->mult + tk->xtime_nsec;
- nsec >>= tk->shift;
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
/* If arch requires, add in get_arch_timeoffset() */
- return nsec + get_arch_timeoffset();
+ return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tkr: Timekeeping readout base from which we take the update
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ * smp_wmb(); <- Ensure that the last base[1] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[0], tkr);
+ * smp_wmb(); <- Ensure that the base[0] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[1], tkr);
+ *
+ * The reader side does:
+ *
+ * do {
+ * seq = tkf->seq;
+ * smp_rmb();
+ * idx = seq & 0x01;
+ * now = now(tkf->base[idx]);
+ * smp_rmb();
+ * } while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if a NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result in a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
- s64 nsec;
+ struct tk_read_base *base = tkf->base;
- /* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ /* Force readers off to base[1] */
+ raw_write_seqcount_latch(&tkf->seq);
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ /* Update base[0] */
+ memcpy(base, tkr, sizeof(*base));
- /* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ /* Force readers back to base[0] */
+ raw_write_seqcount_latch(&tkf->seq);
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + get_arch_timeoffset();
+ /* Update base[1] */
+ memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ * now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * | o n
+ * | o n
+ * | u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able to observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
+}
+
+u64 ktime_get_mono_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_mono);
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
+u64 ktime_get_raw_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
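/*
 * Editorial sketch of the intended use: contexts where the retrying
 * readers above are forbidden, e.g. NMI handlers or tracing hooks that
 * instrument the timekeeping code itself (hypothetical callback):
 */
static void my_nmi_hook(void)
{
	u64 t0 = ktime_get_mono_fast_ns();

	/* ... NMI work; timestamps may jump slightly across an update ... */

	trace_printk("nmi section: %llu ns\n",
		     (unsigned long long)(ktime_get_mono_fast_ns() - t0));
}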
+
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended. It will return the same
+ * number of cycles every time until timekeeping is resumed, at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+ static struct tk_read_base tkr_dummy;
+ struct tk_read_base *tkr = &tk->tkr_mono;
+
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ cycles_at_suspend = tkr->read(tkr->clock);
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
+
+ tkr = &tk->tkr_raw;
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
+}
+
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+
+static inline void update_vsyscall(struct timekeeper *tk)
+{
+ struct timespec xt, wm;
+
+ xt = timespec64_to_timespec(tk_xtime(tk));
+ wm = timespec64_to_timespec(tk->wall_to_monotonic);
+ update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+ tk->tkr_mono.cycle_last);
+}
+
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistencies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
@@ -217,7 +520,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
int ret;
@@ -247,6 +550,39 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
+/*
+ * Update the ktime_t based scalar nsec members of the timekeeper
+ */
+static inline void tk_update_ktime_data(struct timekeeper *tk)
+{
+ u64 seconds;
+ u32 nsec;
+
+ /*
+ * The xtime based monotonic readout is:
+ * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
+ * The ktime based monotonic readout is:
+ * nsec = base_mono + now();
+ * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
+ */
+ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
+ nsec = (u32) tk->wall_to_monotonic.tv_nsec;
+ tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+
+ /* Update the monotonic raw base */
+ tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
+
+ /*
+ * The sum of the nanoseconds portions of xtime and
+ * wall_to_monotonic can be greater/equal one second. Take
+ * this into account before updating tk->ktime_sec.
+ */
+ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+ if (nsec >= NSEC_PER_SEC)
+ seconds++;
+ tk->ktime_sec = seconds;
+}
+
/* must hold timekeeper_lock */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
@@ -254,11 +590,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
tk->ntp_error = 0;
ntp_clear();
}
+
+ tk_update_ktime_data(tk);
+
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
if (action & TK_MIRROR)
- memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+ memcpy(&shadow_timekeeper, &tk_core.timekeeper,
+ sizeof(tk_core.timekeeper));
+
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
}
/**
@@ -270,49 +613,49 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
+ cycle_t cycle_now, delta;
s64 nsec;
- clock = tk->clock;
- cycle_now = clock->read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- tk->cycle_last = clock->cycle_last = cycle_now;
+ cycle_now = tk->tkr_mono.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
- tk->xtime_nsec += cycle_delta * tk->mult;
+ tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
+ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- timespec_add_ns(&tk->raw_time, nsec);
+ nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
+ timespec64_add_ns(&tk->raw_time, nsec);
}
/**
- * __getnstimeofday - Returns the time of day in a timespec.
+ * __getnstimeofday64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
* Updates the time of day in the timespec.
* Returns 0 on success, or -ve when suspended (timespec will be undefined).
*/
-int __getnstimeofday(struct timespec *ts)
+int __getnstimeofday64(struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs = 0;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_nsec = 0;
- timespec_add_ns(ts, nsecs);
+ timespec64_add_ns(ts, nsecs);
/*
* Do not bail out early, in case there were callers still using
@@ -322,116 +665,186 @@ int __getnstimeofday(struct timespec *ts)
return -EAGAIN;
return 0;
}
-EXPORT_SYMBOL(__getnstimeofday);
+EXPORT_SYMBOL(__getnstimeofday64);
/**
- * getnstimeofday - Returns the time of day in a timespec.
- * @ts: pointer to the timespec to be set
+ * getnstimeofday64 - Returns the time of day in a timespec64.
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the time of day in a timespec (WARN if suspended).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
-void getnstimeofday(struct timespec *ts)
+void getnstimeofday64(struct timespec64 *ts)
{
- WARN_ON(__getnstimeofday(ts));
+ WARN_ON(__getnstimeofday64(ts));
}
-EXPORT_SYMBOL(getnstimeofday);
+EXPORT_SYMBOL(getnstimeofday64);
ktime_t ktime_get(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
- s64 secs, nsecs;
+ ktime_t base;
+ s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
- /*
- * Use ktime_set/ktime_add_ns to create a proper ktime on
- * 32-bit architectures without CONFIG_KTIME_SCALAR.
- */
- return ktime_add_ns(ktime_set(secs, 0), nsecs);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
+static ktime_t *offsets[TK_OFFS_MAX] = {
+ [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
+ [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
+ [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
+};
+
+ktime_t ktime_get_with_offset(enum tk_offsets offs)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base, *offset = offsets[offs];
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+
+}
+EXPORT_SYMBOL_GPL(ktime_get_with_offset);
+
/**
- * ktime_get_ts - get the monotonic clock in timespec format
+ * ktime_mono_to_any() - convert monotonic time to any other time
+ * @tmono: time to convert.
+ * @offs: which offset to use
+ */
+ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
+{
+ ktime_t *offset = offsets[offs];
+ unsigned long seq;
+ ktime_t tconv;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ tconv = ktime_add(tmono, *offset);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return tconv;
+}
+EXPORT_SYMBOL_GPL(ktime_mono_to_any);
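/*
 * Editorial usage sketch: converting one monotonic timestamp to the
 * other clock bases (illustrative only):
 */
static void clock_bases_example(void)
{
	ktime_t mono = ktime_get();

	pr_info("boot=%lld real=%lld tai=%lld\n",
		ktime_to_ns(ktime_mono_to_any(mono, TK_OFFS_BOOT)),
		ktime_to_ns(ktime_mono_to_any(mono, TK_OFFS_REAL)),
		ktime_to_ns(ktime_mono_to_any(mono, TK_OFFS_TAI)));
}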
+
+/**
+ * ktime_get_raw - Returns the raw monotonic time in ktime_t format
+ */
+ktime_t ktime_get_raw(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ s64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->tkr_raw.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw);
+
+/**
+ * ktime_get_ts64 - get the monotonic clock in timespec64 format
* @ts: pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
+ * in normalized timespec64 format in the variable pointed to by @ts.
*/
-void ktime_get_ts(struct timespec *ts)
+void ktime_get_ts64(struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec tomono;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 tomono;
s64 nsec;
unsigned int seq;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_sec += tomono.tv_sec;
ts->tv_nsec = 0;
- timespec_add_ns(ts, nsec + tomono.tv_nsec);
+ timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
+EXPORT_SYMBOL_GPL(ktime_get_ts64);
/**
- * timekeeping_clocktai - Returns the TAI time of day in a timespec
- * @ts: pointer to the timespec to be set
+ * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
*
- * Returns the time of day in a timespec.
+ * Returns the seconds portion of CLOCK_MONOTONIC with a single
+ * non-serialized read. tk->ktime_sec is of type 'unsigned long' so this
+ * works on both 32 and 64 bit systems. On 32 bit systems the readout
+ * covers ~136 years of uptime, which should be enough to prevent
+ * premature wraparounds.
*/
-void timekeeping_clocktai(struct timespec *ts)
+time64_t ktime_get_seconds(void)
{
- struct timekeeper *tk = &timekeeper;
- unsigned long seq;
- u64 nsecs;
+ struct timekeeper *tk = &tk_core.timekeeper;
WARN_ON(timekeeping_suspended);
-
- do {
- seq = read_seqcount_begin(&timekeeper_seq);
-
- ts->tv_sec = tk->xtime_sec + tk->tai_offset;
- nsecs = timekeeping_get_ns(tk);
-
- } while (read_seqcount_retry(&timekeeper_seq, seq));
-
- ts->tv_nsec = 0;
- timespec_add_ns(ts, nsecs);
-
+ return tk->ktime_sec;
}
-EXPORT_SYMBOL(timekeeping_clocktai);
-
+EXPORT_SYMBOL_GPL(ktime_get_seconds);
/**
- * ktime_get_clocktai - Returns the TAI time of day in a ktime
+ * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
- * Returns the time of day in a ktime.
+ * Returns the wall clock seconds since 1970. This replaces the
+ * get_seconds() interface which is not y2038 safe on 32bit systems.
+ *
+ * For 64bit systems the fast access to tk->xtime_sec is preserved. On
+ * 32bit systems the access must be protected with the sequence
+ * counter to provide "atomic" access to the 64bit tk->xtime_sec
+ * value.
*/
-ktime_t ktime_get_clocktai(void)
+time64_t ktime_get_real_seconds(void)
{
- struct timespec ts;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ time64_t seconds;
+ unsigned int seq;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ return tk->xtime_sec;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ seconds = tk->xtime_sec;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- timekeeping_clocktai(&ts);
- return timespec_to_ktime(ts);
+ return seconds;
}
-EXPORT_SYMBOL(ktime_get_clocktai);
+EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
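/*
 * Editorial sketch of a caller migrating off get_seconds() (illustrative
 * only):
 */
static void log_wall_seconds(void)
{
	/* time64_t does not truncate on 32-bit systems past 2038. */
	time64_t now = ktime_get_real_seconds();

	pr_info("wall clock: %lld seconds since 1970\n", (long long)now);
}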
#ifdef CONFIG_NTP_PPS
@@ -446,23 +859,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
*/
void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
WARN_ON_ONCE(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- *ts_raw = tk->raw_time;
+ *ts_raw = timespec64_to_timespec(tk->raw_time);
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
- nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(tk);
+ nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
+ nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
@@ -479,45 +892,45 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
*/
void do_gettimeofday(struct timeval *tv)
{
- struct timespec now;
+ struct timespec64 now;
- getnstimeofday(&now);
+ getnstimeofday64(&now);
tv->tv_sec = now.tv_sec;
tv->tv_usec = now.tv_nsec/1000;
}
EXPORT_SYMBOL(do_gettimeofday);
/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
+ * do_settimeofday64 - Sets the time of day.
+ * @ts: pointer to the timespec64 variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(const struct timespec *tv)
+int do_settimeofday64(const struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec ts_delta, xt;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts_delta, xt;
unsigned long flags;
- if (!timespec_valid_strict(tv))
+ if (!timespec64_valid_strict(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
+ ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
- tk_set_xtime(tk, tv);
+ tk_set_xtime(tk, ts);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
@@ -525,7 +938,7 @@ int do_settimeofday(const struct timespec *tv)
return 0;
}
-EXPORT_SYMBOL(do_settimeofday);
+EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -535,33 +948,35 @@ EXPORT_SYMBOL(do_settimeofday);
*/
int timekeeping_inject_offset(struct timespec *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec tmp;
+ struct timespec64 ts64, tmp;
int ret = 0;
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
+ ts64 = timespec_to_timespec64(*ts);
+
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/* Make sure the proposed value is valid */
- tmp = timespec_add(tk_xtime(tk), *ts);
- if (!timespec_valid_strict(&tmp)) {
+ tmp = timespec64_add(tk_xtime(tk), ts64);
+ if (!timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
- tk_xtime_add(tk, ts);
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
+ tk_xtime_add(tk, &ts64);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
error: /* even if we error out, we forwarded the time, so call update */
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
@@ -578,14 +993,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
*/
s32 timekeeping_get_tai_offset(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
s32 ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tai_offset;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -606,14 +1021,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
*/
void timekeeping_set_tai_offset(s32 tai_offset)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
__timekeeping_set_tai_offset(tk, tai_offset);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clock_was_set();
}
@@ -625,14 +1040,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
*/
static int change_clocksource(void *data)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *new, *old;
unsigned long flags;
new = (struct clocksource *) data;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/*
@@ -641,7 +1056,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->clock;
+ old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -652,7 +1067,7 @@ static int change_clocksource(void *data)
}
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return 0;
@@ -667,68 +1082,56 @@ static int change_clocksource(void *data)
*/
int timekeeping_notify(struct clocksource *clock)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->clock == clock)
+ if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->clock == clock ? 0 : -1;
-}
-
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
- struct timespec now;
-
- getnstimeofday(&now);
-
- return timespec_to_ktime(now);
+ return tk->tkr_mono.clock == clock ? 0 : -1;
}
-EXPORT_SYMBOL_GPL(ktime_get_real);
/**
- * getrawmonotonic - Returns the raw monotonic time in a timespec
- * @ts: pointer to the timespec to be set
+ * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic(struct timespec *ts)
+void getrawmonotonic64(struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- nsecs = timekeeping_get_ns_raw(tk);
- *ts = tk->raw_time;
+ seq = read_seqcount_begin(&tk_core.seq);
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
+ ts64 = tk->raw_time;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- timespec_add_ns(ts, nsecs);
+ timespec64_add_ns(&ts64, nsecs);
+ *ts = ts64;
}
-EXPORT_SYMBOL(getrawmonotonic);
+EXPORT_SYMBOL(getrawmonotonic64);
+
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
int ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -738,16 +1141,16 @@ int timekeeping_valid_for_hres(void)
*/
u64 timekeeping_max_deferment(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
u64 ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->max_idle_ns;
+ ret = tk->tkr_mono.clock->max_idle_ns;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -767,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
/**
* read_boot_clock - Return time of the system start.
*
@@ -782,28 +1193,41 @@ void __weak read_boot_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_boot_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_boot_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
+/* Flag for if timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag for if there is a persistent clock on this platform */
+static bool persistent_clock_exists;
+
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
void __init timekeeping_init(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock;
unsigned long flags;
- struct timespec now, boot, tmp;
-
- read_persistent_clock(&now);
+ struct timespec64 now, boot, tmp;
- if (!timespec_valid_strict(&now)) {
+ read_persistent_clock64(&now);
+ if (!timespec64_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
} else if (now.tv_sec || now.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
- read_boot_clock(&boot);
- if (!timespec_valid_strict(&boot)) {
+ read_boot_clock64(&boot);
+ if (!timespec64_valid_strict(&boot)) {
pr_warn("WARNING: Boot clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
boot.tv_sec = 0;
@@ -811,7 +1235,7 @@ void __init timekeeping_init(void)
}
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
ntp_init();
clock = clocksource_default_clock();
@@ -825,21 +1249,17 @@ void __init timekeeping_init(void)
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
- set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
+ set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- tmp.tv_sec = 0;
- tmp.tv_nsec = 0;
- tk_set_sleep_time(tk, tmp);
-
- memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+ timekeeping_update(tk, TK_MIRROR);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
-/* time in seconds when suspend began */
-static struct timespec timekeeping_suspend_time;
+/* time in seconds when suspend began for persistent clock */
+static struct timespec64 timekeeping_suspend_time;
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -849,44 +1269,74 @@ static struct timespec timekeeping_suspend_time;
* adds the sleep offset to the timekeeping variables.
*/
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
- struct timespec *delta)
+ struct timespec64 *delta)
{
- if (!timespec_valid_strict(delta)) {
+ if (!timespec64_valid_strict(delta)) {
printk_deferred(KERN_WARNING
"__timekeeping_inject_sleeptime: Invalid "
"sleep delta value!\n");
return;
}
tk_xtime_add(tk, delta);
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
- tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
+ tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
tk_debug_account_sleep_time(delta);
}
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/**
- * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
- * @delta: pointer to a timespec delta value
+ * We have three kinds of time sources to use for sleep time
+ * injection, the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by the RTC subsystem.
+ * If the system has neither 1) nor 2), 3) is used as the fallback.
+ *
+ *
+ * If timekeeping has injected sleeptime via either 1) or 2),
+ * 3) becomes needless, so in this case we don't need to call
+ * rtc_resume(), and this is what timekeeping_rtc_skipresume()
+ * means.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+ return sleeptime_injected;
+}
+
+/**
+ * Whether 1) can be used is only determined during timekeeping_resume(),
+ * which is invoked after rtc_suspend(), so we can't reliably skip
+ * rtc_suspend() if the system has 1).
*
- * This hook is for architectures that cannot support read_persistent_clock
+ * But if system has 2), 2) will definitely be used, so in this
+ * case we don't need to call rtc_suspend(), and this is what
+ * timekeeping_rtc_skipsuspend() means.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+ return persistent_clock_exists;
+}
+
+/**
+ * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
+ * @delta: pointer to a timespec64 delta value
+ *
+ * This hook is for architectures that cannot support read_persistent_clock64
+ * because their RTC/persistent clock is only accessible when irqs are enabled,
+ * and also don't have an effective nonstop clocksource.
*
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
*/
-void timekeeping_inject_sleeptime(struct timespec *delta)
+void timekeeping_inject_sleeptime64(struct timespec64 *delta)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- /*
- * Make sure we don't set the clock twice, as timekeeping_resume()
- * already did it
- */
- if (has_persistent_clock())
- return;
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
@@ -894,36 +1344,33 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
}
+#endif
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
*/
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
{
- struct timekeeper *tk = &timekeeper;
- struct clocksource *clock = tk->clock;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
- struct timespec ts_new, ts_delta;
+ struct timespec64 ts_new, ts_delta;
cycle_t cycle_now, cycle_delta;
- bool suspendtime_found = false;
- read_persistent_clock(&ts_new);
+ sleeptime_injected = false;
+ read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
/*
* After system resumes, we need to calculate the suspended time and
@@ -937,15 +1384,16 @@ static void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = clock->read(clock);
+ cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > clock->cycle_last) {
+ cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -960,40 +1408,40 @@ static void timekeeping_resume(void)
}
nsec += ((u64) cycle_delta * mult) >> shift;
- ts_delta = ns_to_timespec(nsec);
- suspendtime_found = true;
- } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
- ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
- suspendtime_found = true;
+ ts_delta = ns_to_timespec64(nsec);
+ sleeptime_injected = true;
+ } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
+ ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
+ sleeptime_injected = true;
}
- if (suspendtime_found)
+ if (sleeptime_injected)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->cycle_last = clock->cycle_last = cycle_now;
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
touch_softlockup_watchdog();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
+ tick_resume();
hrtimers_resume();
}
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec delta, delta_delta;
- static struct timespec old_delta;
+ struct timespec64 delta, delta_delta;
+ static struct timespec64 old_delta;
- read_persistent_clock(&timekeeping_suspend_time);
+ read_persistent_clock64(&timekeeping_suspend_time);
/*
* On some systems the persistent_clock can not be detected at
@@ -1001,38 +1449,41 @@ static int timekeeping_suspend(void)
* value returned, update the persistent_clock_exists flag.
*/
if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
- /*
- * To avoid drift caused by repeated suspend/resumes,
- * which each can add ~1 second drift error,
- * try to compensate so the difference in system time
- * and persistent_clock time stays close to constant.
- */
- delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
- delta_delta = timespec_sub(delta, old_delta);
- if (abs(delta_delta.tv_sec) >= 2) {
+ if (persistent_clock_exists) {
/*
- * if delta_delta is too large, assume time correction
- * has occured and set old_delta to the current delta.
+ * To avoid drift caused by repeated suspend/resumes,
+ * which each can add ~1 second drift error,
+ * try to compensate so the difference in system time
+ * and persistent_clock time stays close to constant.
*/
- old_delta = delta;
- } else {
- /* Otherwise try to adjust old_system to compensate */
- timekeeping_suspend_time =
- timespec_add(timekeeping_suspend_time, delta_delta);
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * if delta_delta is too large, assume time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise try to adjust old_system to compensate */
+ timekeeping_suspend_time =
+ timespec64_add(timekeeping_suspend_time, delta_delta);
+ }
}
timekeeping_update(tk, TK_MIRROR);
- write_seqcount_end(&timekeeper_seq);
+ halt_fast_timekeeper(tk);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ tick_suspend();
clocksource_suspend();
clockevents_suspend();
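
The compensation logic above keeps the offset between system time and the persistent clock near constant across repeated suspends. A standalone sketch of the same delta/delta_delta bookkeeping, using whole seconds instead of timespec64 for brevity:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int64_t old_delta;	/* persists across suspends, as in the kernel */

static int64_t compensate(int64_t xtime, int64_t suspend_time)
{
	int64_t delta = xtime - suspend_time;
	int64_t delta_delta = delta - old_delta;

	if (llabs(delta_delta) >= 2) {
		/* Large jump: assume the clock was stepped, rebase. */
		old_delta = delta;
	} else {
		/* Small drift: fold it into the suspend time so the
		 * offset between the two clocks stays near constant. */
		suspend_time += delta_delta;
	}
	return suspend_time;
}

int main(void)
{
	printf("%lld\n", (long long)compensate(1000, 999));  /* 1000: drift folded in */
	printf("%lld\n", (long long)compensate(3000, 2990)); /* 2990: stepped, rebased */
	return 0;
}
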
@@ -1050,125 +1501,34 @@ static int __init timekeeping_init_ops(void)
register_syscore_ops(&timekeeping_syscore_ops);
return 0;
}
-
device_initcall(timekeeping_init_ops);
/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
+ * Apply a multiplier adjustment to the timekeeper
*/
-static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
- s64 error, s64 *interval,
- s64 *offset)
+static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
+ s64 offset,
+ bool negative,
+ int adj_scale)
{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adjusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
+ s64 interval = tk->cycle_interval;
+ s32 mult_adj = 1;
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
- tick_error -= tk->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
+ if (negative) {
+ mult_adj = -mult_adj;
+ interval = -interval;
+ offset = -offset;
}
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
-{
- s64 error, interval = tk->cycle_interval;
- int adj;
+ mult_adj <<= adj_scale;
+ interval <<= adj_scale;
+ offset <<= adj_scale;
/*
- * The point of this is to check if the error is greater than half
- * an interval.
- *
- * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
- *
- * Note we subtract one in the shift, so that error is really error*2.
- * This "saves" dividing(shifting) interval twice, but keeps the
- * (error > interval) comparison as still measuring if error is
- * larger than half an interval.
- *
- * Note: It does not "save" on aggravation when reading the code.
- */
- error = tk->ntp_error >> (tk->ntp_error_shift - 1);
- if (error > interval) {
- /*
- * We now divide error by 4(via shift), which checks if
- * the error is greater than twice the interval.
- * If it is greater, we need a bigadjust, if its smaller,
- * we can adjust by 1.
- */
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- } else {
- if (error < -interval) {
- /* See comment above, this is just switched for the negative */
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else {
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- }
- } else {
- goto out_adjust;
- }
- }
-
- if (unlikely(tk->clock->maxadj &&
- (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
- printk_deferred_once(KERN_WARNING
- "Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->clock->name, (long)tk->mult + adj,
- (long)tk->clock->mult + tk->clock->maxadj);
- }
- /*
* So the following can be confusing.
*
- * To keep things simple, lets assume adj == 1 for now.
+ * To keep things simple, let's assume mult_adj == 1 for now.
*
- * When adj != 1, remember that the interval and offset values
+ * When mult_adj != 1, remember that the interval and offset values
* have been appropriately scaled so the math is the same.
*
* The basic idea here is that we're increasing the multiplier
@@ -1212,12 +1572,85 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*
* XXX - TODO: Doc ntp_error calculation.
*/
- tk->mult += adj;
+ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
+ /* NTP adjustment caused clocksource mult overflow */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
- tk->xtime_nsec -= offset;
+ tk->tkr_mono.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+}
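
The overflow guard added before the multiplier bump relies on unsigned wraparound: for a positive adjustment, a wrapped sum comes out smaller than the adjustment itself. A standalone sketch of that check with illustrative values:

#include <stdint.h>
#include <stdio.h>

static int would_overflow(uint32_t mult, int32_t mult_adj)
{
	/* u32 addition wraps modulo 2^32; detect the wrap by comparing
	 * the sum against the (positive) adjustment. */
	return mult_adj > 0 && mult + (uint32_t)mult_adj < (uint32_t)mult_adj;
}

int main(void)
{
	printf("%d\n", would_overflow(0xfffffff0u, 0x20));	/* 1: wraps */
	printf("%d\n", would_overflow(1000, 1));		/* 0: safe  */
	return 0;
}
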
+
+/*
+ * Calculate the multiplier adjustment needed to match the frequency
+ * specified by NTP
+ */
+static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
+ s64 offset)
+{
+ s64 interval = tk->cycle_interval;
+ s64 xinterval = tk->xtime_interval;
+ s64 tick_error;
+ bool negative;
+ u32 adj;
+
+ /* Remove any current error adj from freq calculation */
+ if (tk->ntp_err_mult)
+ xinterval -= tk->cycle_interval;
+
+ tk->ntp_tick = ntp_tick_length();
+
+ /* Calculate current error per tick */
+ tick_error = ntp_tick_length() >> tk->ntp_error_shift;
+ tick_error -= (xinterval + tk->xtime_remainder);
+
+ /* Don't worry about correcting it if it's small */
+ if (likely((tick_error >= 0) && (tick_error <= interval)))
+ return;
+
+ /* preserve the direction of correction */
+ negative = (tick_error < 0);
+
+ /* Sort out the magnitude of the correction */
+ tick_error = abs(tick_error);
+ for (adj = 0; tick_error > interval; adj++)
+ tick_error >>= 1;
+
+ /* scale the corrections */
+ timekeeping_apply_adjustment(tk, offset, negative, adj);
+}
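
The magnitude loop in timekeeping_freqadjust() is a binary-log search: it finds the scale at which the halved error first fits within one interval, and that scale becomes adj_scale. A standalone sketch with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t interval = 1000;
	int64_t tick_error = 37000;	/* illustrative error per tick */
	uint32_t adj = 0;

	/* Halve the error until it fits within one interval; the number
	 * of halvings is the power-of-two scale of the correction. */
	while (tick_error > interval) {
		tick_error >>= 1;
		adj++;
	}
	printf("adj = %u, residual error = %lld\n",
	       adj, (long long)tick_error);	/* adj = 6, residual 578 */
	return 0;
}
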
+
+/*
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
+ */
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
+{
+ /* Correct for the current frequency error */
+ timekeeping_freqadjust(tk, offset);
+
+ /* Next make a small adjustment to fix any cumulative error */
+ if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
+ tk->ntp_err_mult = 1;
+ timekeeping_apply_adjustment(tk, offset, 0, 0);
+ } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
+ /* Undo any existing error adjustment */
+ timekeeping_apply_adjustment(tk, offset, 1, 0);
+ tk->ntp_err_mult = 0;
+ }
+
+ if (unlikely(tk->tkr_mono.clock->maxadj &&
+ (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+ > tk->tkr_mono.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+ (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
+ }
-out_adjust:
/*
* It may be possible that when we entered this function, xtime_nsec
* was very small. Further, if we're slightly speeding the clocksource
@@ -1232,12 +1665,11 @@ out_adjust:
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->xtime_nsec < 0)) {
- s64 neg = -(s64)tk->xtime_nsec;
- tk->xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+ tk->tkr_mono.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
-
}
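
The underflow fixup at the end of timekeeping_adjust() clamps the shifted nanosecond field and parks the deficit in ntp_error, to be paid back on a later pass. A standalone sketch of that step, with an illustrative error shift:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t xtime_nsec = -42;	/* shifted nanoseconds went negative */
	int64_t ntp_error = 0;
	uint32_t ntp_error_shift = 8;	/* illustrative */

	if (xtime_nsec < 0) {
		int64_t neg = -xtime_nsec;

		xtime_nsec = 0;			/* clamp to zero ... */
		ntp_error += neg << ntp_error_shift; /* ... and remember the debt */
	}
	printf("xtime_nsec=%lld ntp_error=%lld\n",
	       (long long)xtime_nsec, (long long)ntp_error);
	return 0;
}
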
/**
@@ -1250,26 +1682,26 @@ out_adjust:
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
- while (tk->xtime_nsec >= nsecps) {
+ while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
- tk->xtime_nsec -= nsecps;
+ tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
if (unlikely(leap)) {
- struct timespec ts;
+ struct timespec64 ts;
tk->xtime_sec += leap;
ts.tv_sec = leap;
ts.tv_nsec = 0;
tk_set_wall_to_mono(tk,
- timespec_sub(tk->wall_to_monotonic, ts));
+ timespec64_sub(tk->wall_to_monotonic, ts));
__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
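
Since xtime_nsec stores nanoseconds left-shifted by the clocksource shift, a "second" in that fixed-point representation is NSEC_PER_SEC << shift. A standalone sketch of the accumulation loop above, leaving out the leap-second handling:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint32_t shift = 4;			/* illustrative */
	uint64_t nsecps = NSEC_PER_SEC << shift;
	uint64_t xtime_nsec = 2 * nsecps + 123;	/* two whole seconds pending */
	uint64_t xtime_sec = 0;

	while (xtime_nsec >= nsecps) {
		xtime_nsec -= nsecps;
		xtime_sec++;
	}
	printf("sec=%llu remainder=%llu\n",
	       (unsigned long long)xtime_sec,
	       (unsigned long long)xtime_nsec);	/* 2, 123 */
	return 0;
}
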
@@ -1301,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->cycle_last += interval;
+ tk->tkr_mono.cycle_last += interval;
+ tk->tkr_raw.cycle_last += interval;
- tk->xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1317,48 +1750,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
- tk->ntp_error += ntp_tick_length() << shift;
+ tk->ntp_error += tk->ntp_tick << shift;
tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
(tk->ntp_error_shift + shift);
return offset;
}
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
- s64 remainder;
-
- /*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
- * users are removed, this can be killed.
- */
- remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
- tk->xtime_nsec -= remainder;
- tk->xtime_nsec += 1ULL << tk->shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
-
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
*/
void update_wall_time(void)
{
- struct clocksource *clock;
- struct timekeeper *real_tk = &timekeeper;
+ struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
int shift = 0, maxshift;
@@ -1371,18 +1776,20 @@ void update_wall_time(void)
if (unlikely(timekeeping_suspended))
goto out;
- clock = real_tk->clock;
-
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
+ offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval)
goto out;
+ /* Do some additional sanity checking */
+ timekeeping_check_update(real_tk, offset);
+
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
@@ -1418,9 +1825,7 @@ void update_wall_time(void)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- write_seqcount_begin(&timekeeper_seq);
- /* Update clock->cycle_last with the new value */
- clock->cycle_last = tk->cycle_last;
+ write_seqcount_begin(&tk_core.seq);
/*
* Update the real timekeeper.
*
@@ -1428,12 +1833,12 @@ void update_wall_time(void)
* requires changes to all other timekeeper usage sites as
* well, i.e. move the timekeeper pointer getter into the
* spinlocked/seqcount protected sections. And we trade this
- * memcpy under the timekeeper_seq against one before we start
+ * memcpy under the tk_core.seq against one before we start
* updating.
*/
memcpy(real_tk, tk, sizeof(*tk));
timekeeping_update(real_tk, clock_set);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (clock_set)
@@ -1442,95 +1847,28 @@ out:
}
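
The NO_HZ comment above refers to the logarithmic accumulation strategy: pending cycles are consumed in power-of-two multiples of the cycle interval, so even a long tickless sleep needs only a handful of iterations. A standalone sketch of the idea, simplified and without the ntp_error bookkeeping:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cycle_interval = 1000;
	uint64_t offset = 1000000;	/* ~1000 intervals pending */
	int shift = 0;

	/* Pick the largest power-of-two chunk that fits in the offset. */
	while ((cycle_interval << (shift + 1)) <= offset)
		shift++;

	while (offset >= cycle_interval) {
		offset -= cycle_interval << shift;
		printf("accumulated 2^%d intervals, offset now %llu\n",
		       shift, (unsigned long long)offset);
		/* Shrink the chunk as the remaining offset shrinks. */
		while (shift > 0 && (cycle_interval << shift) > offset)
			shift--;
	}
	return 0;
}
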
/**
- * getboottime - Return the real time of system boot.
- * @ts: pointer to the timespec to be set
+ * getboottime64 - Return the real time of system boot.
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the wall-time of boot in a timespec.
+ * Returns the wall-time of boot in a timespec64.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
* basically means that however wrong your real time clock is at boot time,
* you get the right time here).
*/
-void getboottime(struct timespec *ts)
+void getboottime64(struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec boottime = {
- .tv_sec = tk->wall_to_monotonic.tv_sec +
- tk->total_sleep_time.tv_sec,
- .tv_nsec = tk->wall_to_monotonic.tv_nsec +
- tk->total_sleep_time.tv_nsec
- };
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
- set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
+ *ts = ktime_to_timespec64(t);
}
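
The new getboottime64() derives boot wall time purely from offsets: with CLOCK_MONOTONIC as the base, realtime = mono + offs_real and boottime = mono + offs_boot, so the wall clock at boottime zero is offs_real - offs_boot. A standalone sketch with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t offs_real = 1700000000000000000LL;	/* ns, illustrative */
	int64_t offs_boot = 3600000000000LL;	/* 1h of suspend, illustrative */

	/* wall-clock time at the instant the boottime clock read zero */
	int64_t boot_wall = offs_real - offs_boot;

	printf("boot wall time: %lld ns\n", (long long)boot_wall);
	return 0;
}
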
-EXPORT_SYMBOL_GPL(getboottime);
-
-/**
- * get_monotonic_boottime - Returns monotonic time since boot
- * @ts: pointer to the timespec to be set
- *
- * Returns the monotonic time since boot in a timespec.
- *
- * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
- * includes the time spent in suspend.
- */
-void get_monotonic_boottime(struct timespec *ts)
-{
- struct timekeeper *tk = &timekeeper;
- struct timespec tomono, sleep;
- s64 nsec;
- unsigned int seq;
-
- WARN_ON(timekeeping_suspended);
-
- do {
- seq = read_seqcount_begin(&timekeeper_seq);
- ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
- tomono = tk->wall_to_monotonic;
- sleep = tk->total_sleep_time;
-
- } while (read_seqcount_retry(&timekeeper_seq, seq));
-
- ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
- ts->tv_nsec = 0;
- timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(get_monotonic_boottime);
-
-/**
- * ktime_get_boottime - Returns monotonic time since boot in a ktime
- *
- * Returns the monotonic time since boot in a ktime
- *
- * This is similar to CLOCK_MONTONIC/ktime_get, but also
- * includes the time spent in suspend.
- */
-ktime_t ktime_get_boottime(void)
-{
- struct timespec ts;
-
- get_monotonic_boottime(&ts);
- return timespec_to_ktime(ts);
-}
-EXPORT_SYMBOL_GPL(ktime_get_boottime);
-
-/**
- * monotonic_to_bootbased - Convert the monotonic time to boot based.
- * @ts: pointer to the timespec to be converted
- */
-void monotonic_to_bootbased(struct timespec *ts)
-{
- struct timekeeper *tk = &timekeeper;
-
- *ts = timespec_add(*ts, tk->total_sleep_time);
-}
-EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
+EXPORT_SYMBOL_GPL(getboottime64);
unsigned long get_seconds(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
return tk->xtime_sec;
}
@@ -1538,42 +1876,43 @@ EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
- return tk_xtime(tk);
+ return timespec64_to_timespec(tk_xtime(tk));
}
struct timespec current_kernel_time(void)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec now;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now;
unsigned long seq;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
now = tk_xtime(tk);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- return now;
+ return timespec64_to_timespec(now);
}
EXPORT_SYMBOL(current_kernel_time);
-struct timespec get_monotonic_coarse(void)
+struct timespec64 get_monotonic_coarse64(void)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec now, mono;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now, mono;
unsigned long seq;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
now = tk_xtime(tk);
mono = tk->wall_to_monotonic;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
+
return now;
}
@@ -1587,29 +1926,38 @@ void do_timer(unsigned long ticks)
}
/**
- * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
- * and sleep offsets.
- * @xtim: pointer to timespec to be set with xtime
- * @wtom: pointer to timespec to be set with wall_to_monotonic
- * @sleep: pointer to timespec to be set with time in suspend
+ * ktime_get_update_offsets_tick - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
+ *
+ * Returns monotonic time at last tick and various offsets
*/
-void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
- struct timespec *wtom, struct timespec *sleep)
+ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
{
- struct timekeeper *tk = &timekeeper;
- unsigned long seq;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- *xtim = tk_xtime(tk);
- *wtom = tk->wall_to_monotonic;
- *sleep = tk->total_sleep_time;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ base = tk->tkr_mono.base;
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
}
#ifdef CONFIG_HIGH_RES_TIMERS
/**
- * ktime_get_update_offsets - hrtimer helper
+ * ktime_get_update_offsets_now - hrtimer helper
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1617,57 +1965,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
* Returns current monotonic time and updates the offsets
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
+ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
ktime_t *offs_tai)
{
- struct timekeeper *tk = &timekeeper;
- ktime_t now;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
- u64 secs, nsecs;
+ ktime_t base;
+ u64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- secs = tk->xtime_sec;
- nsecs = timekeeping_get_ns(tk);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
*offs_tai = tk->offs_tai;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- now = ktime_add_ns(ktime_set(secs, 0), nsecs);
- now = ktime_sub(now, *offs_real);
- return now;
+ return ktime_add_ns(base, nsecs);
}
#endif
/**
- * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
- */
-ktime_t ktime_get_monotonic_offset(void)
-{
- struct timekeeper *tk = &timekeeper;
- unsigned long seq;
- struct timespec wtom;
-
- do {
- seq = read_seqcount_begin(&timekeeper_seq);
- wtom = tk->wall_to_monotonic;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
-
- return timespec_to_ktime(wtom);
-}
-EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
-
-/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex(struct timex *txc)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec ts;
+ struct timespec64 ts;
s32 orig_tai, tai;
int ret;
@@ -1687,10 +2015,10 @@ int do_adjtimex(struct timex *txc)
return ret;
}
- getnstimeofday(&ts);
+ getnstimeofday64(&ts);
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
orig_tai = tai = tk->tai_offset;
ret = __do_adjtimex(txc, &ts, &tai);
@@ -1699,7 +2027,7 @@ int do_adjtimex(struct timex *txc)
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
}
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (tai != orig_tai)
@@ -1719,11 +2047,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
__hardpps(phase_ts, raw_ts);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..ead8794b9a4e
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,29 @@
+#ifndef _KERNEL_TIME_TIMEKEEPING_H
+#define _KERNEL_TIME_TIMEKEEPING_H
+/*
+ * Internal interfaces for kernel/time/
+ */
+extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+
+extern int timekeeping_valid_for_hres(void);
+extern u64 timekeeping_max_deferment(void);
+extern int timekeeping_inject_offset(struct timespec *ts);
+extern s32 timekeeping_get_tai_offset(void);
+extern void timekeeping_set_tai_offset(s32 tai_offset);
+extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
+
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
+
+#endif
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 4d54f97558df..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
}
late_initcall(tk_debug_sleep_time_init);
-void tk_debug_account_sleep_time(struct timespec *t)
+void tk_debug_account_sleep_time(struct timespec64 *t)
{
sleep_time_bin[fls(t->tv_sec)]++;
}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
/*
* timekeeping debug functions
*/
+#include <linux/clocksource.h>
#include <linux/time.h>
#ifdef CONFIG_DEBUG_FS
-extern void tk_debug_account_sleep_time(struct timespec *t);
+extern void tk_debug_account_sleep_time(struct timespec64 *t);
#else
#define tk_debug_account_sleep_time(x)
#endif
+#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ cycle_t ret = (now - last) & mask;
+
+ return (s64) ret > 0 ? ret : 0;
+}
+#else
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ return (now - last) & mask;
+}
+#endif
+
#endif /* _TIMEKEEPING_INTERNAL_H */
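
clocksource_delta() masks the subtraction to the counter's width, so a counter that wrapped still yields the correct forward delta; the VALIDATE_LAST_CYCLE variant additionally clamps apparently negative deltas to zero, which matters for full-width 64-bit masks. A standalone sketch with a 32-bit mask and illustrative values:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	cycle_t ret = (now - last) & mask;

	/* With a sub-64-bit mask this clamp can never trigger; it guards
	 * full-width counters where a stale "now" makes ret negative. */
	return (int64_t)ret > 0 ? ret : 0;
}

int main(void)
{
	cycle_t mask = 0xffffffff;	/* 32-bit counter */

	/* counter wrapped between reads: delta is still 0x20 */
	printf("0x%llx\n", (unsigned long long)
	       clocksource_delta(0x10, 0xfffffff0, mask));
	return 0;
}
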
diff --git a/kernel/timer.c b/kernel/time/timer.c
index 3bb01a323b2a..2ece3aa5069c 100644
--- a/kernel/timer.c
+++ b/kernel/time/timer.c
@@ -82,6 +82,7 @@ struct tvec_base {
unsigned long next_timer;
unsigned long active_timers;
unsigned long all_timers;
+ int cpu;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -89,8 +90,18 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
+/*
+ * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
+ * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
+ * pointer to per-cpu entries because we don't know where we'll map the section,
+ * even for the boot cpu.
+ *
+ * And so we use boot_tvec_bases for the boot CPU and per-cpu __tvec_bases
+ * for the rest of them.
+ */
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
+
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
@@ -409,6 +420,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
base->next_timer = timer->expires;
}
base->all_timers++;
+
+ /*
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
+ *
+ * Spare the IPI for deferrable timers on idle targets though.
+ * The next busy ticks will take care of it. Except full dynticks
+ * require special care against races with idle_cpu(), let's deal
+ * with that later.
+ */
+ if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
}
#ifdef CONFIG_TIMER_STATS
@@ -638,7 +665,7 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+ struct tvec_base *base = raw_cpu_read(tvec_bases);
timer->entry.next = NULL;
timer->base = (void *)((unsigned long)base | flags);
@@ -948,22 +975,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
timer_set_base(timer, base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
- /*
- * Check whether the other CPU is in dynticks mode and needs
- * to be triggered to reevaluate the timer wheel.
- * We are protected against the other CPU fiddling
- * with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to stop its tick can not
- * evaluate the timer wheel.
- *
- * Spare the IPI for deferrable timers on idle targets though.
- * The next busy ticks will take care of it. Except full dynticks
- * require special care against races with idle_cpu(), lets deal
- * with that later.
- */
- if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
- wake_up_nohz_cpu(cpu);
-
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1026,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
+
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1376,15 +1389,14 @@ unsigned long get_next_timer_interrupt(unsigned long now)
void update_process_times(int user_tick)
{
struct task_struct *p = current;
- int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(cpu, user_tick);
+ rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
- irq_work_run();
+ irq_work_tick();
#endif
scheduler_tick();
run_posix_cpu_timers(p);
@@ -1532,63 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-static int init_timers_cpu(int cpu)
-{
- int j;
- struct tvec_base *base;
- static char tvec_base_done[NR_CPUS];
-
- if (!tvec_base_done[cpu]) {
- static char boot_done;
-
- if (boot_done) {
- /*
- * The APs use this path later in boot
- */
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
- if (!base)
- return -ENOMEM;
-
- /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
- if (WARN_ON(base != tbase_get_base(base))) {
- kfree(base);
- return -ENOMEM;
- }
- per_cpu(tvec_bases, cpu) = base;
- } else {
- /*
- * This is for the boot CPU - we use compile-time
- * static initialisation because per-cpu memory isn't
- * ready yet and because the memory allocators are not
- * initialised either.
- */
- boot_done = 1;
- base = &boot_tvec_bases;
- }
- spin_lock_init(&base->lock);
- tvec_base_done[cpu] = 1;
- } else {
- base = per_cpu(tvec_bases, cpu);
- }
-
-
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
- base->active_timers = 0;
- base->all_timers = 0;
- return 0;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
{
@@ -1630,55 +1585,86 @@ static void migrate_timers(int cpu)
migrate_timer_list(new_base, old_base->tv5.vec + i);
}
+ old_base->active_timers = 0;
+ old_base->all_timers = 0;
+
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
put_cpu_var(tvec_bases);
}
-#endif /* CONFIG_HOTPLUG_CPU */
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- long cpu = (long)hcpu;
- int err;
-
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = init_timers_cpu(cpu);
- if (err < 0)
- return notifier_from_errno(err);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
+ switch (action) {
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- migrate_timers(cpu);
+ migrate_timers((long)hcpu);
break;
-#endif
default:
break;
}
+
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
- .notifier_call = timer_cpu_notify,
-};
+static inline void timer_register_cpu_notifier(void)
+{
+ cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
+static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+{
+ int j;
-void __init init_timers(void)
+ BUG_ON(base != tbase_get_base(base));
+
+ base->cpu = cpu;
+ per_cpu(tvec_bases, cpu) = base;
+ spin_lock_init(&base->lock);
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD(base->tv5.vec + j);
+ INIT_LIST_HEAD(base->tv4.vec + j);
+ INIT_LIST_HEAD(base->tv3.vec + j);
+ INIT_LIST_HEAD(base->tv2.vec + j);
+ }
+ for (j = 0; j < TVR_SIZE; j++)
+ INIT_LIST_HEAD(base->tv1.vec + j);
+
+ base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
+}
+
+static void __init init_timer_cpus(void)
{
- int err;
+ struct tvec_base *base;
+ int local_cpu = smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == local_cpu)
+ base = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+ else
+ base = per_cpu_ptr(&__tvec_bases, cpu);
+#endif
+ init_timer_cpu(base, cpu);
+ }
+}
+
+void __init init_timers(void)
+{
/* ensure there are enough low bits for flags in timer->base pointer */
BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
- err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- BUG_ON(err != NOTIFY_OK);
-
+ init_timer_cpus();
init_timer_stats();
- register_cpu_notifier(&timers_nb);
+ timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 61ed862cdd37..e878c2e0ba45 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <linux/tick.h>
#include <asm/uaccess.h>
+#include "tick-internal.h"
struct timer_list_iter {
int cpu;
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
- SEQ_printf(m, " set_mode: ");
- print_name_offset(m, dev->set_mode);
- SEQ_printf(m, "\n");
+ if (dev->set_mode) {
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+ } else {
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
+ }
+ }
SEQ_printf(m, " event_handler: ");
print_name_offset(m, dev->event_handler);
diff --git a/kernel/torture.c b/kernel/torture.c
index 40bb511cca48..dd70993c266c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
/*
* Print online/offline testing statistics.
*/
-char *torture_onoff_stats(char *page)
+void torture_onoff_stats(void)
{
#ifdef CONFIG_HOTPLUG_CPU
- page += sprintf(page,
- "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
- n_online_successes, n_online_attempts,
- n_offline_successes, n_offline_attempts,
- min_online, max_online,
- min_offline, max_offline,
- sum_online, sum_offline, HZ);
+ pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
- return page;
}
EXPORT_SYMBOL_GPL(torture_onoff_stats);
@@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
*
* This must be called before the caller starts shutting down its own
* kthreads.
+ *
+ * Both torture_cleanup_begin() and torture_cleanup_end() must be called,
+ * in that order, to perform the cleanup correctly. They are separate
+ * because kthreads may still need to reference torture_type, so it is
+ * nulled out only after all other cleanup calls have completed.
*/
-bool torture_cleanup(void)
+bool torture_cleanup_begin(void)
{
mutex_lock(&fullstop_mutex);
if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
@@ -651,12 +654,17 @@ bool torture_cleanup(void)
torture_shuffle_cleanup();
torture_stutter_cleanup();
torture_onoff_cleanup();
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup_begin);
+
+void torture_cleanup_end(void)
+{
mutex_lock(&fullstop_mutex);
torture_type = NULL;
mutex_unlock(&fullstop_mutex);
- return false;
}
-EXPORT_SYMBOL_GPL(torture_cleanup);
+EXPORT_SYMBOL_GPL(torture_cleanup_end);
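
A sketch of how a caller would pair the two halves; the stubs below stand in for the kernel functions and the caller is hypothetical, not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel's torture_cleanup_begin()/_end(). */
static bool torture_cleanup_begin(void) { puts("begin"); return false; }
static void torture_cleanup_end(void)   { puts("end"); }

static void mytorture_cleanup(void)	/* hypothetical torture module */
{
	if (torture_cleanup_begin())
		return;		/* another shutdown already in flight */

	/* stop this module's own kthreads here, while torture_type
	 * is still valid for them to reference ... */

	torture_cleanup_end();	/* only now may torture_type be nulled */
}

int main(void) { mytorture_cleanup(); return 0; }
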
/*
* Is it time for the current torture test to stop?
@@ -708,7 +716,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
int ret = 0;
VERBOSE_TOROUT_STRING(m);
- *tp = kthread_run(fn, arg, s);
+ *tp = kthread_run(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
VERBOSE_TOROUT_ERRSTRING(f);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..3b9a48ae153a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_TRACE_MCOUNT_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -437,6 +432,14 @@ config UPROBE_EVENT
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on KPROBE_EVENT
+ bool
+ default y
+ help
+ This allows the user to attach BPF programs to kprobe events.
+
config PROBE_EVENTS
def_bool n
@@ -604,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config TRACE_ENUM_MAP_FILE
+ bool "Show enum mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum names instead
+ of their values. This can cause problems for user space tools that
+ use this string to parse the raw data, as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert the enum into its value. If this macro is used, then the
+ print fmt strings will have the enums converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums the kernel tried to convert.
+
+ This option is for debugging the enum conversions. A file is created
+ in the tracing directory called "enum_map" that will show the enum
+ names matched with their values and what trace event system they
+ belong to.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "enum_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
-CFLAGS_trace_selftest_dynamic.o = -pg
+CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
obj-y += trace_selftest_dynamic.o
endif
endif
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
@@ -52,9 +53,10 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
-ifeq ($(CONFIG_PM_RUNTIME),y)
+ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
endif
ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
-static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq,
- "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
- MAJOR(t->device), MINOR(t->device), iter->cpu,
- secs, nsec_rem, iter->ent->pid, act, rwbs);
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static int blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
}
-static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
{
const unsigned char *pdu_buf;
int pdu_len;
- int i, end, ret;
+ int i, end;
pdu_buf = pdu_start(ent);
pdu_len = te_blk_io_trace(ent)->pdu_len;
if (!pdu_len)
- return 1;
+ return;
/* find the last zero that needs to be printed */
for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
break;
end++;
- if (!trace_seq_putc(s, '('))
- return 0;
+ trace_seq_putc(s, '(');
for (i = 0; i < pdu_len; i++) {
- ret = trace_seq_printf(s, "%s%02x",
- i == 0 ? "" : " ", pdu_buf[i]);
- if (!ret)
- return ret;
+ trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
/*
* stop when the rest is just zeroes and indicate so
* with a ".." appended
*/
- if (i == end && end != pdu_len - 1)
- return trace_seq_puts(s, " ..) ");
+ if (i == end && end != pdu_len - 1) {
+ trace_seq_puts(s, " ..) ");
+ return;
+ }
}
- return trace_seq_puts(s, ") ");
+ trace_seq_puts(s, ") ");
}
-static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = trace_seq_printf(s, "%u ", t_bytes(ent));
- if (!ret)
- return 0;
- ret = blk_log_dump_pdu(s, ent);
- if (!ret)
- return 0;
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "%u ", t_bytes(ent));
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%s]\n",
+ trace_seq_printf(s, "%llu + %u [%s]\n",
t_sector(ent), t_sec(ent), cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ else
+ trace_seq_printf(s, "[%s]\n", cmd);
}
}
-static int blk_log_with_error(struct trace_seq *s,
+static void blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = blk_log_dump_pdu(s, ent);
- if (ret)
- return trace_seq_printf(s, "[%d]\n", t_error(ent));
- return 0;
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%d]\n",
- t_sector(ent),
- t_sec(ent), t_error(ent));
- return trace_seq_printf(s, "%llu [%d]\n",
- t_sector(ent), t_error(ent));
+ trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ else
+ trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
}
}
-static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
{
struct blk_io_trace_remap r = { .device_from = 0, };
get_pdu_remap(ent, &r);
- return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
- t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
}
-static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "[%s]\n", cmd);
}
-static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
}
-static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
}
-static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
{
- int ret;
const struct blk_io_trace *t = te_blk_io_trace(ent);
- ret = trace_seq_putmem(s, t + 1, t->pdu_len);
- if (ret)
- return trace_seq_putc(s, '\n');
- return ret;
+ trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putc(s, '\n');
}
/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- int (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
- int ret;
bool long_act;
blk_log_action_t *log_action;
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
- ret = log_action(iter, long_act ? "message" : "m");
- if (ret)
- ret = blk_log_msg(s, iter->ent);
- goto out;
+ log_action(iter, long_act ? "message" : "m");
+ blk_log_msg(s, iter->ent);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
- ret = trace_seq_printf(s, "Unknown action %x\n", what);
+ trace_seq_printf(s, "Unknown action %x\n", what);
else {
- ret = log_action(iter, what2act[what].act[long_act]);
- if (ret)
- ret = what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act]);
+ what2act[what].print(s, iter->ent);
}
-out:
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
return print_one_line(iter, false);
}
-static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
.time = iter->ts,
};
- if (!trace_seq_putmem(s, &old, offset))
- return 0;
- return trace_seq_putmem(s, &t->sector,
- sizeof(old) - offset + t->pdu_len);
+ trace_seq_putmem(s, &old, offset);
+ trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
}
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- return blk_trace_synthesize_old_trace(iter) ?
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ blk_trace_synthesize_old_trace(iter);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
- spin_lock_irq(&running_trace_lock);
- list_del(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..2d56ce501632
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+/**
+ * trace_call_bpf - invoke BPF program
+ * @prog: BPF program
+ * @ctx: opaque context pointer
+ *
+ * kprobe handlers execute BPF programs via this helper.
+ * Can be used from static tracepoints in the future.
+ *
+ * Return: BPF programs always return an integer which is interpreted by
+ * kprobe handler as:
+ * 0 - return from kprobe (event is filtered out)
+ * 1 - store kprobe event into ring buffer
+ * Other values are reserved and currently alias to 1
+ */
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+ unsigned int ret;
+
+ if (in_nmi()) /* not supported yet */
+ return 1;
+
+ preempt_disable();
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ /*
+ * since some bpf program is already running on this cpu,
+ * don't call into another bpf program (same or different)
+ * and don't send a kprobe event into the ring buffer,
+ * so return zero here
+ */
+ ret = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, ctx);
+ rcu_read_unlock();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
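
The per-cpu bpf_prog_active counter is what prevents recursive program invocation. A standalone, single-threaded sketch of the same guard, where a plain global stands in for the per-cpu variable and a constant stands in for BPF_PROG_RUN():

#include <stdio.h>

static int bpf_prog_active;	/* per-cpu in the kernel */

static unsigned int run_prog(void)
{
	unsigned int ret;

	if (++bpf_prog_active != 1) {
		/* nested invocation on this cpu: filter the event out */
		ret = 0;
		goto out;
	}
	ret = 1;	/* stand-in for BPF_PROG_RUN(prog, ctx) */
out:
	bpf_prog_active--;
	return ret;
}

int main(void)
{
	printf("%u\n", run_prog());	/* 1: event stored */

	bpf_prog_active = 1;		/* simulate a nested call */
	printf("%u\n", run_prog());	/* 0: event filtered out */
	return 0;
}
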
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *dst = (void *) (long) r1;
+ int size = (int) r2;
+ void *unsafe_ptr = (void *) (long) r3;
+
+ return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static const struct bpf_func_proto bpf_probe_read_proto = {
+ .func = bpf_probe_read,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+/*
+ * limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+ char *fmt = (char *) (long) r1;
+ int mod[3] = {};
+ int fmt_cnt = 0;
+ int i;
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ return -EINVAL;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+ return -EINVAL;
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt_cnt >= 3)
+ return -EINVAL;
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ } else if (fmt[i] == 'p') {
+ mod[fmt_cnt]++;
+ i++;
+ if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+ return -EINVAL;
+ fmt_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ }
+
+ if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ return -EINVAL;
+ fmt_cnt++;
+ }
+
+ return __trace_printk(1/* fake ip will not be printed */, fmt,
+ mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
+ mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
+ mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+}
+
+static const struct bpf_func_proto bpf_trace_printk_proto = {
+ .func = bpf_trace_printk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_probe_read:
+ return &bpf_probe_read_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+
+ case BPF_FUNC_trace_printk:
+ /*
+ * this program might be calling bpf_trace_printk,
+ * so allocate per-cpu printk buffers
+ */
+ trace_printk_init_buffers();
+
+ return &bpf_trace_printk_proto;
+ default:
+ return NULL;
+ }
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
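
A standalone sketch of the same three access checks, using a dummy pt_regs since the real layout is per-architecture:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct pt_regs { unsigned long regs[21]; };	/* illustrative layout */

enum bpf_access_type { BPF_READ, BPF_WRITE };

static bool valid_access(int off, int size, enum bpf_access_type type)
{
	if (off < 0 || off >= (int)sizeof(struct pt_regs))
		return false;	/* out of bounds */
	if (type != BPF_READ)
		return false;	/* only reads allowed */
	if (off % size != 0)
		return false;	/* misaligned */
	return true;
}

int main(void)
{
	printf("%d\n", valid_access(8, 8, BPF_READ));	/* 1 */
	printf("%d\n", valid_access(4, 8, BPF_READ));	/* 0: misaligned */
	printf("%d\n", valid_access(8, 8, BPF_WRITE));	/* 0: write */
	return 0;
}
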
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+ .get_func_proto = kprobe_prog_func_proto,
+ .is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_prog_ops,
+ .type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ac9d1dad630b..02bece4a99ea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
@@ -65,24 +65,27 @@
#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
#ifdef CONFIG_DYNAMIC_FTRACE
-#define INIT_REGEX_LOCK(opsname) \
- .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#define INIT_OPS_HASH(opsname) \
+ .func_hash = &opsname.local_hash, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#define ASSIGN_OPS_HASH(opsname, val) \
+ .func_hash = val, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
#else
-#define INIT_REGEX_LOCK(opsname)
+#define INIT_OPS_HASH(opsname)
+#define ASSIGN_OPS_HASH(opsname, val)
#endif
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ INIT_OPS_HASH(ftrace_list_end)
};
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
-/* Quick disabling of function tracer. */
-int function_trace_stop __read_mostly;
-
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
@@ -110,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
static struct ftrace_ops control_ops;
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs);
@@ -143,7 +149,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
- mutex_init(&ops->regex_lock);
+ mutex_init(&ops->local_hash.regex_lock);
+ ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
@@ -242,23 +249,42 @@ static void update_function_graph_func(void);
static inline void update_function_graph_func(void) { }
#endif
+
+static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ return ftrace_ops_get_func(ops);
+}
+
static void update_ftrace_function(void)
{
ftrace_func_t func;
/*
+ * Prepare the ftrace_ops that the arch callback will use.
+ * If there's only one ftrace_ops registered, the ftrace_ops_list
+ * will point to the ops we want.
+ */
+ set_function_trace_op = ftrace_ops_list;
+
+ /* If there's no ftrace_ops registered, just call the stub function */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ func = ftrace_stub;
+
+ /*
* If we are at the end of the list and this ops is
* recursion safe and not dynamic and the arch supports passing ops,
* then have the mcount trampoline call the function directly.
*/
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list->next == &ftrace_list_end &&
- !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
- (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
- !FTRACE_FORCE_LIST_FUNC)) {
- /* Set the ftrace_ops that the arch callback uses */
- set_function_trace_op = ftrace_ops_list;
- func = ftrace_ops_list->func;
+ } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ func = ftrace_ops_get_list_func(ftrace_ops_list);
+
} else {
/* Just use the default ftrace_ops */
set_function_trace_op = &ftrace_list_end;
@@ -374,6 +400,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
return ret;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
static int __register_ftrace_function(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -403,9 +431,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
+ /* The control_ops needs the trampoline update */
+ ops = &control_ops;
} else
add_ftrace_ops(&ftrace_ops_list, ops);
+ ftrace_update_trampoline(ops);
+
if (ftrace_enabled)
update_ftrace_function();
@@ -552,13 +584,13 @@ static int function_stat_cmp(void *p1, void *p2)
static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " Function "
- "Hit Time Avg s^2\n"
- " -------- "
- "--- ---- --- ---\n");
+ seq_puts(m, " Function "
+ "Hit Time Avg s^2\n"
+ " -------- "
+ "--- ---- --- ---\n");
#else
- seq_printf(m, " Function Hit\n"
- " -------- ---\n");
+ seq_puts(m, " Function Hit\n"
+ " -------- ---\n");
#endif
return 0;
}
@@ -585,7 +617,7 @@ static int function_stat_show(struct seq_file *m, void *v)
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " ");
+ seq_puts(m, " ");
avg = rec->time;
do_div(avg, rec->counter);
@@ -902,7 +934,7 @@ static void unregister_ftrace_profiler(void)
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(ftrace_profile_ops)
+ INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
@@ -989,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = {
.stat_show = function_stat_show
};
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
struct dentry *entry;
@@ -1025,23 +1057,37 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
}
}
- entry = debugfs_create_file("function_profile_enabled", 0644,
+ entry = tracefs_create_file("function_profile_enabled", 0644,
d_tracer, NULL, &ftrace_profile_fops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'function_profile_enabled' entry\n");
}
#else /* CONFIG_FUNCTION_PROFILER */
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
}
#endif /* CONFIG_FUNCTION_PROFILER */
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_graph_active;
+#else
+# define ftrace_graph_active 0
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
+static struct ftrace_ops *removed_ops;
+
+/*
+ * Set when doing a global update, like enabling all recs or disabling them.
+ * It is not set when just updating a single ftrace_ops.
+ */
+static bool update_all_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1082,13 +1128,51 @@ static const struct ftrace_hash empty_hash = {
#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
static struct ftrace_ops global_ops = {
- .func = ftrace_stub,
- .notrace_hash = EMPTY_HASH,
- .filter_hash = EMPTY_HASH,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
+ .func = ftrace_stub,
+ .local_hash.notrace_hash = EMPTY_HASH,
+ .local_hash.filter_hash = EMPTY_HASH,
+ INIT_OPS_HASH(global_ops)
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED,
};
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ struct ftrace_ops *op;
+ bool ret = false;
+
+ /*
+	 * Some of the ops may be dynamically allocated;
+	 * they are freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This is to check for dynamically allocated trampolines.
+ * Trampolines that are in kernel text will have
+ * core_kernel_text() return true.
+ */
+ if (op->trampoline && op->trampoline_size)
+ if (addr >= op->trampoline &&
+ addr < op->trampoline + op->trampoline_size) {
+ ret = true;
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -1227,8 +1311,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
- free_ftrace_hash(ops->filter_hash);
- free_ftrace_hash(ops->notrace_hash);
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
}
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
@@ -1289,9 +1373,12 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
}
static void
-ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
-ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash);
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
@@ -1300,29 +1387,23 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_func_entry *entry;
struct hlist_node *tn;
struct hlist_head *hhd;
- struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
int ret;
int i;
- /*
- * Remove the current set, update the hash and add
- * them back.
- */
- ftrace_hash_rec_disable(ops, enable);
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
/*
* If the new source is empty, just free dst and assign it
* the empty_hash.
*/
if (!src->count) {
- free_ftrace_hash_rcu(*dst);
- rcu_assign_pointer(*dst, EMPTY_HASH);
- /* still need to update the function records */
- ret = 0;
- goto out;
+ new_hash = EMPTY_HASH;
+ goto update;
}
/*
@@ -1335,10 +1416,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
if (bits > FTRACE_HASH_MAX_BITS)
bits = FTRACE_HASH_MAX_BITS;
- ret = -ENOMEM;
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- goto out;
+ return -ENOMEM;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1349,20 +1429,43 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
- old_hash = *dst;
- rcu_assign_pointer(*dst, new_hash);
- free_ftrace_hash_rcu(old_hash);
+update:
+ /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
+ if (enable) {
+ /* IPMODIFY should be updated only when filter_hash updating */
+ ret = ftrace_hash_ipmodify_update(ops, new_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return ret;
+ }
+ }
- ret = 0;
- out:
/*
- * Enable regardless of ret:
- * On success, we enable the new hash.
- * On failure, we re-enable the original hash.
+ * Remove the current set, update the hash and add
+ * them back.
*/
- ftrace_hash_rec_enable(ops, enable);
+ ftrace_hash_rec_disable_modify(ops, enable);
- return ret;
+ rcu_assign_pointer(*dst, new_hash);
+
+ ftrace_hash_rec_enable_modify(ops, enable);
+
+ return 0;
+}
+
+static bool hash_contains_ip(unsigned long ip,
+ struct ftrace_ops_hash *hash)
+{
+ /*
+ * The function record is a match if it exists in the filter
+	 * hash and not in the notrace hash. Note, an empty hash is
+ * considered a match for the filter hash, but an empty
+ * notrace hash is considered not in the notrace hash.
+ */
+ return (ftrace_hash_empty(hash->filter_hash) ||
+ ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ (ftrace_hash_empty(hash->notrace_hash) ||
+ !ftrace_lookup_ip(hash->notrace_hash, ip));
}
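
Spelled out, the test above yields the following behaviour ("hit" meaning ftrace_lookup_ip() finds the record's ip in that hash; an illustrative summary, not patch text):

	/*
	 * filter hash | notrace hash | hash_contains_ip()
	 * ------------+--------------+--------------------------
	 * empty       | empty        | true  (trace everything)
	 * hit         | empty        | true
	 * miss        | empty        | false
	 * hit         | hit          | false (notrace wins)
	 * hit         | miss         | true
	 */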
/*
@@ -1380,8 +1483,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
static int
ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_ops_hash hash;
int ret;
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1394,13 +1496,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
- notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
+ hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
- if ((ftrace_hash_empty(filter_hash) ||
- ftrace_lookup_ip(filter_hash, ip)) &&
- (ftrace_hash_empty(notrace_hash) ||
- !ftrace_lookup_ip(notrace_hash, ip)))
+ if (hash_contains_ip(ip, &hash))
ret = 1;
else
ret = 0;
@@ -1492,6 +1591,26 @@ int ftrace_text_reserved(const void *start, const void *end)
return (int)!!ret;
}
+/* Test if ops registered to this rec needs regs */
+static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ bool keep_regs = false;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(ops, rec->ip, rec)) {
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ keep_regs = true;
+ break;
+ }
+ }
+ }
+
+ return keep_regs;
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1519,14 +1638,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* gets inversed.
*/
if (filter_hash) {
- hash = ops->filter_hash;
- other_hash = ops->notrace_hash;
+ hash = ops->func_hash->filter_hash;
+ other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
all = 1;
} else {
inc = !inc;
- hash = ops->notrace_hash;
- other_hash = ops->filter_hash;
+ hash = ops->func_hash->notrace_hash;
+ other_hash = ops->func_hash->filter_hash;
/*
* If the notrace hash has no items,
* then there's nothing to do.
@@ -1572,8 +1691,25 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (inc) {
rec->flags++;
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return;
+
+ /*
+ * If there's only a single callback registered to a
+ * function, and the ops has a trampoline registered
+ * for it, then we can call it directly.
+ */
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ /*
+ * If we are adding another function callback
+ * to this function, and the previous had a
+ * custom trampoline in use, then we need to go
+ * back to the default trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
/*
* If any ops wants regs saved for this function
* then all ops will get saved regs.
@@ -1581,9 +1717,38 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
rec->flags |= FTRACE_FL_REGS;
} else {
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
return;
rec->flags--;
+
+ /*
+ * If the rec had REGS enabled and the ops that is
+ * being removed had REGS set, then see if there is
+ * still any ops for this record that wants regs.
+ * If not, we can stop recording them.
+ */
+ if (ftrace_rec_count(rec) > 0 &&
+ rec->flags & FTRACE_FL_REGS &&
+ ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ if (!test_rec_ops_needs_regs(rec))
+ rec->flags &= ~FTRACE_FL_REGS;
+ }
+
+ /*
+ * If the rec had TRAMP enabled, then it needs to
+	 * be cleared. TRAMP can only be enabled if there is
+	 * exactly one ops attached to the record.
+	 * In other words, always disable it on decrementing.
+ * In the future, we may set it if rec count is
+ * decremented to one, and the ops that is left
+ * has a trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /*
+ * flags will be cleared in ftrace_check_record()
+ * if rec count is zero.
+ */
}
count++;
/* Shortcut, if we handled all records, we are done. */
@@ -1604,6 +1769,149 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
+ int filter_hash, int inc)
+{
+ struct ftrace_ops *op;
+
+ __ftrace_hash_rec_update(ops, filter_hash, inc);
+
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ /*
+ * If the ops shares the global_ops hash, then we need to update
+ * all ops that are enabled and use this hash.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* Already done */
+ if (op == ops)
+ continue;
+ if (op->func_hash == &global_ops.local_hash)
+ __ftrace_hash_rec_update(op, filter_hash, inc);
+ } while_for_each_ftrace_op(op);
+}
+
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+}
+
+/*
+ * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
+ * or no update is needed, -EBUSY if it detects a conflict of the flag
+ * on an ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * Note that old_hash and new_hash have the following meanings:
+ * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
+ * - If the hash is EMPTY_HASH, it hits nothing
+ * - Anything else hits the recs which match the hash entries.
+ */
+static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec, *end = NULL;
+ int in_old, in_new;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ /*
+	 * Since IPMODIFY is a very address-sensitive action, we do not
+	 * allow an ftrace_ops to hook all functions at once.
+ */
+ if (!new_hash || !old_hash)
+ return -EINVAL;
+
+ /* Update rec->flags */
+ do_for_each_ftrace_rec(pg, rec) {
+ /* We need to update only differences of filter_hash */
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new) {
+ /* New entries must ensure no others are using it */
+ if (rec->flags & FTRACE_FL_IPMODIFY)
+ goto rollback;
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } else /* Removed entry */
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+ return 0;
+
+rollback:
+ end = rec;
+
+ /* Roll back what we did above */
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec == end)
+ goto err_out;
+
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new)
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ else
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+err_out:
+ return -EBUSY;
+}
+
+static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+}
+
+/* Disabling always succeeds */
+static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+}
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(old_hash))
+ old_hash = NULL;
+
+ if (ftrace_hash_empty(new_hash))
+ new_hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+}
+
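Restating the calling convention of the three wrappers above (a summary derived from the code, not patch text):

	/*
	 * enable:  __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, filter)
	 * disable: __ftrace_hash_update_ipmodify(ops, filter, EMPTY_HASH)
	 * update:  __ftrace_hash_update_ipmodify(ops, old_filter, new_filter)
	 *
	 * An empty filter is passed as NULL, and a NULL hash ("trace all
	 * functions") is rejected with -EINVAL for IPMODIFY ops. A record
	 * that already carries FTRACE_FL_IPMODIFY from another ops fails
	 * the whole operation with -EBUSY after rolling back any flags
	 * that were already set.
	 */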
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1614,10 +1922,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+
/**
* ftrace_bug - report and shutdown function tracer
* @failed: The failed type (EFAULT, EINVAL, EPERM)
- * @ip: The address that failed
+ * @rec: The record that failed
*
* The arch code that enables or disables the function tracing
* can call ftrace_bug() when it has detected a problem in
@@ -1626,8 +1937,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
* EINVAL - if what is read at @ip is not what was expected
 * EPERM - if the problem happens on writing to the @ip address
*/
-void ftrace_bug(int failed, unsigned long ip)
+void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
+ unsigned long ip = rec ? rec->ip : 0;
+
switch (failed) {
case -EFAULT:
FTRACE_WARN_ON_ONCE(1);
@@ -1639,7 +1952,7 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1651,6 +1964,24 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ if (rec) {
+ struct ftrace_ops *ops = NULL;
+
+ pr_info("ftrace record flags: %lx\n", rec->flags);
+ pr_cont(" (%ld)%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ pr_cont("\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ pr_cont("\ttramp: ERROR!");
+
+ }
+ ip = ftrace_get_addr_curr(rec);
+ pr_cont(" expected tramp: %lx\n", ip);
+ }
}
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -1668,17 +1999,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* If we are disabling calls, then disable all records that
* are enabled.
*/
- if (enable && (rec->flags & ~FTRACE_FL_MASK))
+ if (enable && ftrace_rec_count(rec))
flag = FTRACE_FL_ENABLED;
/*
- * If enabling and the REGS flag does not match the REGS_EN, then
- * do not ignore this record. Set flags to fail the compare against
- * ENABLED.
+ * If enabling and the REGS flag does not match the REGS_EN, or
+ * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
+ * this record. Set flags to fail the compare against ENABLED.
*/
- if (flag &&
- (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
- flag |= FTRACE_FL_REGS;
+ if (flag) {
+ if (!(rec->flags & FTRACE_FL_REGS) !=
+ !(rec->flags & FTRACE_FL_REGS_EN))
+ flag |= FTRACE_FL_REGS;
+
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ !(rec->flags & FTRACE_FL_TRAMP_EN))
+ flag |= FTRACE_FL_TRAMP;
+ }
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1696,6 +2033,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
else
rec->flags &= ~FTRACE_FL_REGS_EN;
}
+ if (flag & FTRACE_FL_TRAMP) {
+ if (rec->flags & FTRACE_FL_TRAMP)
+ rec->flags |= FTRACE_FL_TRAMP_EN;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP_EN;
+ }
}
/*
@@ -1704,7 +2047,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* Otherwise,
* return UPDATE_MODIFY_CALL to tell the caller to convert
* from the save regs, to a non-save regs function or
- * vice versa.
+ * vice versa, or from a trampoline call.
*/
if (flag & FTRACE_FL_ENABLED)
return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,11 +2057,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (update) {
/* If there's no more users, clear all flags */
- if (!(rec->flags & ~FTRACE_FL_MASK))
+ if (!ftrace_rec_count(rec))
rec->flags = 0;
else
- /* Just disable the record (keep REGS state) */
- rec->flags &= ~FTRACE_FL_ENABLED;
+ /*
+ * Just disable the record, but keep the ops TRAMP
+ * and REGS states. The _EN flags must be disabled though.
+ */
+ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
+ FTRACE_FL_REGS_EN);
}
return FTRACE_UPDATE_MAKE_NOP;
@@ -1751,6 +2098,109 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ /*
+ * Need to check removed ops first.
+ * If they are being removed, and this rec has a tramp,
+ * and this rec is in the ops list, then it would be the
+ * one with the tramp.
+ */
+ if (removed_ops) {
+ if (hash_contains_ip(ip, &removed_ops->old_hash))
+ return removed_ops;
+ }
+
+ /*
+ * Need to find the current trampoline for a rec.
+ * Now, a trampoline is only attached to a rec if there
+ * was a single 'ops' attached to it. But this can be called
+ * when we are adding another op to the rec or removing the
+ * current one. Thus, if the op is being added, we can
+ * ignore it because it hasn't attached itself to the rec
+ * yet.
+ *
+ * If an ops is being modified (hooking to different functions)
+ * then we don't care about the new functions that are being
+ * added, just the old ones (that are probably being removed).
+ *
+	 * If we are adding an ops to a function that already uses a
+	 * trampoline, that trampoline needs to be removed (trampolines
+	 * are only used when a single ops is attached), so an ops that
+	 * is not being modified also needs to be checked.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ /*
+ * If the ops is being added, it hasn't gotten to
+ * the point to be removed from this tree yet.
+ */
+ if (op->flags & FTRACE_OPS_FL_ADDING)
+ continue;
+
+ /*
+ * If the ops is being modified and is in the old
+ * hash, then it is probably being removed from this
+ * function.
+ */
+ if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, &op->old_hash))
+ return op;
+ /*
+ * If the ops is not being added or modified, and it's
+ * in its normal filter hash, then this must be the one
+ * we want!
+ */
+ if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, op->func_hash))
+ return op;
+
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
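An interpretive summary of the three lookup helpers (not patch text):

	/*
	 * ftrace_find_tramp_ops_any():  ignores ops state entirely; used
	 *     for diagnostics such as ftrace_bug() and the
	 *     enabled_functions file.
	 * ftrace_find_tramp_ops_curr(): finds the ops whose trampoline is
	 *     currently wired at the call site, consulting removed_ops
	 *     and old_hash for ops that are mid-removal or
	 *     mid-modification.
	 * ftrace_find_tramp_ops_new():  finds the ops whose trampoline
	 *     should be wired next, using only the normal func_hash.
	 */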
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -1763,6 +2213,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP) {
+ ops = ftrace_find_tramp_ops_new(rec);
+ if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
+ pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
+ (void *)rec->ip, (void *)rec->ip, rec->flags);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -1781,6 +2245,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (FTRACE_WARN_ON(!ops)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS_EN)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -1809,7 +2287,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
@@ -1830,7 +2308,7 @@ void __weak ftrace_replace_code(int enable)
do_for_each_ftrace_rec(pg, rec) {
failed = __ftrace_replace_code(rec, enable);
if (failed) {
- ftrace_bug(failed, rec->ip);
+ ftrace_bug(failed, rec);
/* Stop processing */
return;
}
@@ -1912,17 +2390,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
- unsigned long ip;
int ret;
- ip = rec->ip;
-
if (unlikely(ftrace_disabled))
return 0;
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
- ftrace_bug(ret, ip);
+ ftrace_bug(ret, rec);
return 0;
}
return 1;
@@ -2031,11 +2506,6 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine.
- */
- function_trace_stop++;
/*
* By default we use stop_machine() to modify the code.
@@ -2045,15 +2515,28 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- function_trace_stop--;
-
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
}
+static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
+ struct ftrace_ops_hash *old_hash)
+{
+ ops->flags |= FTRACE_OPS_FL_MODIFYING;
+ ops->old_hash.filter_hash = old_hash->filter_hash;
+ ops->old_hash.notrace_hash = old_hash->notrace_hash;
+ ftrace_run_update_code(command);
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
+}
+
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
-static int global_start_up;
+
+void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+}
static void control_ops_free(struct ftrace_ops *ops)
{
@@ -2073,6 +2556,13 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
+static void ftrace_startup_all(int command)
+{
+ update_all_ops = true;
+ ftrace_startup_enable(command);
+ update_all_ops = false;
+}
+
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
@@ -2087,12 +2577,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
- ops->flags |= FTRACE_OPS_FL_ENABLED;
+ /*
+	 * Note that ftrace probes use this to start up
+ * and modify functions it will probe. But we still
+ * set the ADDING flag for modification, as probes
+ * do not have trampolines. If they add them in the
+ * future, then the probes will need to distinguish
+ * between adding and updating probes.
+ */
+ ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
+
+ ret = ftrace_hash_ipmodify_enable(ops);
+ if (ret < 0) {
+ /* Rollback registration process */
+ __unregister_ftrace_function(ops);
+ ftrace_start_up--;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ return ret;
+ }
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
+ ops->flags &= ~FTRACE_OPS_FL_ADDING;
+
return 0;
}
@@ -2115,10 +2624,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
+ /* Disabling ipmodify never fails */
+ ftrace_hash_ipmodify_disable(ops);
ftrace_hash_rec_disable(ops, 1);
- if (!global_start_up)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -2139,9 +2649,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
return 0;
}
+ /*
+ * If the ops uses a trampoline, then it needs to be
+ * tested first on update.
+ */
+ ops->flags |= FTRACE_OPS_FL_REMOVING;
+ removed_ops = ops;
+
+ /* The trampoline logic checks the old hashes */
+ ops->old_hash.filter_hash = ops->func_hash->filter_hash;
+ ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
+
ftrace_run_update_code(command);
/*
+ * If there's no more ops registered with ftrace, run a
+ * sanity check to make sure all rec flags are cleared.
+ */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (FTRACE_WARN_ON_ONCE(rec->flags))
+ pr_warn(" %pS flags:%lx\n",
+ (void *)rec->ip, rec->flags);
+ } while_for_each_ftrace_rec();
+ }
+
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+
+ removed_ops = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+
+ /*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
* The same goes for freeing the per_cpu data of the control
@@ -2158,6 +2700,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
schedule_on_each_cpu(ftrace_sync);
+ arch_ftrace_trampoline_free(ops);
+
if (ops->flags & FTRACE_OPS_FL_CONTROL)
control_ops_free(ops);
}
@@ -2167,24 +2711,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
static void ftrace_startup_sysctl(void)
{
+ int command;
+
if (unlikely(ftrace_disabled))
return;
/* Force update next time */
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
- if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
}
static void ftrace_shutdown_sysctl(void)
{
+ int command;
+
if (unlikely(ftrace_disabled))
return;
/* ftrace_start_up is true if ftrace is running */
- if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_DISABLE_CALLS);
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
}
static cycle_t ftrace_update_time;
@@ -2196,8 +2752,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
* Filter_hash being empty will default to trace module.
* But notrace hash requires a test of individual module functions.
*/
- return ftrace_hash_empty(ops->filter_hash) &&
- ftrace_hash_empty(ops->notrace_hash);
+ return ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash);
}
/*
@@ -2219,12 +2775,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 0;
/* The function must be in the filter */
- if (!ftrace_hash_empty(ops->filter_hash) &&
- !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
return 0;
return 1;
@@ -2310,7 +2866,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
- ftrace_bug(failed, p->ip);
+ ftrace_bug(failed, p);
}
}
}
@@ -2398,7 +2954,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- while (start_pg) {
+ pg = start_pg;
+ while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
@@ -2544,10 +3101,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
} else {
rec = &iter->pg->records[iter->idx++];
if (((iter->flags & FTRACE_ITER_FILTER) &&
- !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
+ !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
- !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
+ !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
!(rec->flags & FTRACE_FL_ENABLED))) {
@@ -2595,8 +3152,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->filter_hash)) {
+ if ((iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->func_hash->filter_hash)) ||
+ (iter->flags & FTRACE_ITER_NOTRACE &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash))) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2632,6 +3191,22 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&ftrace_lock);
}
+void * __weak
+arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return NULL;
+}
+
+static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ void *ptr;
+
+ ptr = arch_ftrace_trampoline_func(ops, rec);
+ if (ptr)
+ seq_printf(m, " ->%pS", ptr);
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -2641,7 +3216,10 @@ static int t_show(struct seq_file *m, void *v)
return t_hash_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
- seq_printf(m, "#### all functions enabled ####\n");
+ if (iter->flags & FTRACE_ITER_NOTRACE)
+ seq_puts(m, "#### no functions disabled ####\n");
+ else
+ seq_puts(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2651,11 +3229,26 @@ static int t_show(struct seq_file *m, void *v)
return 0;
seq_printf(m, "%ps", (void *)rec->ip);
- if (iter->flags & FTRACE_ITER_ENABLED)
- seq_printf(m, " (%ld)%s",
- rec->flags & ~FTRACE_FL_MASK,
- rec->flags & FTRACE_FL_REGS ? " R" : "");
- seq_printf(m, "\n");
+ if (iter->flags & FTRACE_ITER_ENABLED) {
+ struct ftrace_ops *ops = NULL;
+
+ seq_printf(m, " (%ld)%s%s",
+ ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ seq_printf(m, "\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ seq_puts(m, "\ttramp: ERROR!");
+
+ }
+ add_trampoline_func(m, ops, rec);
+ }
+
+ seq_putc(m, '\n');
return 0;
}
@@ -2689,9 +3282,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
if (iter) {
iter->pg = ftrace_pages_start;
@@ -2702,13 +3292,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return iter ? 0 : -ENOMEM;
}
-static void ftrace_filter_reset(struct ftrace_hash *hash)
-{
- mutex_lock(&ftrace_lock);
- ftrace_hash_clear(hash);
- mutex_unlock(&ftrace_lock);
-}
-
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -2750,15 +3333,21 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (flag & FTRACE_ITER_NOTRACE)
- hash = ops->notrace_hash;
+ hash = ops->func_hash->notrace_hash;
else
- hash = ops->filter_hash;
+ hash = ops->func_hash->filter_hash;
if (file->f_mode & FMODE_WRITE) {
- iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (file->f_flags & O_TRUNC)
+ iter->hash = alloc_ftrace_hash(size_bits);
+ else
+ iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
@@ -2767,10 +3356,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- ftrace_filter_reset(iter->hash);
-
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -2788,7 +3373,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
file->private_data = iter;
out_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
return ret;
}
@@ -3026,12 +3611,12 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
.flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(trace_probe_ops)
+ INIT_OPS_HASH(trace_probe_ops)
};
static int ftrace_probe_registered;
-static void __enable_ftrace_function_probe(void)
+static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
{
int ret;
int i;
@@ -3039,7 +3624,8 @@ static void __enable_ftrace_function_probe(void)
if (ftrace_probe_registered) {
/* still need to update the function call sites */
if (ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ old_hash);
return;
}
@@ -3088,8 +3674,10 @@ int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_probe *entry;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3106,9 +3694,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
if (WARN_ON(not))
return -EINVAL;
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
if (!hash) {
count = -ENOMEM;
goto out;
@@ -3167,15 +3759,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
} while_for_each_ftrace_rec();
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
- if (ret < 0)
- count = ret;
- __enable_ftrace_function_probe();
+ __enable_ftrace_function_probe(&old_hash_ops);
+
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+ else
+ count = ret;
out_unlock:
mutex_unlock(&ftrace_lock);
out:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
return count;
@@ -3193,7 +3788,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
@@ -3201,6 +3797,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int type = MATCH_FULL;
int i, len = 0;
char *search;
+ int ret;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
glob = NULL;
@@ -3215,7 +3812,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return;
}
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
@@ -3259,8 +3856,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
- ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
synchronize_sched();
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+
list_for_each_entry_safe(entry, p, &free_list, free_list) {
list_del(&entry->free_list);
ftrace_free_entry(entry);
@@ -3268,7 +3868,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_unlock(&ftrace_lock);
out_unlock:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
}
@@ -3447,10 +4047,35 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
-static void ftrace_ops_update_code(struct ftrace_ops *ops)
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash)
{
- if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ struct ftrace_ops *op;
+
+ if (!ftrace_enabled)
+ return;
+
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
+ return;
+ }
+
+ /*
+ * If this is the shared global_ops filter, then we need to
+	 * check whether another ops that shares it is enabled.
+ * If so, we still need to run the modify code.
+ */
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->func_hash == &global_ops.local_hash &&
+ op->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
+ /* Only need to do this once */
+ return;
+ }
+ } while_for_each_ftrace_op(op);
}
static int
@@ -3458,27 +4083,31 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (enable)
- orig_hash = &ops->filter_hash;
+ orig_hash = &ops->func_hash->filter_hash;
else
- orig_hash = &ops->notrace_hash;
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ if (reset)
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ else
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
ret = -ENOMEM;
goto out_regex_unlock;
}
- if (reset)
- ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
ret = -EINVAL;
goto out_regex_unlock;
@@ -3490,14 +4119,18 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret)
- ftrace_ops_update_code(ops);
-
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
out_regex_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
free_ftrace_hash(hash);
return ret;
@@ -3630,8 +4263,12 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static unsigned long save_global_trampoline;
+static unsigned long save_global_flags;
+
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3639,16 +4276,29 @@ static int __init set_graph_function(char *str)
}
__setup("ftrace_graph_filter=", set_graph_function);
-static void __init set_ftrace_early_graph(char *buf)
+static int __init set_graph_notrace_function(char *str)
+{
+ strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+
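With this hook in place, a kernel booted with, for example,

	ftrace_graph_notrace=kfree,kmem_cache_*

should exclude the matching functions from the function-graph tracer from early boot on (the function names above are purely illustrative; glob patterns are resolved by ftrace_set_func()).
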
+static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
+ unsigned long *table = ftrace_graph_funcs;
+ int *count = &ftrace_graph_count;
+
+ if (!enable) {
+ table = ftrace_graph_notrace_funcs;
+ count = &ftrace_graph_notrace_count;
+ }
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3677,15 +4327,19 @@ static void __init set_ftrace_early_filters(void)
ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
- set_ftrace_early_graph(ftrace_graph_buf);
+ set_ftrace_early_graph(ftrace_graph_buf, 1);
+ if (ftrace_graph_notrace_buf[0])
+ set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct trace_parser *parser;
int filter_hash;
int ret;
@@ -3704,26 +4358,30 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
trace_parser_put(parser);
- mutex_lock(&iter->ops->regex_lock);
+ mutex_lock(&iter->ops->func_hash->regex_lock);
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
if (filter_hash)
- orig_hash = &iter->ops->filter_hash;
+ orig_hash = &iter->ops->func_hash->filter_hash;
else
- orig_hash = &iter->ops->notrace_hash;
+ orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
- if (!ret)
- ftrace_ops_update_code(iter->ops);
-
+ if (!ret) {
+ ftrace_ops_update_code(iter->ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
}
- mutex_unlock(&iter->ops->regex_lock);
+ mutex_unlock(&iter->ops->func_hash->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);
@@ -3819,7 +4477,12 @@ static int g_show(struct seq_file *m, void *v)
return 0;
if (ptr == (unsigned long *)1) {
- seq_printf(m, "#### all functions enabled ####\n");
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (fgd->table == ftrace_graph_funcs)
+ seq_puts(m, "#### all functions enabled ####\n");
+ else
+ seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4062,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
mutex_unlock(&ftrace_lock);
}
-static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
+static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
trace_create_file("available_filter_functions", 0444,
@@ -4330,12 +4993,37 @@ void __init ftrace_init(void)
ftrace_disabled = 1;
}
+/* Do nothing if arch does not support this */
+void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+
+/*
+ * Currently there's no safe way to free a trampoline when the kernel
+ * is configured with PREEMPT. A task could be preempted while executing
+ * on the trampoline, remain preempted for a long time depending on
+ * system load, and there is currently no way to know when it has left
+ * the trampoline. If the trampoline is freed
+ * too early, when the task runs again, it will be executing on freed
+ * memory and crash.
+ */
+#ifdef CONFIG_PREEMPT
+ /* Currently, only non dynamic ops can have a trampoline */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ return;
+#endif
+
+ arch_ftrace_update_trampoline(ops);
+}
+
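The failure mode described above, as a timeline (illustrative):

	/*
	 * CPU0: task jumps to a dynamically allocated trampoline
	 * CPU0: task is preempted while still executing on the trampoline
	 * CPU1: the ops is unregistered and its trampoline is freed
	 * CPU0: task is scheduled back in, resumes on freed memory -> crash
	 */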
#else
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
};
static int __init ftrace_nodyn_init(void)
@@ -4345,8 +5033,9 @@ static int __init ftrace_nodyn_init(void)
}
core_initcall(ftrace_nodyn_init);
-static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
+static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
# define ftrace_startup(ops, command) \
({ \
@@ -4372,6 +5061,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 1;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -4437,7 +5130,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops control_ops = {
.func = ftrace_ops_control_func,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(control_ops)
+ INIT_OPS_HASH(control_ops)
};
static inline void
@@ -4447,9 +5140,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- if (function_trace_stop)
- return;
-
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
@@ -4461,9 +5151,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs)) {
- if (WARN_ON(!op->func)) {
- function_trace_stop = 1;
- printk("op=%p %pS\n", op, op);
+ if (FTRACE_WARN_ON(!op->func)) {
+ pr_warn("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
@@ -4500,6 +5189,49 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
}
#endif
+/*
+ * If there's only one function registered but it does not support
+ * recursion, this function will be called by the mcount trampoline.
+ * This function will handle recursion protection.
+ */
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
+ return;
+
+ op->func(ip, parent_ip, op, regs);
+
+ trace_clear_recursion(bit);
+}
+
+/**
+ * ftrace_ops_get_func - get the function a trampoline should call
+ * @ops: the ops to get the function for
+ *
+ * Normally the mcount trampoline will call the ops->func, but there
+ * are times that it should not. For example, if the ops does not
+ * have its own recursion protection, then it should call the
+ * ftrace_ops_recurs_func() instead.
+ *
+ * Returns the function that the trampoline should call for @ops.
+ */
+ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
+{
+ /*
+ * If the func handles its own recursion, call it directly.
+ * Otherwise call the recursion protected function that
+ * will call the ftrace ops function.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
+ return ftrace_ops_recurs_func;
+
+ return ops->func;
+}
+
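A sketch of the intended call pattern (illustrative; ftrace_ops_get_list_func() earlier in this patch is the real caller):

	/*
	 *	ftrace_func_t func = ftrace_ops_get_func(ops);
	 *
	 * func is ops->func when the ops sets FTRACE_OPS_FL_RECURSION_SAFE;
	 * otherwise it is ftrace_ops_recurs_func(), which brackets the call
	 * with trace_test_and_set_recursion()/trace_clear_recursion().
	 */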
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -4600,7 +5332,8 @@ static int ftrace_pid_add(int p)
set_ftrace_pid_task(pid);
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
return 0;
@@ -4629,7 +5362,7 @@ static void ftrace_pid_reset(void)
}
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
}
@@ -4662,12 +5395,12 @@ static int fpid_show(struct seq_file *m, void *v)
const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
if (v == (void *)1) {
- seq_printf(m, "no pid\n");
+ seq_puts(m, "no pid\n");
return 0;
}
if (fpid->pid == ftrace_swapper_pid)
- seq_printf(m, "swapper tasks\n");
+ seq_puts(m, "swapper tasks\n");
else
seq_printf(m, "%u\n", pid_vnr(fpid->pid));
@@ -4746,24 +5479,24 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_tracefs(void)
{
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- ftrace_init_dyn_debugfs(d_tracer);
+ ftrace_init_dyn_tracefs(d_tracer);
trace_create_file("set_ftrace_pid", 0644, d_tracer,
NULL, &ftrace_pid_fops);
- ftrace_profile_debugfs(d_tracer);
+ ftrace_profile_tracefs(d_tracer);
return 0;
}
-fs_initcall(ftrace_init_debugfs);
+fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
@@ -4853,12 +5586,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ftrace_enabled) {
- ftrace_startup_sysctl();
-
/* we are starting ftrace again */
if (ftrace_ops_list != &ftrace_list_end)
update_ftrace_function();
+ ftrace_startup_sysctl();
+
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
ftrace_trace_function = ftrace_stub;
@@ -4873,7 +5606,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int ftrace_graph_active;
+static struct ftrace_ops graph_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+ .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
+#endif
+ ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -5035,12 +5778,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
*/
static void update_function_graph_func(void)
{
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list == &global_ops &&
- global_ops.next == &ftrace_list_end))
- ftrace_graph_entry = __ftrace_graph_entry;
- else
+ struct ftrace_ops *op;
+ bool do_test = false;
+
+ /*
+ * The graph and global ops share the same set of functions
+ * to test. If any other ops is on the list, then
+	 * the graph tracing needs to test if it's the function
+ * it should call.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op != &global_ops && op != &graph_ops &&
+ op != &ftrace_list_end) {
+ do_test = true;
+ /* in double loop, break out with goto */
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+ out:
+ if (do_test)
ftrace_graph_entry = ftrace_graph_entry_test;
+ else
+ ftrace_graph_entry = __ftrace_graph_entry;
}
static struct notifier_block ftrace_suspend_notifier = {
@@ -5081,11 +5840,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
- /* Function graph doesn't use the .func field of global_ops */
- global_ops.flags |= FTRACE_OPS_FL_STUB;
-
- ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
-
+ ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
return ret;
@@ -5102,11 +5857,21 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
- global_ops.flags &= ~FTRACE_OPS_FL_STUB;
+ ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * Function graph does not allocate the trampoline, but
+ * other global_ops do. We need to reset the ALLOC_TRAMP flag
+ * if one was used.
+ */
+ global_ops.trampoline = save_global_trampoline;
+ if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
+ global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+#endif
+
out:
mutex_unlock(&ftrace_lock);
}
@@ -5183,9 +5948,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
-
-void ftrace_graph_stop(void)
-{
- ftrace_stop();
-}
#endif
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ff7027199a9a..0315d43176d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,7 +9,6 @@
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
@@ -23,7 +22,6 @@
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>
-#include <linux/fs.h>
#include <asm/local.h>
@@ -34,21 +32,19 @@ static void update_pages_handler(struct work_struct *work);
*/
int ring_buffer_print_entry_header(struct trace_seq *s)
{
- int ret;
-
- ret = trace_seq_puts(s, "# compressed entry header\n");
- ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
- ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
- ret = trace_seq_puts(s, "\tarray : 32 bits\n");
- ret = trace_seq_putc(s, '\n');
- ret = trace_seq_printf(s, "\tpadding : type == %d\n",
- RINGBUF_TYPE_PADDING);
- ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
- RINGBUF_TYPE_TIME_EXTEND);
- ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
- RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ trace_seq_puts(s, "# compressed entry header\n");
+ trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ trace_seq_puts(s, "\tarray : 32 bits\n");
+ trace_seq_putc(s, '\n');
+ trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
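This is the first of many hunks converting to the deferred-overflow style: stop checking each trace_seq call and test trace_seq_has_overflowed() once at the end. A minimal userspace sketch of the pattern (simplified fixed-size buffer, not the kernel's trace_seq):

	#include <stdio.h>
	#include <string.h>

	struct seq { char buf[64]; size_t len; int overflow; };

	static void seq_puts(struct seq *s, const char *str)
	{
		size_t n = strlen(str);

		if (s->len + n > sizeof(s->buf)) {
			s->overflow = 1;	/* latch the error, drop the write */
			return;
		}
		memcpy(s->buf + s->len, str, n);
		s->len += n;
	}

	int main(void)
	{
		struct seq s = { .len = 0, .overflow = 0 };

		seq_puts(&s, "# compressed entry header\n");	/* no per-call checks */
		seq_puts(&s, "\ttype_len    :  5 bits\n");
		seq_puts(&s, "\ttime_delta  : 27 bits\n");

		return s.overflow ? 1 : 0;	/* one check at the end */
	}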
/*
@@ -419,38 +415,40 @@ static inline int test_time_stamp(u64 delta)
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
- int ret;
- ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
- return ret;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
}
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
+ wait_queue_head_t full_waiters;
bool waiters_pending;
+ bool full_waiters_pending;
+ bool wakeup_full;
};
/*
@@ -532,31 +530,39 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
+ if (rbwork->wakeup_full) {
+ rbwork->wakeup_full = false;
+ wake_up_all(&rbwork->full_waiters);
+ }
}
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ int ret = 0;
/*
* Depending on what the caller is waiting for, either any
* data in any cpu buffer, or a specific buffer, put the
* caller on the appropriate wait queue.
*/
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ /* Full only makes sense on per cpu reads */
+ full = false;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
@@ -564,36 +570,70 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
}
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ while (true) {
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- work->waiters_pending = true;
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worst
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+ break;
+
+ if (cpu != RING_BUFFER_ALL_CPUS &&
+ !ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ break;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (!pagebusy)
+ break;
+ }
- if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
schedule();
+ }
- finish_wait(&work->waiters, &wait);
- return 0;
+ if (full)
+ finish_wait(&work->full_waiters, &wait);
+ else
+ finish_wait(&work->waiters, &wait);
+
+ return ret;
}
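Underneath the two wait queues, this is the standard prepare-to-wait loop. Its skeleton, with the ring-buffer specifics replaced by a generic waitqueue wq and a condition() predicate (a sketch, not code from this patch):

	DEFINE_WAIT(wait);
	int ret = 0;

	for (;;) {
		prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (condition())	/* re-check after queueing ourselves */
			break;

		schedule();		/* sleep; woken by the wake_up side */
	}
	finish_wait(&wq, &wait);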
/**
@@ -626,8 +666,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- work->waiters_pending = true;
poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ /*
+ * There's a tight race between setting the waiters_pending and
+ * checking if the ring buffer is empty. Once the waiters_pending bit
+ * is set, the next event will wake the task up, but we can get stuck
+ * if there's only a single event in the buffer.
+ *
+ * FIXME: Ideally, we need a memory barrier on the writer side as well,
+ * but adding a memory barrier to all events will cause too much of a
+ * performance hit in the fast path. We only need a memory barrier when
+ * the buffer goes from empty to having content. But as this race is
+ * extremely small, and it's not a problem if another event comes in, we
+ * will fix it later.
+ */
+ smp_mb();
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
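The FIXME above is about pairing this smp_mb() with one on the writer side. Spelled out as an illustration (not code from this patch):

	/*
	 *   reader (poll)                     writer (commit)
	 *   work->waiters_pending = true;     <event is inserted>
	 *   smp_mb();                         smp_mb();  <-- the missing one
	 *   re-check: buffer empty?           check: waiters_pending?
	 *
	 * With both barriers, either the reader sees the new event or the
	 * writer sees waiters_pending and queues the irq_work wakeup. The
	 * writer-side barrier is the one the comment defers, leaving the
	 * tiny race it describes.
	 */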
@@ -1192,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
init_completion(&cpu_buffer->update_done);
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1689,22 +1744,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
- /* The update must run on the CPU that is being updated. */
- preempt_disable();
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- preempt_disable();
}
- preempt_enable();
}
/* wait for all the updates to complete */
@@ -1742,22 +1789,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- preempt_disable();
- /* The update must run on the CPU that is being updated. */
- if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- preempt_disable();
}
- preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
@@ -1984,7 +2023,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
/**
* rb_update_event - update event type and data
- * @event: the even to update
+ * @event: the event to update
* @type: the type of event
* @length: the size of the event field in the ring buffer
*
@@ -2640,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context);
static __always_inline int trace_recursive_lock(void)
{
- unsigned int val = this_cpu_read(current_context);
+ unsigned int val = __this_cpu_read(current_context);
int bit;
if (in_interrupt()) {
@@ -2657,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void)
return 1;
val |= (1 << bit);
- this_cpu_write(current_context, val);
+ __this_cpu_write(current_context, val);
return 0;
}
static __always_inline void trace_recursive_unlock(void)
{
- unsigned int val = this_cpu_read(current_context);
-
- val--;
- val &= this_cpu_read(current_context);
- this_cpu_write(current_context, val);
+ __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
}
#else
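The replacement unlock relies on the v & (v - 1) trick, which clears the lowest set bit; since deeper trace contexts take lower bit numbers, that is the most recently acquired context. A userspace check of the arithmetic (the particular bit positions are illustrative):

	#include <assert.h>

	int main(void)
	{
		unsigned int v = 0;

		v |= 1u << 3;	/* outer context takes a high bit   */
		v |= 1u << 1;	/* nested context takes a lower bit */
		assert(v == 0xau);

		v &= v - 1;	/* unlock: clears bit 1, the nested context */
		assert(v == 0x8u);

		v &= v - 1;	/* unlock: clears bit 3 */
		assert(v == 0u);
		return 0;
	}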
@@ -2779,6 +2814,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
+ bool pagebusy;
+
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -2790,6 +2827,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
/* irq_work_queue() supplies it's own memory barriers */
irq_work_queue(&cpu_buffer->irq_work.work);
}
+
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
}
/**
@@ -3357,21 +3403,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/* Iterator usage is expected to have record disabled */
- if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = rb_set_head_page(cpu_buffer);
- if (unlikely(!iter->head_page))
- return;
- iter->head = iter->head_page->read;
- } else {
- iter->head_page = cpu_buffer->reader_page;
- iter->head = cpu_buffer->reader_page->read;
- }
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+
+ iter->cache_reader_page = iter->head_page;
+ iter->cache_read = cpu_buffer->read;
+
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
- iter->cache_reader_page = cpu_buffer->reader_page;
- iter->cache_read = cpu_buffer->read;
}
/**
@@ -3764,18 +3805,20 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a time extend is encountered.
- * Since the time extend is always attached to a data event,
- * we should never loop more than once.
- * (We never hit the following condition more than twice).
+ * We repeat when a time extend is encountered or we hit
+ * the end of the page. Since the time extend is always attached
+ * to a data event, we should never loop more than three times.
+ * Once for going to next page, once on time extend, and
+ * finally once to get the event.
+ * (We never hit the following condition more than thrice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
- if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ if (iter->head >= rb_page_size(iter->head_page)) {
rb_inc_iter(iter);
goto again;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..13d945c0d03f 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -7,7 +7,7 @@
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
#include <asm/local.h>
struct rb_page {
@@ -17,7 +17,7 @@ struct rb_page {
};
/* run time and sleep time in seconds */
-#define RUN_TIME 10
+#define RUN_TIME 10ULL
#define SLEEP_TIME 10
/* number of events for writer to wake up the reader */
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
break;
schedule();
- __set_current_state(TASK_RUNNING);
}
reader_finish = 0;
complete(&read_done);
@@ -213,8 +212,7 @@ static void ring_buffer_consumer(void)
static void ring_buffer_producer(void)
{
- struct timeval start_tv;
- struct timeval end_tv;
+ ktime_t start_time, end_time, timeout;
unsigned long long time;
unsigned long long entries;
unsigned long long overruns;
@@ -228,7 +226,8 @@ static void ring_buffer_producer(void)
* make the system stall)
*/
trace_printk("Starting ring buffer hammer\n");
- do_gettimeofday(&start_tv);
+ start_time = ktime_get();
+ timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
do {
struct ring_buffer_event *event;
int *entry;
@@ -245,7 +244,7 @@ static void ring_buffer_producer(void)
ring_buffer_unlock_commit(buffer, event);
}
}
- do_gettimeofday(&end_tv);
+ end_time = ktime_get();
cnt++;
if (consumer && !(cnt % wakeup_interval))
@@ -265,7 +264,7 @@ static void ring_buffer_producer(void)
cond_resched();
#endif
- } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+ } while (ktime_before(end_time, timeout) && !kill_test);
trace_printk("End ring buffer hammer\n");
if (consumer) {
@@ -281,9 +280,7 @@ static void ring_buffer_producer(void)
wait_for_completion(&read_done);
}
- time = end_tv.tv_sec - start_tv.tv_sec;
- time *= USEC_PER_SEC;
- time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+ time = ktime_us_delta(end_time, start_time);
entries = ring_buffer_entries(buffer);
overruns = ring_buffer_overruns(buffer);
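The ktime_t helpers used above are all part of the monotonic-clock API, replacing the hand-rolled second/microsecond math on struct timeval. Condensed to the pattern (a sketch in module context; the ten seconds mirror RUN_TIME):

	ktime_t start, now, deadline;

	start = ktime_get();			/* monotonic clock */
	deadline = ktime_add_ns(start, 10ULL * NSEC_PER_SEC);

	do {
		/* ... hammer the ring buffer ... */
		now = ktime_get();
	} while (ktime_before(now, deadline));

	pr_info("elapsed: %lld us\n", ktime_us_delta(now, start));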
@@ -379,7 +376,6 @@ static int ring_buffer_consumer_thread(void *arg)
break;
schedule();
- __set_current_state(TASK_RUNNING);
}
__set_current_state(TASK_RUNNING);
@@ -407,7 +403,6 @@ static int ring_buffer_producer_thread(void *arg)
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ * SLEEP_TIME);
- __set_current_state(TASK_RUNNING);
}
if (kill_test)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 291397e66669..91eecaaa43e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
@@ -31,6 +32,7 @@
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
+#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@@ -63,6 +65,10 @@ static bool __read_mostly tracing_selftest_running;
*/
bool __read_mostly tracing_selftest_disabled;
+/* Pipe tracepoints to printk */
+struct trace_iterator *tracepoint_print_iter;
+int tracepoint_printk;
+
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
{ }
@@ -119,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+/* Map of enums to their values, for "enum_map" file */
+struct trace_enum_map_head {
+ struct module *mod;
+ unsigned long length;
+};
+
+union trace_enum_map_item;
+
+struct trace_enum_map_tail {
+ /*
+ * "end" is first and points to NULL as it must be different
+ * than "mod" or "enum_string"
+ */
+ union trace_enum_map_item *next;
+ const char *end; /* points to NULL */
+};
+
+static DEFINE_MUTEX(trace_enum_mutex);
+
+/*
+ * The trace_enum_maps are saved in an array with two extra elements,
+ * one at the beginning, and one at the end. The beginning item contains
+ * the count of the saved maps (head.length), and the module they
+ * belong to if not built in (head.mod). The ending item contains a
+ * pointer to the next array of saved enum_map items.
+ */
+union trace_enum_map_item {
+ struct trace_enum_map map;
+ struct trace_enum_map_head head;
+ struct trace_enum_map_tail tail;
+};
+
+static union trace_enum_map_item *trace_enum_maps;
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
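The head/tail bookkeeping described in the comment block is easiest to see laid out flat: index 0 is the head, indexes 1..N are saved maps, index N+1 is the tail. A userspace miniature of the layout (simplified types; the names and values are made up):

	#include <stdio.h>
	#include <stdlib.h>

	union item {
		struct { const char *name; long value; } map;
		struct { unsigned long length; } head;
		struct { union item *next; const char *end; } tail;
	};

	static union item *jmp_to_tail(union item *ptr)
	{
		return ptr + ptr->head.length + 1;	/* skip head + N maps */
	}

	int main(void)
	{
		union item *arr = calloc(2 + 2, sizeof(*arr));

		arr[0].head.length = 2;			/* head: N = 2 */
		arr[1].map.name = "EXAMPLE_A";		/* the saved maps */
		arr[1].map.value = 29;
		arr[2].map.name = "EXAMPLE_B";
		arr[2].map.value = 30;
		/* arr[3] stays zeroed: a tail with .next == NULL ends the chain */

		printf("tail at index %td\n", jmp_to_tail(arr) - arr);
		free(arr);
		return 0;
	}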
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
#define MAX_TRACER_SIZE 100
@@ -155,10 +197,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
static int __init stop_trace_on_warning(char *str)
{
- __disable_trace_on_warning = 1;
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ __disable_trace_on_warning = 1;
return 1;
}
-__setup("traceoff_on_warning=", stop_trace_on_warning);
+__setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
@@ -192,6 +235,13 @@ static int __init set_trace_boot_clock(char *str)
}
__setup("trace_clock=", set_trace_boot_clock);
+static int __init set_tracepoint_printk(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ tracepoint_printk = 1;
+ return 1;
+}
+__setup("tp_printk", set_tracepoint_printk);
unsigned long long ns2usecs(cycle_t nsec)
{
@@ -820,11 +870,12 @@ static struct {
const char *name;
int in_ns; /* is this clock in nanoseconds? */
} trace_clocks[] = {
- { trace_clock_local, "local", 1 },
- { trace_clock_global, "global", 1 },
- { trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 0 },
- { trace_clock, "perf", 1 },
+ { trace_clock_local, "local", 1 },
+ { trace_clock_global, "global", 1 },
+ { trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 0 },
+ { trace_clock, "perf", 1 },
+ { ktime_get_mono_fast_ns, "mono", 1 },
ARCH_TRACE_CLOCKS
};
@@ -937,43 +988,20 @@ out:
return ret;
}
-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
-{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
-}
-
+/* TODO add a seq_buf_to_buffer() */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (s->len <= s->readpos)
+ if (trace_seq_used(s) <= s->seq.readpos)
return -EBUSY;
- len = s->len - s->readpos;
+ len = trace_seq_used(s) - s->seq.readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->readpos, cnt);
+ memcpy(buf, s->buffer + s->seq.readpos, cnt);
- s->readpos += cnt;
+ s->seq.readpos += cnt;
return cnt;
}
@@ -1099,13 +1127,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static int wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ full);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2045,13 +2074,14 @@ void trace_printk_init_buffers(void)
/* trace_printk() is for debug use only. Don't use it in production. */
- pr_warning("\n**********************************************************\n");
+ pr_warning("\n");
+ pr_warning("**********************************************************\n");
pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warning("** **\n");
pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
pr_warning("** **\n");
pr_warning("** This means that this is a DEBUG kernel and it is **\n");
- pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** unsafe for production use. **\n");
pr_warning("** **\n");
pr_warning("** If you see this message and you are not debugging **\n");
pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2180,9 +2210,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
goto out;
}
- len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- if (len > TRACE_BUF_SIZE)
- goto out;
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
@@ -2193,8 +2221,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, tbuffer, len);
- entry->buf[len] = '\0';
+ memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2531,14 +2558,14 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2555,20 +2582,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | | |\n");
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
+ "# | | | | |\n");
}
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n");
- seq_puts(m, "# / _----=> need-resched\n");
- seq_puts(m, "# | / _---=> hardirq/softirq\n");
- seq_puts(m, "# || / _--=> preempt-depth\n");
- seq_puts(m, "# ||| / delay\n");
- seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | |||| | |\n");
+ seq_puts(m, "# _-----=> irqs-off\n"
+ "# / _----=> need-resched\n"
+ "# | / _---=> hardirq/softirq\n"
+ "# || / _--=> preempt-depth\n"
+ "# ||| / delay\n"
+ "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
+ "# | | | |||| | |\n");
}
void
@@ -2671,24 +2698,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- if (!trace_print_lat_context(iter))
- goto partial;
- } else {
- if (!trace_print_context(iter))
- goto partial;
- }
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ trace_print_lat_context(iter);
+ else
+ trace_print_context(iter);
}
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
if (event)
return event->funcs->trace(iter, sym_flags, event);
- if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2699,22 +2723,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (!trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, iter->ts))
- goto partial;
- }
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
event = ftrace_find_event(entry->type);
if (event)
return event->funcs->raw(iter, 0, event);
- if (!trace_seq_printf(s, "%d ?\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "%d ?\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2727,9 +2749,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+ SEQ_PUT_HEX_FIELD(s, entry->pid);
+ SEQ_PUT_HEX_FIELD(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2739,9 +2763,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
return ret;
}
- SEQ_PUT_FIELD_RET(s, newline);
+ SEQ_PUT_FIELD(s, newline);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2753,9 +2777,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
- SEQ_PUT_FIELD_RET(s, iter->ts);
+ SEQ_PUT_FIELD(s, entry->pid);
+ SEQ_PUT_FIELD(s, iter->cpu);
+ SEQ_PUT_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2801,10 +2827,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
- if (iter->lost_events &&
- !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events))
- return TRACE_TYPE_PARTIAL_LINE;
+ if (iter->lost_events) {
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
@@ -2882,44 +2910,44 @@ static void test_ftrace_alive(struct seq_file *m)
{
if (!ftrace_is_dead())
return;
- seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
}
#ifdef CONFIG_TRACER_MAX_TRACE
static void show_snapshot_main_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer.\n");
- seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void show_snapshot_percpu_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
#else
- seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
- seq_printf(m, "# Must use main snapshot file to allocate.\n");
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
#endif
- seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
if (iter->tr->allocated_snapshot)
- seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
else
- seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
- seq_printf(m, "# Snapshot commands:\n");
+ seq_puts(m, "# Snapshot commands:\n");
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
show_snapshot_main_help(m);
else
@@ -3273,7 +3301,7 @@ static int t_show(struct seq_file *m, void *v)
if (!t)
return 0;
- seq_printf(m, "%s", t->name);
+ seq_puts(m, t->name);
if (t->next)
seq_putc(m, ' ');
else
@@ -3363,12 +3391,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
- len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
- if (count - len < 2) {
+ len = snprintf(mask_str, count, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask));
+ if (len >= count) {
count = -EINVAL;
goto out_err;
}
- len += sprintf(mask_str + len, "\n");
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
out_err:
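'%*pb' is the vsprintf extension for bitmaps; cpumask_pr_args() expands to the width/pointer argument pair it consumes, so truncation shows up directly in the snprintf() return value. The call shape in isolation (assuming a struct cpumask *mask):

	char buf[64];
	int len;

	len = snprintf(buf, sizeof(buf), "%*pb\n", cpumask_pr_args(mask));
	if (len >= (int)sizeof(buf))	/* output was truncated */
		return -EINVAL;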
@@ -3699,6 +3727,7 @@ static const char readme_msg[] =
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
" max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -3917,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static union trace_enum_map_item *
+update_enum_map(union trace_enum_map_item *ptr)
+{
+ if (!ptr->map.enum_string) {
+ if (ptr->tail.next) {
+ ptr = ptr->tail.next;
+ /* Set ptr to the next real item (skip head) */
+ ptr++;
+ } else
+ return NULL;
+ }
+ return ptr;
+}
+
+static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ union trace_enum_map_item *ptr = v;
+
+ /*
+ * Paranoid! If ptr points to end, we don't want to increment past it.
+ * This really should never happen.
+ */
+ ptr = update_enum_map(ptr);
+ if (WARN_ON_ONCE(!ptr))
+ return NULL;
+
+ ptr++;
+
+ (*pos)++;
+
+ ptr = update_enum_map(ptr);
+
+ return ptr;
+}
+
+static void *enum_map_start(struct seq_file *m, loff_t *pos)
+{
+ union trace_enum_map_item *v;
+ loff_t l = 0;
+
+ mutex_lock(&trace_enum_mutex);
+
+ v = trace_enum_maps;
+ if (v)
+ v++;
+
+ while (v && l < *pos) {
+ v = enum_map_next(m, v, &l);
+ }
+
+ return v;
+}
+
+static void enum_map_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static int enum_map_show(struct seq_file *m, void *v)
+{
+ union trace_enum_map_item *ptr = v;
+
+ seq_printf(m, "%s %ld (%s)\n",
+ ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.system);
+
+ return 0;
+}
+
+static const struct seq_operations tracing_enum_map_seq_ops = {
+ .start = enum_map_start,
+ .next = enum_map_next,
+ .stop = enum_map_stop,
+ .show = enum_map_show,
+};
+
+static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_enum_map_seq_ops);
+}
+
+static const struct file_operations tracing_enum_map_fops = {
+ .open = tracing_enum_map_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static inline union trace_enum_map_item *
+trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+{
+ /* Return tail of array given the head */
+ return ptr + ptr->head.length + 1;
+}
+
+static void
+trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+ int len)
+{
+ struct trace_enum_map **stop;
+ struct trace_enum_map **map;
+ union trace_enum_map_item *map_array;
+ union trace_enum_map_item *ptr;
+
+ stop = start + len;
+
+ /*
+ * The trace_enum_maps contains the map plus a head and tail item,
+ * where the head holds the module and length of array, and the
+ * tail holds a pointer to the next list.
+ */
+ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
+ if (!map_array) {
+ pr_warning("Unable to allocate trace enum mapping\n");
+ return;
+ }
+
+ mutex_lock(&trace_enum_mutex);
+
+ if (!trace_enum_maps)
+ trace_enum_maps = map_array;
+ else {
+ ptr = trace_enum_maps;
+ for (;;) {
+ ptr = trace_enum_jmp_to_tail(ptr);
+ if (!ptr->tail.next)
+ break;
+ ptr = ptr->tail.next;
+
+ }
+ ptr->tail.next = map_array;
+ }
+ map_array->head.mod = mod;
+ map_array->head.length = len;
+ map_array++;
+
+ for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
+ map_array->map = **map;
+ map_array++;
+ }
+ memset(map_array, 0, sizeof(*map_array));
+
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static void trace_create_enum_file(struct dentry *d_tracer)
+{
+ trace_create_file("enum_map", 0444, d_tracer,
+ NULL, &tracing_enum_map_fops);
+}
+
+#else /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_create_enum_file(struct dentry *d_tracer) { }
+static inline void trace_insert_enum_map_file(struct module *mod,
+ struct trace_enum_map **start, int len) { }
+#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+
+static void trace_insert_enum_map(struct module *mod,
+ struct trace_enum_map **start, int len)
+{
+ struct trace_enum_map **map;
+
+ if (len <= 0)
+ return;
+
+ map = start;
+
+ trace_event_enum_update(map, len);
+
+ trace_insert_enum_map_file(mod, start, len);
+}
+
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4114,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
-static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+static void update_tracer_options(struct trace_array *tr, struct tracer *t)
{
static struct trace_option_dentry *topts;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir)
+ return;
+
+ /* Currently, only the top instance has options */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return;
+
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
@@ -4150,6 +4370,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
goto out;
}
+ /* If trace pipe files are being read, we can't change the tracer */
+ if (tr->current_trace->ref) {
+ ret = -EBUSY;
+ goto out;
+ }
+
trace_branch_disable();
tr->current_trace->enabled--;
@@ -4175,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
#endif
- /* Currently, only the top instance has options */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
- destroy_trace_option_files(topts);
- topts = create_trace_option_files(tr, t);
- }
+ update_tracer_options(tr, t);
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -4238,10 +4460,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
char buf[64];
int r;
@@ -4253,10 +4474,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
}
static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
unsigned long val;
int ret;
@@ -4269,6 +4489,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static ssize_t
+tracing_thresh_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_thresh_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
+ if (ret < 0)
+ goto out;
+
+ if (tr->current_trace->update_thresh) {
+ ret = tr->current_trace->update_thresh(tr);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = cnt;
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -4291,16 +4557,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
goto out;
}
- /*
- * We make a copy of the current tracer to avoid concurrent
- * changes on it while we are reading.
- */
- iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
- if (!iter->trace) {
- ret = -ENOMEM;
- goto fail;
- }
- *iter->trace = *tr->current_trace;
+ trace_seq_init(&iter->seq);
+ iter->trace = tr->current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -4327,6 +4585,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->trace->pipe_open(iter);
nonseekable_open(inode, filp);
+
+ tr->current_trace->ref++;
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -4346,6 +4606,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
+ tr->current_trace->ref--;
+
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
@@ -4353,7 +4615,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
free_cpumask_var(iter->started);
mutex_destroy(&iter->mutex);
- kfree(iter->trace);
kfree(iter);
trace_array_put(tr);
@@ -4386,7 +4647,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
-/* Must be called with trace_types_lock mutex held. */
+/* Must be called with iter->mutex held. */
static int tracing_wait_pipe(struct file *filp)
{
struct trace_iterator *iter = filp->private_data;
@@ -4412,15 +4673,12 @@ static int tracing_wait_pipe(struct file *filp)
mutex_unlock(&iter->mutex);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, false);
mutex_lock(&iter->mutex);
if (ret)
return ret;
-
- if (signal_pending(current))
- return -EINTR;
}
return 1;
@@ -4434,7 +4692,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- struct trace_array *tr = iter->tr;
ssize_t sret;
/* return any leftover data */
@@ -4444,12 +4701,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
trace_seq_init(&iter->seq);
- /* copy the tracer to avoid using a global lock all around */
- mutex_lock(&trace_types_lock);
- if (unlikely(iter->trace->name != tr->current_trace->name))
- *iter->trace = *tr->current_trace;
- mutex_unlock(&trace_types_lock);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
@@ -4487,18 +4738,18 @@ waitagain:
trace_access_lock(iter->cpu_file);
while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
- int len = iter->seq.len;
+ int save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
/* don't print partial lines */
- iter->seq.len = len;
+ iter->seq.seq.len = save_len;
break;
}
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
- if (iter->seq.len >= cnt)
+ if (trace_seq_used(&iter->seq) >= cnt)
break;
/*
@@ -4514,7 +4765,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.readpos >= iter->seq.len)
+ if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -4548,20 +4799,33 @@ static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
size_t count;
+ int save_len;
int ret;
/* Seq buffer is page-sized, exactly what we need. */
for (;;) {
- count = iter->seq.len;
+ save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
- count = iter->seq.len - count;
- if (rem < count) {
- rem = 0;
- iter->seq.len -= count;
+
+ if (trace_seq_has_overflowed(&iter->seq)) {
+ iter->seq.seq.len = save_len;
break;
}
+
+ /*
+ * This should not be hit, because it should only
+ * be set if the iter->seq overflowed. But check it
+ * anyway to be safe.
+ */
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- iter->seq.len -= count;
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ count = trace_seq_used(&iter->seq) - save_len;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.seq.len = save_len;
break;
}
@@ -4596,7 +4860,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
- struct trace_array *tr = iter->tr;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -4604,12 +4867,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
- /* copy the tracer to avoid using a global lock all around */
- mutex_lock(&trace_types_lock);
- if (unlikely(iter->trace->name != tr->current_trace->name))
- *iter->trace = *tr->current_trace;
- mutex_unlock(&trace_types_lock);
-
mutex_lock(&iter->mutex);
if (iter->trace->splice_read) {
@@ -4642,13 +4899,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- iter->seq.len);
+ trace_seq_used(&iter->seq));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = iter->seq.len;
+ spd.partial[i].len = trace_seq_used(&iter->seq);
trace_seq_init(&iter->seq);
}
@@ -4896,7 +5153,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
*fpos += written;
out_unlock:
- for (i = 0; i < nr_pages; i++){
+ for (i = nr_pages - 1; i >= 0; i--) {
kunmap_atomic(map_page[i]);
put_page(pages[i]);
}
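Running the cleanup loop backwards is not cosmetic: kmap_atomic() mappings nest like a stack, so they must be released newest-first. In miniature (a sketch, not the surrounding function):

	void *a = kmap_atomic(pages[0]);
	void *b = kmap_atomic(pages[1]);

	/* ... copy through both mappings ... */

	kunmap_atomic(b);	/* LIFO: release the most recent mapping first */
	kunmap_atomic(a);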
@@ -5170,6 +5427,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
#endif /* CONFIG_TRACER_SNAPSHOT */
+static const struct file_operations tracing_thresh_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_thresh_read,
+ .write = tracing_thresh_write,
+ .llseek = generic_file_llseek,
+};
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -5278,6 +5542,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
filp->private_data = info;
+ tr->current_trace->ref++;
+
mutex_unlock(&trace_types_lock);
ret = nonseekable_open(inode, filp);
@@ -5308,21 +5574,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!count)
return 0;
- mutex_lock(&trace_types_lock);
-
#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
- size = -EBUSY;
- goto out_unlock;
- }
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ return -EBUSY;
#endif
if (!info->spare)
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- size = -ENOMEM;
if (!info->spare)
- goto out_unlock;
+ return -ENOMEM;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -5338,25 +5599,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0) {
if (trace_empty(iter)) {
- if ((filp->f_flags & O_NONBLOCK)) {
- size = -EAGAIN;
- goto out_unlock;
- }
- mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
- mutex_lock(&trace_types_lock);
- if (ret) {
- size = ret;
- goto out_unlock;
- }
- if (signal_pending(current)) {
- size = -EINTR;
- goto out_unlock;
- }
+ if ((filp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+
+ ret = wait_on_pipe(iter, false);
+ if (ret)
+ return ret;
+
goto again;
}
- size = 0;
- goto out_unlock;
+ return 0;
}
info->read = 0;
@@ -5366,18 +5618,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
size = count;
ret = copy_to_user(ubuf, info->spare + info->read, size);
- if (ret == size) {
- size = -EFAULT;
- goto out_unlock;
- }
+ if (ret == size)
+ return -EFAULT;
+
size -= ret;
*ppos += size;
info->read += size;
- out_unlock:
- mutex_unlock(&trace_types_lock);
-
return size;
}
@@ -5388,6 +5636,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
+ iter->tr->current_trace->ref--;
+
__trace_array_put(iter->tr);
if (info->spare)
@@ -5471,32 +5721,22 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
};
struct buffer_ref *ref;
int entries, size, i;
- ssize_t ret;
-
- mutex_lock(&trace_types_lock);
+ ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto out;
- }
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd)) {
- ret = -ENOMEM;
- goto out;
- }
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
- if (*ppos & (PAGE_SIZE - 1)) {
- ret = -EINVAL;
- goto out;
- }
+ if (*ppos & (PAGE_SIZE - 1))
+ return -EINVAL;
if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE) {
- ret = -EINVAL;
- goto out;
- }
+ if (len < PAGE_SIZE)
+ return -EINVAL;
len &= PAGE_MASK;
}
@@ -5509,13 +5749,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int r;
ref = kzalloc(sizeof(*ref), GFP_KERNEL);
- if (!ref)
+ if (!ref) {
+ ret = -ENOMEM;
break;
+ }
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
+ ret = -ENOMEM;
kfree(ref);
break;
}
@@ -5553,26 +5796,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
- if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
- ret = -EAGAIN;
- goto out;
- }
- mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
- mutex_lock(&trace_types_lock);
if (ret)
- goto out;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+ return ret;
+
+ if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
+ return -EAGAIN;
+
+ ret = wait_on_pipe(iter, true);
+ if (ret)
+ return ret;
+
goto again;
}
ret = splice_to_pipe(pipe, &spd);
splice_shrink_spd(&spd);
-out:
- mutex_unlock(&trace_types_lock);
return ret;
}
@@ -5642,7 +5880,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "read events: %ld\n", cnt);
- count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+ count = simple_read_from_buffer(ubuf, count, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -5723,10 +5962,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%ps:", (void *)ip);
- seq_printf(m, "snapshot");
+ seq_puts(m, "snapshot");
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
@@ -5801,28 +6040,19 @@ static __init int register_snapshot_cmd(void)
static inline __init int register_snapshot_cmd(void) { return 0; }
#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
-struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
+static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (tr->dir)
- return tr->dir;
-
- if (!debugfs_initialized())
- return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+ /* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- tr->dir = debugfs_create_dir("tracing", NULL);
-
- if (!tr->dir)
- pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return NULL;
+ /* All sub buffers have a descriptor */
return tr->dir;
}
-struct dentry *tracing_init_dentry(void)
-{
- return tracing_init_dentry_tr(&global_trace);
-}
-
static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
struct dentry *d_tracer;
@@ -5830,14 +6060,14 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
if (tr->percpu_dir)
return tr->percpu_dir;
- d_tracer = tracing_init_dentry_tr(tr);
- if (!d_tracer)
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
return NULL;
- tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
WARN_ONCE(!tr->percpu_dir,
- "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
+ "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
}
@@ -5854,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
}
static void
-tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
+tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
@@ -5864,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
return;
snprintf(cpu_dir, 30, "cpu%ld", cpu);
- d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+ d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
- pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+ pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
return;
}
@@ -6018,9 +6248,9 @@ struct dentry *trace_create_file(const char *name,
{
struct dentry *ret;
- ret = debugfs_create_file(name, mode, parent, data, fops);
+ ret = tracefs_create_file(name, mode, parent, data, fops);
if (!ret)
- pr_warning("Could not create debugfs '%s' entry\n", name);
+ pr_warning("Could not create tracefs '%s' entry\n", name);
return ret;
}
@@ -6033,13 +6263,13 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
if (tr->options)
return tr->options;
- d_tracer = tracing_init_dentry_tr(tr);
- if (!d_tracer)
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
return NULL;
- tr->options = debugfs_create_dir("options", d_tracer);
+ tr->options = tracefs_create_dir("options", d_tracer);
if (!tr->options) {
- pr_warning("Could not create debugfs directory 'options'\n");
+ pr_warning("Could not create tracefs directory 'options'\n");
return NULL;
}
@@ -6107,10 +6337,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
if (!topts)
return;
- for (cnt = 0; topts[cnt].opt; cnt++) {
- if (topts[cnt].entry)
- debugfs_remove(topts[cnt].entry);
- }
+ for (cnt = 0; topts[cnt].opt; cnt++)
+ tracefs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6199,7 +6427,7 @@ static const struct file_operations rb_simple_fops = {
struct dentry *trace_instance_dir;
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6276,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
-static int new_instance_create(const char *name)
+static int instance_mkdir(const char *name)
{
struct trace_array *tr;
int ret;
@@ -6315,17 +6543,17 @@ static int new_instance_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ tr->dir = tracefs_create_dir(name, trace_instance_dir);
if (!tr->dir)
goto out_free_tr;
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
goto out_free_tr;
}
- init_tracer_debugfs(tr, tr->dir);
+ init_tracer_tracefs(tr, tr->dir);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6346,7 +6574,7 @@ static int new_instance_create(const char *name)
}
-static int instance_delete(const char *name)
+static int instance_rmdir(const char *name)
{
struct trace_array *tr;
int found = 0;
@@ -6365,7 +6593,7 @@ static int instance_delete(const char *name)
goto out_unlock;
ret = -EBUSY;
- if (tr->ref)
+ if (tr->ref || (tr->current_trace && tr->current_trace->ref))
goto out_unlock;
list_del(&tr->list);
@@ -6387,82 +6615,17 @@ static int instance_delete(const char *name)
return ret;
}
-static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the new_instance_create() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = new_instance_create(dentry->d_iname);
-
- mutex_lock(&inode->i_mutex);
-
- return ret;
-}
-
-static int instance_rmdir(struct inode *inode, struct dentry *dentry)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /* The caller did a dget() on dentry */
- mutex_unlock(&dentry->d_inode->i_mutex);
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the instance_delete() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = instance_delete(dentry->d_iname);
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
-
- return ret;
-}
-
-static const struct inode_operations instance_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = instance_mkdir,
- .rmdir = instance_rmdir,
-};
-
static __init void create_trace_instances(struct dentry *d_tracer)
{
- trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
+ instance_mkdir,
+ instance_rmdir);
if (WARN_ON(!trace_instance_dir))
return;
-
- /* Hijack the dir inode operations, to allow mkdir */
- trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
}
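
The tracefs_create_instance_dir() call replaces the inode-operations hijack
removed above: tracefs handles the VFS locking itself and hands the callbacks
nothing but the new directory's name. A hedged sketch of the callback shape
this implies (names hypothetical, assuming the tracefs API added with this
series):

    /* Sketch only: the mkdir/rmdir hooks see just the directory name. */
    static int example_instance_mkdir(const char *name)
    {
            /* allocate per-instance state keyed by name */
            return 0;               /* or a -errno on failure */
    }

    static int example_instance_rmdir(const char *name)
    {
            /* refuse with -EBUSY while referenced, else tear down */
            return 0;
    }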
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
int cpu;
@@ -6516,24 +6679,162 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
#endif
for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(tr, cpu);
+ tracing_init_tracefs_percpu(tr, cpu);
+
+}
+
+static struct vfsmount *trace_automount(void *ignore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
+/**
+ * tracing_init_dentry - initialize top level trace array
+ *
+ * This is called when creating files or directories in the tracing
+ * directory. It is called via fs_initcall() by the boot-up code and is
+ * expected to return the dentry of the top-level tracing directory.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+ struct trace_array *tr = &global_trace;
+
+ /* The top level trace array uses NULL as parent */
+ if (tr->dir)
+ return NULL;
+
+ if (WARN_ON(!debugfs_initialized()))
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kernel.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
+ if (!tr->dir) {
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return NULL;
+}
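
Note the convention this function establishes: an ERR_PTR() return means the
filesystem is unusable, while NULL is success and tells callers to create
their files at the tracefs top level. A caller-side sketch (example_fops is
hypothetical):

    static __init int example_tracefs_init(void)
    {
            struct dentry *d = tracing_init_dentry();

            if (IS_ERR(d))  /* no debugfs/tracefs: give up quietly */
                    return 0;
            /* d may be NULL here, meaning "top level of tracefs" */
            trace_create_file("example", 0444, d, NULL, &example_fops);
            return 0;
    }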
+extern struct trace_enum_map *__start_ftrace_enum_maps[];
+extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+
+static void __init trace_enum_init(void)
+{
+ int len;
+
+ len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
+ trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+}
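
The __start/__stop symbol pair is the usual linker-section trick:
TRACE_DEFINE_ENUM() emits a pointer into a named ELF section and the linker
provides bounding symbols, so boot code can walk the section as an array. A
self-contained userspace analogue (GCC/Clang on ELF; all names illustrative):

    #include <stdio.h>

    struct map { const char *name; long value; };

    /* Each "definition site" drops one pointer into section "maps". */
    #define DEFINE_MAP(n, v)                                        \
            static struct map __m_##n = { #n, v };                  \
            static struct map *__p_##n                              \
            __attribute__((section("maps"), used)) = &__m_##n

    extern struct map *__start_maps[], *__stop_maps[];

    DEFINE_MAP(FOO, 1);
    DEFINE_MAP(BAR, 2);

    int main(void)
    {
            long len = __stop_maps - __start_maps;  /* element count */

            for (long i = 0; i < len; i++)
                    printf("%s = %ld\n", __start_maps[i]->name,
                           __start_maps[i]->value);
            return 0;
    }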
+
+#ifdef CONFIG_MODULES
+static void trace_module_add_enums(struct module *mod)
+{
+ if (!mod->num_trace_enums)
+ return;
+
+ /*
+ * Modules with bad taint do not have events created, so do
+ * not bother with enums either.
+ */
+ if (trace_module_has_bad_taint(mod))
+ return;
+
+ trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+}
+
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static void trace_module_remove_enums(struct module *mod)
+{
+ union trace_enum_map_item *map;
+ union trace_enum_map_item **last = &trace_enum_maps;
+
+ if (!mod->num_trace_enums)
+ return;
+
+ mutex_lock(&trace_enum_mutex);
+
+ map = trace_enum_maps;
+
+ while (map) {
+ if (map->head.mod == mod)
+ break;
+ map = trace_enum_jmp_to_tail(map);
+ last = &map->tail.next;
+ map = map->tail.next;
+ }
+ if (!map)
+ goto out;
+
+ *last = trace_enum_jmp_to_tail(map)->tail.next;
+ kfree(map);
+ out:
+ mutex_unlock(&trace_enum_mutex);
}
+#else
+static inline void trace_module_remove_enums(struct module *mod) { }
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_enums(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_enums(mod);
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+#endif /* CONFIG_MODULES */
-static __init int tracer_init_debugfs(void)
+static __init int tracer_init_tracefs(void)
{
struct dentry *d_tracer;
trace_access_lock_init();
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- init_tracer_debugfs(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
+ &global_trace, &tracing_thresh_fops);
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
@@ -6544,6 +6845,14 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
+ trace_enum_init();
+
+ trace_create_enum_file(d_tracer);
+
+#ifdef CONFIG_MODULES
+ register_module_notifier(&trace_module_nb);
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6553,6 +6862,10 @@ static __init int tracer_init_debugfs(void)
create_trace_options_dir(&global_trace);
+ /* If the tracer was started via cmdline, create options for it here */
+ if (global_trace.current_trace != &nop_trace)
+ update_tracer_options(&global_trace, global_trace.current_trace);
+
return 0;
}
@@ -6607,11 +6920,19 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= TRACE_MAX_PRINT)
- s->len = TRACE_MAX_PRINT;
+ if (s->seq.len >= TRACE_MAX_PRINT)
+ s->seq.len = TRACE_MAX_PRINT;
+
+ /*
+ * More paranoid code. Although the buffer size is set to
+ * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
+ * an extra layer of protection.
+ */
+ if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
+ s->seq.len = s->seq.size - 1;
/* should be zero ended, but we are paranoid. */
- s->buffer[s->len] = 0;
+ s->buffer[s->seq.len] = 0;
printk(KERN_TRACE "%s", s->buffer);
@@ -6752,7 +7073,6 @@ __init static int tracer_alloc_buffers(void)
int ring_buf_size;
int ret = -ENOMEM;
-
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -6850,6 +7170,18 @@ out:
return ret;
}
+void __init trace_init(void)
+{
+ if (tracepoint_printk) {
+ tracepoint_print_iter =
+ kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ if (WARN_ON(!tracepoint_print_iter))
+ tracepoint_printk = 0;
+ }
+ tracer_alloc_buffers();
+ trace_event_init();
+}
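
trace_init() supersedes the early_initcall(tracer_alloc_buffers) removed at
the bottom of this file: the core now calls it directly so ring buffers and
event structures exist before any initcall runs. Assuming the usual call site
in init/main.c, the boot ordering is roughly:

    /* Sketch of the ordering this change relies on (init/main.c). */
    asmlinkage __visible void __init start_kernel(void)
    {
            /* ... early setup ... */
            trace_init();   /* buffers + event structures ready */
            /* ... */
            rest_init();    /* later runs the initcalls, including
                             * fs_initcall(tracer_init_tracefs) */
    }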
+
__init static int clear_boot_tracer(void)
{
/*
@@ -6869,6 +7201,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-early_initcall(tracer_alloc_buffers);
-fs_initcall(tracer_init_debugfs);
+fs_initcall(tracer_init_tracefs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..d2612016de94 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
#include <linux/compiler.h>
+#include <linux/trace_seq.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -333,12 +334,13 @@ struct tracer_flags {
/**
- * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @close: called when the trace file is released
@@ -357,6 +359,7 @@ struct tracer {
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
+ int (*update_thresh)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
@@ -385,6 +388,7 @@ struct tracer {
struct tracer *next;
struct tracer_flags *flags;
int enabled;
+ int ref;
bool print_max;
bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -538,7 +542,6 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
-struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
@@ -567,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc);
-
-void tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *cur,
- unsigned long flags, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -595,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
-void tracing_sched_switch_assign_trace(struct trace_array *tr);
-void tracing_stop_sched_switch_record(void);
-void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -717,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
+extern char trace_find_mark(unsigned long long duration);
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -735,7 +728,7 @@ extern unsigned long trace_flags;
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
-extern enum print_line_t
+extern void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
@@ -1308,4 +1301,20 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
#define perf_ftrace_event_register NULL
#endif
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+void trace_event_enum_update(struct trace_enum_map **map, int len);
+#else
+static inline void __init trace_event_init(void) { }
+static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+#endif
+
+extern struct trace_iterator *tracepoint_print_iter;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..57cbf1efdd44 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -7,7 +7,6 @@
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/irqflags.h>
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
@@ -151,22 +150,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
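
This hunk is part of the wider trace_seq API change in this series: output
helpers no longer return a status on every call; overflow is latched in the
trace_seq and checked once at the end. trace_handle_return() is, paraphrasing
include/linux/trace_seq.h from the same series, essentially:

    static inline enum print_line_t trace_handle_return(struct trace_seq *s)
    {
            return trace_seq_has_overflowed(s) ?
                    TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
    }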
static void branch_print_header(struct seq_file *s)
{
seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
- " FUNC:FILE:LINE\n");
- seq_puts(s, "# | | | | | "
- " |\n");
+ " FUNC:FILE:LINE\n"
+ "# | | | | | "
+ " |\n");
}
static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +231,12 @@ extern unsigned long __stop_annotated_branch_profile[];
static int annotated_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " correct incorrect % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
@@ -274,7 +272,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
- seq_printf(m, " X ");
+ seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +360,12 @@ extern unsigned long __stop_branch_profile[];
static int all_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " miss hit %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " miss hit % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e2d027ac66a2..ee7b94a4810a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->fmt),
FILTER_OTHER
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
FILTER_OTHER
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
__field( const char *, str )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->str),
FILTER_OTHER
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 5d12bb407b44..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return ret;
}
+ /*
+ * We checked and allowed to create parent,
+ * allow children without checking.
+ */
+ if (p_event->parent)
+ return 0;
+
+ /*
+ * It's ok to check current process (owner) permissions in here,
+ * because code below is called only via perf_event_open syscall.
+ */
+
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
@@ -249,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
}
void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+ struct pt_regs **regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -268,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
if (*rctxp < 0)
return NULL;
+ if (regs)
+ *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2de53628689f..3ab69fb72b85 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,10 +8,12 @@
*
*/
+#define pr_fmt(fmt) fmt
+
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
@@ -210,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+
+static void output_printk(struct ftrace_event_buffer *fbuffer)
+{
+ struct ftrace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ if (!iter)
+ return;
+
+ event_call = fbuffer->ftrace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->ftrace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
{
+ if (tracepoint_printk)
+ output_printk(fbuffer);
+
event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
@@ -446,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
+ tracefs_remove_recursive(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -459,13 +493,13 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dir->d_subdirs, d_child) {
if (child->d_inode) /* probably unneeded */
child->d_inode->i_private = NULL;
}
spin_unlock(&dir->d_lock);
- debugfs_remove_recursive(dir);
+ tracefs_remove_recursive(dir);
}
list_del(&file->list);
@@ -531,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
+ int ret;
/*
* The buf format can be <subsystem>:<event-name>
@@ -556,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
event = NULL;
}
- return __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+
+ /* Put back the colon to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ':';
+
+ return ret;
}
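
The colon has to be put back because strsep() parses destructively: it
overwrites the delimiter with '\0' and advances the caller's pointer, so a
second pass over the same buffer would otherwise stop at the first token. A
small runnable illustration (plain C, not from the patch):

    #define _DEFAULT_SOURCE         /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char buf[] = "sched:sched_switch";
            char *p = buf;
            char *sub = strsep(&p, ":"); /* buf: "sched\0sched_switch" */

            printf("sub=%s event=%s\n", sub, p);
            if (p)                  /* p sits just past the clobbered ':' */
                    *(p - 1) = ':'; /* put the delimiter back */
            printf("restored=%s\n", buf); /* whole string again */
            return 0;
    }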
/**
@@ -916,7 +957,7 @@ static int f_show(struct seq_file *m, void *v)
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
- seq_printf(m, "format:\n");
+ seq_puts(m, "format:\n");
return 0;
case FORMAT_FIELD_SEPERATOR:
@@ -1042,7 +1083,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_unlock(&event_mutex);
if (file)
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1208,7 +1250,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
print_subsystem_event_filter(system, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1263,7 +1306,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
func(s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1489,9 +1533,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = debugfs_create_dir(name, parent);
+ dir->entry = tracefs_create_dir(name, parent);
if (!dir->entry) {
- pr_warning("Failed to create system directory %s\n", name);
+ pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
@@ -1502,12 +1546,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = debugfs_create_file("filter", 0644, dir->entry, dir,
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1522,8 +1566,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
out_fail:
/* Only print this message if failed on memory allocation */
if (!dir || !system)
- pr_warning("No memory to create event subsystem %s\n",
- name);
+ pr_warn("No memory to create event subsystem %s\n", name);
return NULL;
}
@@ -1549,10 +1592,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
d_events = parent;
name = ftrace_event_name(call);
- file->dir = debugfs_create_dir(name, d_events);
+ file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
- pr_warning("Could not create debugfs '%s' directory\n",
- name);
+ pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
@@ -1575,8 +1617,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
if (list_empty(head)) {
ret = call->class->define_fields(call);
if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", name);
+ pr_warn("Could not initialize trace point events/%s\n",
+ name);
return -1;
}
}
@@ -1621,7 +1663,6 @@ static void event_remove(struct ftrace_event_call *call)
if (file->event_call != call)
continue;
ftrace_event_enable_disable(file, 0);
- destroy_preds(file);
/*
* The do_for_each_event_file() is
* a double loop. After finding the call for this
@@ -1649,8 +1690,7 @@ static int event_init(struct ftrace_event_call *call)
if (call->class->raw_init) {
ret = call->class->raw_init(call);
if (ret < 0 && ret != -ENOSYS)
- pr_warn("Could not initialize trace events/%s\n",
- name);
+ pr_warn("Could not initialize trace events/%s\n", name);
}
return ret;
@@ -1671,6 +1711,131 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
return 0;
}
+static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+{
+ int rlen;
+ int elen;
+
+ /* Find the length of the enum value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Make sure there's enough room to replace the string with the value */
+ if (len < elen)
+ return NULL;
+
+ snprintf(ptr, elen + 1, "%ld", map->enum_value);
+
+ /* Get the rest of the string of ptr */
+ rlen = strlen(ptr + len);
+ memmove(ptr + elen, ptr + len, rlen);
+ /* Make sure we end the new string */
+ ptr[elen + rlen] = 0;
+
+ return ptr + elen;
+}
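
The in-place splice works only because a decimal value never needs more room
than the enum symbol it replaces, which is what the len < elen check guards.
A runnable userspace rendition with a worked example; the digits are staged
in a temporary buffer so snprintf's terminating NUL cannot touch the tail
(the GFP_KERNEL mapping below is made up for illustration):

    #include <stdio.h>
    #include <string.h>

    /* Replace the len-byte identifier at ptr with value, in place. */
    static char *num_replace(char *ptr, long value, int len)
    {
            char digits[24];
            int elen = snprintf(digits, sizeof(digits), "%ld", value);

            if (len < elen)         /* value would not fit: give up */
                    return NULL;
            /* slide the tail (with its '\0') up to where the digits end */
            memmove(ptr + elen, ptr + len, strlen(ptr + len) + 1);
            memcpy(ptr, digits, elen);      /* splice digits, no '\0' */
            return ptr + elen;
    }

    int main(void)
    {
            char fmt[] = "mode=%d (GFP_KERNEL)";
            char *p = strstr(fmt, "GFP_KERNEL");

            num_replace(p, 208, strlen("GFP_KERNEL"));
            printf("%s\n", fmt);    /* prints: mode=%d (208) */
            return 0;
    }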
+
+static void update_event_printk(struct ftrace_event_call *call,
+ struct trace_enum_map *map)
+{
+ char *ptr;
+ int quote = 0;
+ int len = strlen(map->enum_string);
+
+ for (ptr = call->print_fmt; *ptr; ptr++) {
+ if (*ptr == '\\') {
+ ptr++;
+ /* paranoid */
+ if (!*ptr)
+ break;
+ continue;
+ }
+ if (*ptr == '"') {
+ quote ^= 1;
+ continue;
+ }
+ if (quote)
+ continue;
+ if (isdigit(*ptr)) {
+ /* skip numbers */
+ do {
+ ptr++;
+ /* Check for alpha chars like ULL */
+ } while (isalnum(*ptr));
+ if (!*ptr)
+ break;
+ /*
+ * A number must have some kind of delimiter after
+ * it, and we can ignore that too.
+ */
+ continue;
+ }
+ if (isalpha(*ptr) || *ptr == '_') {
+ if (strncmp(map->enum_string, ptr, len) == 0 &&
+ !isalnum(ptr[len]) && ptr[len] != '_') {
+ ptr = enum_replace(ptr, map, len);
+ /* Hmm, enum string smaller than value */
+ if (WARN_ON_ONCE(!ptr))
+ return;
+ /*
+ * No need to decrement here, as enum_replace()
+ * returns the pointer to the character just past
+ * the enum, and two enums cannot be placed
+ * back to back without something in between.
+ * We can skip that something in between.
+ */
+ continue;
+ }
+ skip_more:
+ do {
+ ptr++;
+ } while (isalnum(*ptr) || *ptr == '_');
+ if (!*ptr)
+ break;
+ /*
+ * If what comes after this variable is a '.' or
+ * '->' then we can continue to ignore that string.
+ */
+ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
+ ptr += *ptr == '.' ? 1 : 2;
+ if (!*ptr)
+ break;
+ goto skip_more;
+ }
+ /*
+ * Once again, we can skip the delimiter that came
+ * after the string.
+ */
+ continue;
+ }
+ }
+}
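
Concretely: for a hypothetical print_fmt like the one below, only the bare
identifier matching the map is rewritten; quoted text, numbers, and member
accesses all survive the scan untouched:

    /* before: "flags %d %s", __entry->f & MY_ENUM, "MY_ENUM"
     * after:  "flags %d %s", __entry->f & 4,       "MY_ENUM"
     *
     * (MY_ENUM -> 4 is a made-up mapping. The quoted "MY_ENUM" is kept
     *  because the quote toggling skips it, and f is skipped because it
     *  follows the "->" member-access delimiter.)
     */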
+
+void trace_event_enum_update(struct trace_enum_map **map, int len)
+{
+ struct ftrace_event_call *call, *p;
+ const char *last_system = NULL;
+ int last_i;
+ int i;
+
+ down_write(&trace_event_sem);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ /* events are usually grouped together with systems */
+ if (!last_system || call->class->system != last_system) {
+ last_i = 0;
+ last_system = call->class->system;
+ }
+
+ for (i = last_i; i < len; i++) {
+ if (call->class->system == map[i]->system) {
+ /* Save the first system if need be */
+ if (!last_i)
+ last_i = i;
+ update_event_printk(call, map[i]);
+ }
+ }
+ }
+ up_write(&trace_event_sem);
+}
+
static struct ftrace_event_file *
trace_create_new_event(struct ftrace_event_call *call,
struct trace_array *tr)
@@ -1749,7 +1914,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- destroy_call_preds(call);
+ free_event_filter(call->filter);
+ call->filter = NULL;
}
static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1881,7 +2047,7 @@ static int trace_module_notify(struct notifier_block *self,
static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
- .priority = 0,
+ .priority = 1, /* higher than trace.c module notify */
};
#endif /* CONFIG_MODULES */
@@ -1895,8 +2061,8 @@ __trace_add_event_dirs(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(call));
}
}
@@ -1989,7 +2155,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
ftrace_event_name(data->file->event_call));
if (data->count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", data->count);
@@ -2194,7 +2360,7 @@ static inline int register_event_cmds(void) { return 0; }
/*
* The top level array has already had its ftrace_event_file
* descriptors created in order to allow for early events to
- * be recorded. This function is called after the debugfs has been
+ * be recorded. This function is called after tracefs has been
* initialized, and we now have to create the files associated
* to the events.
*/
@@ -2208,8 +2374,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(file->event_call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
}
}
@@ -2232,8 +2398,8 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create early event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create early event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2277,16 +2443,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = debugfs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warning("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create tracefs 'set_event' entry\n");
return -ENOMEM;
}
- d_events = debugfs_create_dir("events", parent);
+ d_events = tracefs_create_dir("events", parent);
if (!d_events) {
- pr_warning("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
@@ -2378,7 +2544,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- debugfs_remove_recursive(tr->event_dir);
+ tracefs_remove_recursive(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -2395,12 +2561,39 @@ static __init int event_trace_memsetup(void)
return 0;
}
+static __init void
+early_enable_events(struct trace_array *tr, bool disable_first)
+{
+ char *buf = bootup_event_buf;
+ char *token;
+ int ret;
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ /* Restarting syscalls requires that we stop them first */
+ if (disable_first)
+ ftrace_set_clr_event(tr, token, 0);
+
+ ret = ftrace_set_clr_event(tr, token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+
+ /* Put back the comma to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ',';
+ }
+}
+
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
struct ftrace_event_call **iter, *call;
- char *buf = bootup_event_buf;
- char *token;
int ret;
if (!tr)
@@ -2422,18 +2615,7 @@ static __init int event_trace_enable(void)
*/
__trace_early_add_events(tr);
- while (true) {
- token = strsep(&buf, ",");
-
- if (!token)
- break;
- if (!*token)
- continue;
-
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
- }
+ early_enable_events(tr, false);
trace_printk_start_comm();
@@ -2444,6 +2626,31 @@ static __init int event_trace_enable(void)
return 0;
}
+/*
+ * event_trace_enable() is called from trace_event_init() first to
+ * initialize events and perhaps start any events that are on the
+ * command line. Unfortunately, some events cannot start this early,
+ * such as the system call tracepoints, which need the
+ * TIF_SYSCALL_TRACEPOINT flag set on pid 1. But event_trace_enable()
+ * is called before pid 1 starts, so that flag is never set and the
+ * syscall tracepoints are never reached, even though the events are
+ * enabled (and do nothing).
+ */
+static __init int event_trace_enable_again(void)
+{
+ struct trace_array *tr;
+
+ tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
+ early_enable_events(tr, true);
+
+ return 0;
+}
+
+early_initcall(event_trace_enable_again);
+
static __init int event_trace_init(void)
{
struct trace_array *tr;
@@ -2456,17 +2663,16 @@ static __init int event_trace_init(void)
return -ENODEV;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("available_events", 0444, d_tracer,
+ entry = tracefs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warning("Could not create debugfs "
- "'available_events' entry\n");
+ pr_warn("Could not create tracefs 'available_events' entry\n");
if (trace_define_common_fields())
- pr_warning("tracing: Failed to allocate common fields");
+ pr_warn("tracing: Failed to allocate common fields");
ret = early_event_add_tracer(d_tracer, tr);
if (ret)
@@ -2475,12 +2681,18 @@ static __init int event_trace_init(void)
#ifdef CONFIG_MODULES
ret = register_module_notifier(&trace_module_nb);
if (ret)
- pr_warning("Failed to register trace events module notifier\n");
+ pr_warn("Failed to register trace events module notifier\n");
#endif
return 0;
}
-early_initcall(event_trace_memsetup);
-core_initcall(event_trace_enable);
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2515,8 +2727,11 @@ static __init int event_test_thread(void *unused)
kfree(test_malloc);
set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop())
+ while (!kthread_should_stop()) {
schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
return 0;
}
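
This is the canonical fix for a subtle kthread bug: schedule() returns with
the task back in TASK_RUNNING, so unless the state is reset to
TASK_INTERRUPTIBLE before the next kthread_should_stop() check, the loop
degenerates into a busy spin after the first wakeup. The corrected idiom, as
a standalone sketch:

    /* Sketch of the standard kthread wait loop. */
    static int example_thread(void *unused)
    {
            set_current_state(TASK_INTERRUPTIBLE);
            while (!kthread_should_stop()) {
                    schedule();     /* sleep until woken or stopped */
                    /* re-arm the sleep state before re-checking */
                    set_current_state(TASK_INTERRUPTIBLE);
            }
            __set_current_state(TASK_RUNNING); /* leave in a sane state */
            return 0;
    }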
@@ -2579,7 +2794,7 @@ static __init void event_trace_self_tests(void)
* it and the self test should not be on.
*/
if (file->flags & FTRACE_EVENT_FL_ENABLED) {
- pr_warning("Enabled event during self test!\n");
+ pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
@@ -2607,8 +2822,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling system %s\n",
- system->name);
+ pr_warn("error enabling system %s\n",
+ system->name);
continue;
}
@@ -2616,8 +2831,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling system %s\n",
- system->name);
+ pr_warn("error disabling system %s\n",
+ system->name);
continue;
}
@@ -2631,7 +2846,7 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling all events\n");
+ pr_warn("error enabling all events\n");
return;
}
@@ -2640,7 +2855,7 @@ static __init void event_trace_self_tests(void)
/* reset sysname */
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling all events\n");
+ pr_warn("error disabling all events\n");
return;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
OP_GT,
OP_GE,
OP_BAND,
+ OP_NOT,
OP_NONE,
OP_OPEN_PAREN,
};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
{ OP_GT, ">", 5 },
{ OP_GE, ">=", 5 },
{ OP_BAND, "&", 6 },
+ { OP_NOT, "!", 6 },
{ OP_NONE, "OP_NONE", 0 },
{ OP_OPEN_PAREN, "(", 0 },
};
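
The new OP_NOT entry gives '!' the same precedence as '&' and lets a filter
negate a whole parenthesized expression rather than pushing the negation into
each field test. A usage sketch (sched_switch field names, shown purely as an
illustration):

    /* echo '!(prev_prio < 100 && prev_state == 1)' \
     *         > events/sched/sched_switch/filter
     *
     * was previously only expressible field by field:
     *
     * echo 'prev_prio >= 100 || prev_state != 1' > .../filter
     */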
@@ -85,6 +87,7 @@ enum {
FILT_ERR_MISSING_FIELD,
FILT_ERR_INVALID_FILTER,
FILT_ERR_IP_FIELD_ONLY,
+ FILT_ERR_ILLEGAL_NOT_OP,
};
static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
"Missing field name and/or value",
"Meaningless filter expression",
"Only 'ip' field is supported for function trace",
+ "Illegal use of '!'",
};
struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
int index;
};
+/* Normalize the match to 0/1 and invert it when the predicate carries a NOT */
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
break; \
} \
\
- return match; \
+ return !!match == !pred->not; \
}
#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
if (!WARN_ON_ONCE(!pred->fn))
match = pred->fn(pred, rec);
if (!!match == type)
- return match;
+ break;
}
- return match;
+	/* Normalize the match to 0/1 and invert it when the op carries a NOT */
+ return !!match == !op->not;
}
struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
* then this op can be folded.
*/
if (left->index & FILTER_PRED_FOLD &&
- (left->op == dest->op ||
+ ((left->op == dest->op && !left->not) ||
left->left == FILTER_PRED_INVALID) &&
right->index & FILTER_PRED_FOLD &&
- (right->op == dest->op ||
+ ((right->op == dest->op && !right->not) ||
right->left == FILTER_PRED_INVALID))
dest->index |= FILTER_PRED_FOLD;
@@ -774,17 +780,12 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void call_filter_disable(struct ftrace_event_call *call)
-{
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
-}
-
static void filter_disable(struct ftrace_event_file *file)
{
struct ftrace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call_filter_disable(call);
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
else
file->flags &= ~FTRACE_EVENT_FL_FILTERED;
}
@@ -804,32 +805,6 @@ void free_event_filter(struct event_filter *filter)
__free_filter(filter);
}
-void destroy_call_preds(struct ftrace_event_call *call)
-{
- __free_filter(call->filter);
- call->filter = NULL;
-}
-
-static void destroy_file_preds(struct ftrace_event_file *file)
-{
- __free_filter(file->filter);
- file->filter = NULL;
-}
-
-/*
- * Called when destroying the ftrace_event_file.
- * The file is being freed, so we do not need to worry about
- * the file being currently used. This is for module code removing
- * the tracepoints from within it.
- */
-void destroy_preds(struct ftrace_event_file *file)
-{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- destroy_call_preds(file->event_call);
- else
- destroy_file_preds(file);
-}
-
static struct event_filter *__alloc_filter(void)
{
struct event_filter *filter;
@@ -873,17 +848,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
remove_filter_string(file->filter);
}
-static void filter_free_subsystem_preds(struct event_subsystem *system,
+static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
-
__remove_filter(file);
}
}
@@ -901,15 +873,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
}
}
-static void filter_free_subsystem_filters(struct event_subsystem *system,
+static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
__free_subsystem_filter(file);
}
@@ -1064,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
}
if (pred->op == OP_NE)
- pred->not = 1;
+ pred->not ^= 1;
pred->fn = fn;
return 0;
@@ -1582,7 +1552,6 @@ static int fold_pred_tree(struct event_filter *filter,
static int replace_preds(struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
- char *filter_string,
bool dry_run)
{
char *operand1 = NULL, *operand2 = NULL;
@@ -1627,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
continue;
}
+ if (elt->op == OP_NOT) {
+ if (!n_preds || operand1 || operand2) {
+ parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ if (!dry_run)
+ filter->preds[n_preds - 1].not ^= 1;
+ continue;
+ }
+
if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
err = -ENOSPC;
@@ -1755,13 +1735,12 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct event_subsystem *system,
+static int replace_system_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
struct filter_list *filter_item;
struct filter_list *tmp;
LIST_HEAD(filter_list);
@@ -1769,15 +1748,14 @@ static int replace_system_preds(struct event_subsystem *system,
int err;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
/*
* Try to see if the filter can be applied
* (filter arg is ignored on dry_run)
*/
- err = replace_preds(call, NULL, ps, filter_string, true);
+ err = replace_preds(file->event_call, NULL, ps, true);
if (err)
event_set_no_set_filter_flag(file);
else
@@ -1787,9 +1765,7 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(file, &tr->events, list) {
struct event_filter *filter;
- call = file->event_call;
-
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
if (event_no_set_filter_flag(file))
@@ -1811,7 +1787,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
goto fail_mem;
- err = replace_preds(call, filter, ps, filter_string, false);
+ err = replace_preds(file->event_call, filter, ps, false);
if (err) {
filter_disable(file);
parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1909,7 @@ static int create_filter(struct ftrace_event_call *call,
err = create_filter_start(filter_str, set_str, &ps, &filter);
if (!err) {
- err = replace_preds(call, filter, ps, filter_str, false);
+ err = replace_preds(call, filter, ps, false);
if (err && set_str)
append_filter_err(ps, filter);
}
@@ -1959,7 +1935,7 @@ int create_event_filter(struct ftrace_event_call *call,
* Identical to create_filter() except that it creates a subsystem filter
* and always remembers @filter_str.
*/
-static int create_system_filter(struct event_subsystem *system,
+static int create_system_filter(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
@@ -1969,7 +1945,7 @@ static int create_system_filter(struct event_subsystem *system,
err = create_filter_start(filter_str, true, &ps, &filter);
if (!err) {
- err = replace_system_preds(system, tr, ps, filter_str);
+ err = replace_system_preds(dir, tr, ps, filter_str);
if (!err) {
/* System filters just show a default message */
kfree(filter->filter_string);
@@ -2053,18 +2029,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
}
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, tr);
+ filter_free_subsystem_preds(dir, tr);
remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
synchronize_sched();
- filter_free_subsystem_filters(system, tr);
+ filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
goto out_unlock;
}
- err = create_system_filter(system, tr, filter_string, &filter);
+ err = create_system_filter(dir, tr, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
{
long count = (long)data;
- seq_printf(m, "%s", name);
+ seq_puts(m, name);
if (count == -1)
seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
if (filter_str)
seq_printf(m, " if %s\n", filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
if (data->filter_str)
seq_printf(m, " if %s\n", data->filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4ddde28a81a..174a6a71146c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -6,12 +6,10 @@
#include <linux/stringify.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/fs.h>
#include "trace_output.h"
@@ -179,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \
}, \
.event.type = etype, \
.print_fmt = print, \
- .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
struct ftrace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static int update_count(void **data)
+static void update_traceon_count(void **data, bool on)
{
- unsigned long *count = (long *)data;
+ long *count = (long *)data;
+ long old_count = *count;
- if (!*count)
- return 0;
+ /*
+ * Tracing gets disabled (or enabled) once per count.
+ * This function can be called at the same time on multiple CPUs.
+ * It is fine if both disable (or enable) tracing, as disabling
+ * (or enabling) the second time doesn't do anything as the
+ * state of the tracer is already disabled (or enabled).
+ * What needs to be synchronized in this case is that the count
+ * only gets decremented once, even if the tracer is disabled
+ * (or enabled) twice, as the second one is really a nop.
+ *
+ * The memory barriers guarantee that we only decrement the
+ * counter once. First the count is read to a local variable
+ * and a read barrier is used to make sure that it is loaded
+ * before checking if the tracer is in the state we want.
+ * If the tracer is not in the state we want, then the count
+ * is guaranteed to be the old count.
+ *
+ * Next the tracer is set to the state we want (disabled or enabled)
+ * then a write memory barrier is used to make sure that
+ * the new state is visible before changing the counter by
+ * one minus the old counter. This guarantees that another CPU
+ * executing this code will see the new state before seeing
+ * the new counter value, and would not do anything if the new
+ * counter is seen.
+ *
+ * Note, there is no synchronization between this and a user
+ * setting the tracing_on file. But we currently don't care
+ * about that.
+ */
+ if (!old_count)
+ return;
- if (*count != -1)
- (*count)--;
+ /* Make sure we see count before checking tracing state */
+ smp_rmb();
- return 1;
+ if (on == !!tracing_is_on())
+ return;
+
+ if (on)
+ tracing_on();
+ else
+ tracing_off();
+
+ /* unlimited? */
+ if (old_count == -1)
+ return;
+
+ /* Make sure tracing state is visible before updating count */
+ smp_wmb();
+
+ *count = old_count - 1;
}
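
The smp_rmb()/smp_wmb() pair follows the usual publish/observe discipline:
the writer makes the new tracing state visible before the decremented count,
and every reader loads the count before sampling the state. In outline (a
sketch of the ordering contract, not extra code from the patch):

    /* CPU A (switches tracing)        CPU B (racing caller)
     *   old = *count;                   old = *count;
     *   smp_rmb();                      smp_rmb();
     *   state differs -> proceed        state already switched
     *   tracing_on()/tracing_off();       -> returns, never decrements
     *   smp_wmb();
     *   *count = old - 1;
     */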
static void
ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_on();
+ update_traceon_count(data, 1);
}
static void
ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_off();
+ update_traceon_count(data, 0);
}
static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
static void
ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
+ long *count = (long *)data;
+ long old_count;
+ long new_count;
- if (update_count(data))
- trace_dump_stack(STACK_SKIP);
+ /*
+ * Stack traces should only execute the number of times the
+ * user specified in the counter.
+ */
+ do {
+
+ if (!tracing_is_on())
+ return;
+
+ old_count = *count;
+
+ if (!old_count)
+ return;
+
+ /* unlimited? */
+ if (old_count == -1) {
+ trace_dump_stack(STACK_SKIP);
+ return;
+ }
+
+ new_count = old_count - 1;
+ new_count = cmpxchg(count, old_count, new_count);
+ if (new_count == old_count)
+ trace_dump_stack(STACK_SKIP);
+
+ } while (new_count != old_count);
+}
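
The loop is the standard cmpxchg read-modify-write pattern: cmpxchg() returns
the value it found, so success means that return equals the old value the
update was computed from (the patch reuses new_count to hold that return,
which is why the loop condition reads oddly). A runnable userspace rendition
using GCC's __sync builtin:

    #include <stdio.h>

    /* Consume one count unless it is 0 (off) or -1 (unlimited). */
    static int count_try_dec(long *count)
    {
            long old, seen;

            do {
                    old = *count;
                    if (old == 0)
                            return 0;   /* exhausted: do nothing */
                    if (old == -1)
                            return 1;   /* unlimited: act, no decrement */
                    /* returns the value found; equality = we won the race */
                    seen = __sync_val_compare_and_swap(count, old, old - 1);
            } while (seen != old);
            return 1;
    }

    int main(void)
    {
            long count = 2;

            while (count_try_dec(&count))
                    printf("fired, remaining=%ld\n", count);
            return 0;
    }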
+
+static int update_count(void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return 0;
+
+ if (*count != -1)
+ (*count)--;
+
+ return 1;
}
static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..a51e79688455 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
*/
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -15,6 +14,33 @@
#include "trace.h"
#include "trace_output.h"
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracing
+ *
+ * In case of an error in function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
/* When set, irq functions will be ignored */
static int ftrace_graph_skip_irqs;
@@ -80,7 +106,7 @@ enum {
FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
};
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags);
@@ -92,6 +118,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
unsigned long long calltime;
int index;
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
if (!current->ret_stack)
return -EBUSY;
@@ -121,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
* The curr_ret_stack is initialized to -1 and get increased
* in this function. So it can be less than -1 only if it was
* filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in debugfs by user.
+ * set from set_graph_notrace file in tracefs by user.
*/
if (current->curr_ret_stack < -1)
return -EBUSY;
@@ -323,7 +352,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
{
if (tracing_thresh)
return 1;
@@ -412,7 +441,7 @@ void set_graph_array(struct trace_array *tr)
smp_mb();
}
-void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
@@ -445,35 +474,32 @@ static void graph_trace_reset(struct trace_array *tr)
unregister_ftrace_graph();
}
+static int graph_trace_update_thresh(struct trace_array *tr)
+{
+ graph_trace_reset(tr);
+ return graph_trace_init(tr);
+}
+
static int max_bytes_for_cpu;
-static enum print_line_t
-print_graph_cpu(struct trace_seq *s, int cpu)
+static void print_graph_cpu(struct trace_seq *s, int cpu)
{
- int ret;
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
}
#define TRACE_GRAPH_PROCINFO_LENGTH 14
-static enum print_line_t
-print_graph_proc(struct trace_seq *s, pid_t pid)
+static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
- int ret;
int len;
int i;
@@ -488,56 +514,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
/* First spaces to align center */
- for (i = 0; i < spaces / 2; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < spaces / 2; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s-%s", comm, pid_str);
/* Last spaces to align center */
- for (i = 0; i < spaces - (spaces / 2); i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = 0; i < spaces - (spaces / 2); i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
-print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- if (!trace_seq_putc(s, ' '))
- return 0;
-
- return trace_print_lat_fmt(s, entry);
+ trace_seq_putc(s, ' ');
+ trace_print_lat_fmt(s, entry);
}
/* If the pid changed since the last trace, output this event */
-static enum print_line_t
+static void
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
- int ret;
if (!data)
- return TRACE_TYPE_HANDLED;
+ return;
last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
if (*last_pid == pid)
- return TRACE_TYPE_HANDLED;
+ return;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
- return TRACE_TYPE_HANDLED;
+ return;
/*
* Context-switch trace line:
@@ -546,33 +559,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
------------------------------------------
*/
- ret = trace_seq_puts(s,
- " ------------------------------------------\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, prev_pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " => ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s,
- "\n ------------------------------------------\n\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " ------------------------------------------\n");
+ print_graph_cpu(s, cpu);
+ print_graph_proc(s, prev_pid);
+ trace_seq_puts(s, " => ");
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, "\n ------------------------------------------\n\n");
}
static struct ftrace_graph_ret_entry *
@@ -646,175 +638,122 @@ get_return_for_leaf(struct trace_iterator *iter,
return next;
}
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
+static void print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
- return trace_seq_printf(s, "%5lu.%06lu | ",
- (unsigned long)t, usecs_rem);
+ trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
}
-static enum print_line_t
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
- int ret;
struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
- return TRACE_TYPE_UNHANDLED;
+ return;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, " | ");
}
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
}
/* No overhead */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
- ret = trace_seq_puts(s, "==========>");
+ trace_seq_puts(s, "==========>");
else
- ret = trace_seq_puts(s, "<==========");
+ trace_seq_puts(s, "<==========");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_putc(s, '\n');
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ trace_seq_putc(s, '\n');
}
-enum print_line_t
+void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
- char msecs_str[21];
+ char usecs_str[21];
char nsecs_str[5];
- int ret, len;
+ int len;
int i;
- sprintf(msecs_str, "%lu", (unsigned long) duration);
+ sprintf(usecs_str, "%lu", (unsigned long) duration);
 /* Print usecs */
- ret = trace_seq_printf(s, "%s", msecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s", usecs_str);
- len = strlen(msecs_str);
+ len = strlen(usecs_str);
 /* Print nsecs (we don't want to exceed 7 digits) */
if (len < 7) {
size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
- ret = trace_seq_printf(s, ".%s", nsecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, ".%s", nsecs_str);
len += strlen(nsecs_str);
}
- ret = trace_seq_puts(s, " us ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = len; i < 7; i++)
+ trace_seq_putc(s, ' ');
}
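A worked example of the arithmetic above, since the msecs_str -> usecs_str rename is the point of this hunk: do_div(duration, 1000) leaves the integer microseconds in duration and the nanosecond remainder in nsecs_rem. A minimal standalone sketch:

	/* duration enters in nanoseconds */
	unsigned long long duration = 12345;
	unsigned long nsecs_rem = do_div(duration, 1000);
	/* duration == 12 (microseconds), nsecs_rem == 345 */
	/* the function then emits "12.345 us " padded to the column width */

So the string holds microseconds, not milliseconds; the old variable name was simply wrong.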
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags)
{
- int ret = -1;
-
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return TRACE_TYPE_HANDLED;
+ return;
 /* No real data, just filling the column with spaces */
switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
case FLAGS_FILL_FULL:
- ret = trace_seq_puts(s, " | ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " | ");
+ return;
case FLAGS_FILL_START:
- ret = trace_seq_puts(s, " ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " ");
+ return;
case FLAGS_FILL_END:
- ret = trace_seq_puts(s, " |");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " |");
+ return;
}
 /* Signal an overhead of time execution to the output */
- if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- ret = trace_seq_puts(s, "! ");
- /* Duration exceeded 10 msecs */
- else if (duration > 10000ULL)
- ret = trace_seq_puts(s, "+ ");
- }
-
- /*
- * The -1 means we either did not exceed the duration tresholds
- * or we dont want to print out the overhead. Either way we need
- * to fill out the space.
- */
- if (ret == -1)
- ret = trace_seq_puts(s, " ");
-
- /* Catching here any failure happenned above */
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_print_graph_duration(duration, s);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_puts(s, "| ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
+ trace_seq_printf(s, "%c ", trace_find_mark(duration));
+ else
+ trace_seq_puts(s, " ");
- return TRACE_TYPE_HANDLED;
+ trace_print_graph_duration(duration, s);
+ trace_seq_puts(s, "| ");
}
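The hard-coded '!' (over 100 us) and '+' (over 10 us) overhead annotations are replaced by trace_find_mark(), added to trace_output.c later in this patch, which also knows '#' and '$' for the millisecond and second ranges. Illustrative (not byte-exact) duration-column output under the new scheme:

	+ 12.345 us    |	/* over 10 us  */
	! 123.456 us   |	/* over 100 us */
	# 1234.567 us  |	/* over 1 ms   */

See the worked trace_find_mark() examples further down, after its definition.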
/* Case of a leaf function on its call entry */
@@ -828,7 +767,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- int ret;
int i;
graph_ret = &ret_entry->ret;
@@ -854,22 +792,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -879,7 +810,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
- int ret;
int i;
if (data) {
@@ -895,19 +825,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- if (!ret)
+ trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+
+ if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
/*
@@ -917,62 +843,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_NO_CONSUME;
}
-static enum print_line_t
+static void
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
- int ret;
/* Pid */
- if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ verif_pid(s, ent->pid, cpu, data);
- if (type) {
+ if (type)
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return 0;
+ return;
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, ent->pid);
+ trace_seq_puts(s, " | ");
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
- ret = print_graph_lat_fmt(s, ent);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
- return 0;
+ return;
}
/*
@@ -1090,8 +997,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
@@ -1124,7 +1030,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
- int ret;
int i;
if (check_irq_return(iter, flags, trace->depth))
@@ -1150,20 +1055,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
}
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Closing brace */
- for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/*
* If the return function does not have a matching entry,
@@ -1172,30 +1071,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* belongs to, write out the function name. Always do
* that if the funcgraph-tail option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
- ret = trace_seq_puts(s, "}\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- } else {
- ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
/* Overrun */
- if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
- ret = trace_seq_printf(s, " (Overruns: %lu)\n",
- trace->overrun);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN)
+ trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
- cpu, pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1212,26 +1101,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (data)
depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
- for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/* The comment */
- ret = trace_seq_puts(s, "/* ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
case TRACE_BPRINT:
@@ -1254,17 +1135,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return ret;
}
+ if (trace_seq_has_overflowed(s))
+ goto out;
+
/* Strip ending newline */
- if (s->buffer[s->len - 1] == '\n') {
- s->buffer[s->len - 1] = '\0';
- s->len--;
+ if (s->buffer[s->seq.len - 1] == '\n') {
+ s->buffer[s->seq.len - 1] = '\0';
+ s->seq.len--;
}
- ret = trace_seq_puts(s, " */\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " */\n");
+ out:
+ return trace_handle_return(s);
}
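The s->len -> s->seq.len change above reflects the reworked struct trace_seq, which now embeds a seq_buf that owns the length bookkeeping. A sketch of the layout assumed here, per include/linux/trace_seq.h in this series:

	struct trace_seq {
		unsigned char	buffer[PAGE_SIZE];
		struct seq_buf	seq;
		int		full;
	};

Writers update s->seq; s->full is a sticky flag saying the sequence can take no more data.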
@@ -1371,35 +1253,35 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
print_lat_header(s, flags);
/* 1st line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " TIME ");
+ seq_puts(s, " TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " CPU");
+ seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " TASK/PID ");
+ seq_puts(s, " TASK/PID ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " DURATION ");
- seq_printf(s, " FUNCTION CALLS\n");
+ seq_puts(s, " DURATION ");
+ seq_puts(s, " FUNCTION CALLS\n");
/* 2nd line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " | | ");
+ seq_puts(s, " | | ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " | | ");
- seq_printf(s, " | | | |\n");
+ seq_puts(s, " | | ");
+ seq_puts(s, " | | | |\n");
}
-void print_graph_headers(struct seq_file *s)
+static void print_graph_headers(struct seq_file *s)
{
print_graph_headers_flags(s, tracer_flags.val);
}
@@ -1426,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data;
+ gfp_t gfpflags;
int cpu;
iter->private = NULL;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ /* We can be called in atomic context via ftrace_dump() */
+ gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
+
+ data = kzalloc(sizeof(*data), gfpflags);
if (!data)
goto out_err;
- data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
+ data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
if (!data->cpu_data)
goto out_err_free;
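graph_trace_open() can now be reached from ftrace_dump() with preemption disabled or interrupts off, so both allocations must pick their GFP mode at run time. The pattern, as in the hunk above:

	/* Choose a safe allocation mode: ftrace_dump() may call us
	 * from atomic (even oops) context.
	 */
	gfp_t gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC
							  : GFP_KERNEL;

	data = kzalloc(sizeof(*data), gfpflags);
	data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);

alloc_percpu_gfp() is the variant of alloc_percpu() that accepts a gfp mask, which is what makes the percpu allocation legal in atomic context.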
@@ -1495,6 +1381,7 @@ static struct trace_event graph_trace_ret_event = {
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
+ .update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
@@ -1548,12 +1435,12 @@ static const struct file_operations graph_depth_fops = {
.llseek = generic_file_llseek,
};
-static __init int init_graph_debugfs(void)
+static __init int init_graph_tracefs(void)
{
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("max_graph_depth", 0644, d_tracer,
@@ -1561,7 +1448,7 @@ static __init int init_graph_debugfs(void)
return 0;
}
-fs_initcall(init_graph_debugfs);
+fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
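Alongside the debugfs -> tracefs rename comes a calling-convention change: tracing_init_dentry() now returns an ERR_PTR() on failure, while NULL is a valid result meaning the tracefs top-level directory. Callers therefore test IS_ERR() instead of NULL; the converted idiom, as used here and again in trace_kprobe.c and trace_printk.c below:

	struct dentry *d_tracer = tracing_init_dentry();

	if (IS_ERR(d_tracer))	/* NULL is now valid, not an error */
		return 0;

	trace_create_file("max_graph_depth", 0644, d_tracer,
			  NULL, &graph_depth_fops);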
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9bb104f748d0..8523ea345f2b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -10,11 +10,9 @@
* Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
-#include <linux/fs.h>
#include "trace.h"
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
- if (!trace_empty(&iter))
- trace_find_next_entry_inc(&iter);
- while (!trace_empty(&iter)) {
+
+ while (trace_find_next_entry_inc(&iter)) {
if (!cnt)
kdb_printf("---------------------------------\n");
cnt++;
- if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ if (!skip_lines) {
print_trace_line(&iter);
- if (!skip_lines)
trace_printk_seq(&iter.seq);
- else
+ } else {
skip_lines--;
+ }
+
if (KDB_FLAG(CMD_INTERRUPT))
goto out;
}
@@ -86,9 +88,12 @@ out:
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- for_each_tracing_cpu(cpu)
- if (iter.buffer_iter[cpu])
+ for_each_tracing_cpu(cpu) {
+ if (iter.buffer_iter[cpu]) {
ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ iter.buffer_iter[cpu] = NULL;
+ }
+ }
}
/*
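The rewritten dump loop fixes two problems at once: iter.buffer_iter was never set (hence the new static buffer_iter array, avoiding a NULL dereference), and the old skip_lines handling advanced the iterator in the wrong place. The consolidated shape of the loop after this hunk:

	while (trace_find_next_entry_inc(&iter)) {
		if (!skip_lines) {
			print_trace_line(&iter);
			trace_printk_seq(&iter.seq);
		} else {
			skip_lines--;
		}
		if (KDB_FLAG(CMD_INTERRUPT))
			goto out;
	}

Clearing iter.buffer_iter[cpu] after ring_buffer_read_finish() also matters because the iterator is static and would otherwise hold stale pointers on the next 'ftdump' invocation.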
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
static __init int kdb_ftrace_register(void)
{
- kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
- "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
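kdb_register_repeat() and its KDB_REPEAT_* argument are replaced by kdb_register_flags(), which takes a flags word; KDB_ENABLE_ALWAYS_SAFE marks 'ftdump' as callable even when kdb is restricted to safe commands. A hedged registration sketch (the command name and body here are hypothetical):

	static int kdb_hello(int argc, const char **argv)
	{
		kdb_printf("hello from kdb\n");
		return 0;
	}

	static __init int hello_register(void)
	{
		kdb_register_flags("hello", kdb_hello, "",
				   "Print a greeting", 0,
				   KDB_ENABLE_ALWAYS_SAFE);
		return 0;
	}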
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..d0ce590f06e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size)
#define fetch_file_offset_string_size NULL
/* Fetch type information table */
-const struct fetch_type kprobes_fetch_type_table[] = {
+static const struct fetch_type kprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
- is_return, true);
+ is_return, true,
+ kprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -826,7 +827,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
int i;
- seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
ftrace_event_name(&tk->tp.call));
@@ -840,7 +841,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tk->tp.nr_args; i++)
seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1024,27 +1025,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
-
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ goto out;
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1060,33 +1056,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, " <- "))
- goto partial;
+ trace_seq_puts(s, " <- ");
if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
@@ -1144,11 +1135,15 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1158,7 +1153,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
@@ -1175,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1189,7 +1188,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
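Two things happen in these perf handler hunks. First, an attached BPF program (call->prog, meaningful now that the event is flagged TRACE_EVENT_FL_KPROBE below) gets to veto the event before any buffer work:

	struct bpf_prog *prog = call->prog;

	if (prog && !trace_call_bpf(prog, regs))
		return;	/* program returned 0: drop this event */

Second, perf_trace_buf_prepare() is passed NULL where regs used to go; per the perf-side change in this series, that argument becomes an optional request for scratch registers (struct pt_regs **), and NULL says the caller already has real regs and needs none allocated.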
@@ -1296,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
@@ -1320,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
-/* Make a debugfs interface for controlling probe points */
+/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
@@ -1330,23 +1329,23 @@ static __init int init_kprobe_trace(void)
return -EINVAL;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
NULL, &kprobe_events_ops);
/* Event list interface */
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_events' entry\n");
/* Profile interface */
- entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
NULL, &kprobe_profile_ops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_profile' entry\n");
return 0;
}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
mmio_reset_data(tr);
}
-static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
{
- int ret = 0;
int i;
resource_size_t start, end;
const struct pci_driver *drv = pci_dev_driver(dev);
- /* XXX: incomplete checks for trace_seq_printf() return value */
- ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
- dev->bus->number, dev->devfn,
- dev->vendor, dev->device, dev->irq);
+ trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
/*
* XXX: is pci_resource_to_user() appropriate, since we are
* supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
*/
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
}
if (drv)
- ret += trace_seq_printf(s, " %s\n", drv->name);
+ trace_seq_printf(s, " %s\n", drv->name);
else
- ret += trace_seq_puts(s, " \n");
- return ret;
+ trace_seq_puts(s, " \n");
}
static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret = 1;
trace_assign_type(field, entry);
rw = &field->rw;
switch (rw->opcode) {
case MMIO_READ:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
(rw->value >> 0) & 0xff, rw->pc, 0);
break;
default:
- ret = trace_seq_puts(s, "rw what?\n");
+ trace_seq_puts(s, "rw what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
trace_assign_type(field, entry);
m = &field->map;
switch (m->opcode) {
case MMIO_PROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
- ret = trace_seq_puts(s, "map what?\n");
+ trace_seq_puts(s, "map what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index fcf0a9e48916..8bb2071474dd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -6,8 +6,6 @@
*/
#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include "trace.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..692bf7184c8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,37 +20,17 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
-int trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
- int ret;
-
- ret = seq_write(m, s->buffer, len);
-
- /*
- * Only reset this buffer if we successfully wrote to the
- * seq_file buffer.
- */
- if (!ret)
- trace_seq_init(s);
-
- return ret;
-}
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bputs_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -58,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bprint_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_bprintf(s, field->fmt, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -74,266 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct print_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_printf);
-
-/**
- * trace_seq_bitmask - put a list of longs as a bitmask print output
- * @s: trace sequence descriptor
- * @maskp: points to an array of unsigned longs that represent a bitmask
- * @nmaskbits: The number of bits that are valid in @maskp
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * Writes a ASCII representation of a bitmask string into @s.
- */
-int
-trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
- int nmaskbits)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_bitmask);
-
-/**
- * trace_seq_vprintf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+ trace_seq_puts(s, field->buf);
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_seq_vprintf);
-
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-int trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-EXPORT_SYMBOL(trace_seq_putc);
-
-int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
-{
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- int i, j;
-
- if (s->full)
- return 0;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-void *trace_seq_reserve(struct trace_seq *s, size_t len)
-{
- void *ret;
-
- if (s->full)
- return NULL;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return NULL;
- }
-
- ret = s->buffer + s->len;
- s->len += len;
-
- return ret;
-}
-
-int trace_seq_path(struct trace_seq *s, const struct path *path)
-{
- unsigned char *p;
-
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
-
- s->full = 1;
- return 0;
+ return trace_handle_return(s);
}
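The trace_seq_*() bodies deleted above did not vanish; they moved to kernel/trace/trace_seq.c, reimplemented on top of seq_buf, and no longer return pass/fail. The one-shot overflow test that replaces all of the return-value plumbing, per include/linux/trace_seq.h in this series:

	static inline bool trace_seq_has_overflowed(struct trace_seq *s)
	{
		return s->full || seq_buf_has_overflowed(&s->seq);
	}

A printer can now issue a dozen writes back to back and ask once, at the end, whether any of them failed.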
const char *
@@ -343,7 +66,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
{
unsigned long mask;
const char *str;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +102,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
const struct trace_print_flags *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -390,9 +113,9 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
-
+
trace_seq_putc(p, 0);
return ret;
@@ -405,7 +128,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -416,7 +139,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%llx", val);
trace_seq_putc(p, 0);
@@ -430,7 +153,7 @@ const char *
ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
unsigned int bitmask_size)
{
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
trace_seq_putc(p, 0);
@@ -443,7 +166,7 @@ const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
@@ -454,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
}
EXPORT_SYMBOL(ftrace_print_hex_seq);
+const char *
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
+ size_t el_size)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ const char *prefix = "";
+ void *ptr = (void *)buf;
+
+ trace_seq_putc(p, '{');
+
+ while (ptr < buf + buf_len) {
+ switch (el_size) {
+ case 1:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u8 *)ptr);
+ break;
+ case 2:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u16 *)ptr);
+ break;
+ case 4:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u32 *)ptr);
+ break;
+ case 8:
+ trace_seq_printf(p, "%s0x%llx", prefix,
+ *(u64 *)ptr);
+ break;
+ default:
+ trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
+ *(u8 *)ptr);
+ el_size = 1;
+ }
+ prefix = ",";
+ ptr += el_size;
+ }
+
+ trace_seq_putc(p, '}');
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_array_seq);
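ftrace_print_array_seq() gives event format strings a generic fixed-width array printer. Note that buf_len here is in bytes, not an element count: the loop walks ptr < buf + buf_len in el_size steps. A hedged usage sketch:

	/* Hypothetical: render four u16 samples from an event's payload. */
	u16 samples[4] = { 0x10, 0x20, 0x30, 0x40 };
	const char *str;

	str = ftrace_print_array_seq(p, samples, sizeof(samples), sizeof(u16));
	/* str -> "{0x10,0x20,0x30,0x40}" inside p's buffer */

As with the other ftrace_print_*_seq() helpers, the returned pointer aims into the trace_seq buffer and is terminated by the trailing trace_seq_putc(p, 0).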
+
int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_event *trace_event)
{
@@ -461,7 +228,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- int ret;
event = container_of(trace_event, struct ftrace_event_call, event);
entry = iter->ent;
@@ -472,11 +238,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- return 0;
+ return trace_handle_return(s);
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
@@ -484,18 +248,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
- int ret;
- ret = trace_seq_printf(s, "%s: ", name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", name);
+ trace_seq_vprintf(s, fmt, ap);
- ret = trace_seq_vprintf(s, fmt, ap);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -528,7 +285,7 @@ static inline const char *kretprobed(const char *name)
}
#endif /* CONFIG_KRETPROBES */
-static int
+static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
@@ -539,12 +296,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
-static int
+static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
@@ -555,9 +311,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
sprint_symbol(str, address);
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
#ifndef CONFIG_64BIT
@@ -588,14 +343,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (file) {
ret = trace_seq_path(s, &file->f_path);
if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]",
- ip - vmstart);
+ trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
}
up_read(&mm->mmap_sem);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return !trace_seq_has_overflowed(s);
}
int
@@ -603,7 +358,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
unsigned long sym_flags)
{
struct mm_struct *mm = NULL;
- int ret = 1;
unsigned int i;
if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -622,48 +376,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
unsigned long ip = entry->caller[i];
- if (ip == ULONG_MAX || !ret)
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
break;
- if (ret)
- ret = trace_seq_puts(s, " => ");
+
+ trace_seq_puts(s, " => ");
+
if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- if (ret)
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
continue;
}
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- ret = trace_seq_putc(s, '\n');
+
+ seq_print_user_ip(s, mm, ip, sym_flags);
+ trace_seq_putc(s, '\n');
}
if (mm)
mmput(mm);
- return ret;
+
+ return !trace_seq_has_overflowed(s);
}
int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
- int ret;
-
- if (!ip)
- return trace_seq_putc(s, '0');
+ if (!ip) {
+ trace_seq_putc(s, '0');
+ goto out;
+ }
if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
+ seq_print_sym_offset(s, "%s", ip);
else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
+ seq_print_sym_short(s, "%s", ip);
if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+
+ out:
+ return !trace_seq_has_overflowed(s);
}
/**
@@ -681,7 +432,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
- int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -713,16 +463,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.';
- if (!trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq))
- return 0;
+ trace_seq_printf(s, "%c%c%c",
+ irqs_off, need_resched, hardsoft_irq);
if (entry->preempt_count)
- ret = trace_seq_printf(s, "%x", entry->preempt_count);
+ trace_seq_printf(s, "%x", entry->preempt_count);
else
- ret = trace_seq_putc(s, '.');
+ trace_seq_putc(s, '.');
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static int
@@ -732,14 +481,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
- comm, entry->pid, cpu))
- return 0;
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
}
-static unsigned long preempt_mark_thresh_us = 100;
+#undef MARK
+#define MARK(v, s) {.val = v, .sym = s}
+/* trace overhead mark */
+static const struct trace_mark {
+ unsigned long long val; /* unit: nsec */
+ char sym;
+} mark[] = {
+ MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(1000000ULL , '#'), /* 1000 usecs */
+ MARK(100000ULL , '!'), /* 100 usecs */
+ MARK(10000ULL , '+'), /* 10 usecs */
+};
+#undef MARK
+
+char trace_find_mark(unsigned long long d)
+{
+ int i;
+ int size = ARRAY_SIZE(mark);
+
+ for (i = 0; i < size; i++) {
+ if (d >= mark[i].val)
+ break;
+ }
+
+ return (i == size) ? ' ' : mark[i].sym;
+}
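The table is ordered largest-first, so the first threshold less than or equal to d wins, and anything under 10 us falls through to a blank. Worked examples straight from the table:

	trace_find_mark(2000000000ULL);	/* '$'  >= 1 sec    */
	trace_find_mark(1500000ULL);	/* '#'  >= 1000 us  */
	trace_find_mark(250000ULL);	/* '!'  >= 100 us   */
	trace_find_mark(50000ULL);	/* '+'  >= 10 us    */
	trace_find_mark(5000ULL);	/* ' '  below all   */

This is the same helper the function-graph duration column now calls in place of its private 10/100 us thresholds, and lat_print_timestamp() feeds it rel_ts * NSEC_PER_USEC so the units stay in nanoseconds.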
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -761,24 +534,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
unsigned long rel_msec = (unsigned long)rel_ts;
- return trace_seq_printf(
- s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
- ns2usecs(iter->ts),
- abs_msec, abs_usec,
- rel_msec, rel_usec);
+ trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+
} else if (verbose && !in_ns) {
- return trace_seq_printf(
- s, "[%016llx] %lld (+%lld): ",
- iter->ts, abs_ts, rel_ts);
+ trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+
} else if (!verbose && in_ns) {
- return trace_seq_printf(
- s, " %4lldus%c: ",
- abs_ts,
- rel_ts > preempt_mark_thresh_us ? '!' :
- rel_ts > 1 ? '+' : ' ');
+ trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ trace_find_mark(rel_ts * NSEC_PER_USEC));
+
} else { /* !verbose && !in_ns */
- return trace_seq_printf(s, " %4lld: ", abs_ts);
+ trace_seq_printf(s, " %4lld: ", abs_ts);
}
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_context(struct trace_iterator *iter)
@@ -788,34 +565,29 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
- int ret;
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (!ret)
- return 0;
- if (trace_flags & TRACE_ITER_IRQ_INFO) {
- ret = trace_print_lat_fmt(s, entry);
- if (!ret)
- return 0;
- }
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
t = ns2usecs(iter->ts);
usec_rem = do_div(t, USEC_PER_SEC);
secs = (unsigned long)t;
- return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
} else
- return trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_seq_printf(s, " %12llu: ", iter->ts);
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
- int ret;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
@@ -835,18 +607,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
- comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
} else {
- ret = lat_print_generic(s, entry, iter->cpu);
+ lat_print_generic(s, entry, iter->cpu);
}
- if (ret)
- ret = lat_print_timestamp(iter, next_ts);
+ lat_print_timestamp(iter, next_ts);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -960,7 +731,7 @@ int register_ftrace_event(struct trace_event *event)
goto out;
} else {
-
+
event->type = next_event_type++;
list = &ftrace_event_list;
}
@@ -1032,10 +803,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
/* TRACE_FN */
@@ -1047,24 +817,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- if (!trace_seq_puts(s, " <-"))
- goto partial;
- if (!seq_print_ip_sym(s,
- field->parent_ip,
- flags))
- goto partial;
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, field->parent_ip, flags);
}
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -1074,12 +836,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
- field->ip,
- field->parent_ip))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -1090,10 +851,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_HEX_FIELD(s, field->ip);
+ SEQ_PUT_HEX_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -1104,10 +865,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_FIELD(s, field->ip);
+ SEQ_PUT_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_fn_funcs = {
@@ -1136,18 +897,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
T = task_state_char(field->next_state);
S = task_state_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
- if (!trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, delim,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -1172,17 +932,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
if (!S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -1210,15 +969,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
+ SEQ_PUT_HEX_FIELD(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD(s, S);
+ SEQ_PUT_HEX_FIELD(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD(s, T);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -1241,14 +1000,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
+ SEQ_PUT_FIELD(s, field->prev_pid);
+ SEQ_PUT_FIELD(s, field->prev_prio);
+ SEQ_PUT_FIELD(s, field->prev_state);
+ SEQ_PUT_FIELD(s, field->next_cpu);
+ SEQ_PUT_FIELD(s, field->next_pid);
+ SEQ_PUT_FIELD(s, field->next_prio);
+ SEQ_PUT_FIELD(s, field->next_state);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_ctx_funcs = {
@@ -1288,23 +1048,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
- if (!trace_seq_puts(s, "<stack trace>\n"))
- goto partial;
+ trace_seq_puts(s, "<stack trace>\n");
for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
- if (!trace_seq_puts(s, " => "))
- goto partial;
- if (!seq_print_ip_sym(s, *p, flags))
- goto partial;
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- }
+ if (trace_seq_has_overflowed(s))
+ break;
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " => ");
+ seq_print_ip_sym(s, *p, flags);
+ trace_seq_putc(s, '\n');
+ }
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_stack_funcs = {
@@ -1325,16 +1081,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!trace_seq_puts(s, "<user stack trace>\n"))
- goto partial;
-
- if (!seq_print_userip_objs(field, s, flags))
- goto partial;
+ trace_seq_puts(s, "<user stack trace>\n");
+ seq_print_userip_objs(field, s, flags);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_user_stack_funcs = {
@@ -1357,19 +1107,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_puts(s, field->str);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1382,16 +1124,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_puts(s, field->str);
- if (!trace_seq_puts(s, field->str))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bputs_funcs = {
@@ -1415,19 +1151,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_puts(s, ": "))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_bprintf(s, field->fmt, field->buf);
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1440,16 +1168,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_bprintf(s, field->fmt, field->buf);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bprint_funcs = {
@@ -1471,16 +1193,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_printf(s, ": %s", field->buf);
- if (!trace_seq_printf(s, ": %s", field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1490,13 +1206,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(&iter->seq);
}
static struct trace_event_functions trace_print_funcs = {
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,21 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
+#define SEQ_PUT_FIELD(s, x) \
+ trace_seq_putmem(s, &(x), sizeof(x))
+
+#define SEQ_PUT_HEX_FIELD(s, x) \
+ trace_seq_putmem_hex(s, &(x), sizeof(x))
#endif
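
With the _RET variants gone, a binary-format callback can chain the macros and defer the overflow check; a minimal sketch, where struct my_entry and its fields are hypothetical:

	/* Hedged sketch: SEQ_PUT_FIELD() no longer returns early, so the
	 * caller funnels everything into one trace_handle_return() check.
	 */
	static enum print_line_t my_event_binary(struct trace_iterator *iter,
						 int flags,
						 struct trace_event *event)
	{
		struct my_entry *field;		/* hypothetical entry type */
		struct trace_seq *s = &iter->seq;

		trace_assign_type(field, iter->ent);

		SEQ_PUT_FIELD(s, field->pid);
		SEQ_PUT_FIELD(s, field->prio);

		return trace_handle_return(s);
	}
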
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..36c1455b7567 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -5,7 +5,6 @@
*
*/
#include <linux/seq_file.h>
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
@@ -15,7 +14,6 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/fs.h>
#include "trace.h"
@@ -305,7 +303,7 @@ static int t_show(struct seq_file *m, void *v)
seq_puts(m, "\\t");
break;
case '\\':
- seq_puts(m, "\\");
+ seq_putc(m, '\\');
break;
case '"':
seq_puts(m, "\\\"");
@@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..1769a81da8a7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
- return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ return !trace_seq_has_overflowed(s); \
} \
const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
int len = *(u32 *)data >> 16;
if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
+ trace_seq_printf(s, " %s=(fault)", name);
else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
+ trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+ return !trace_seq_has_overflowed(s);
}
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
@@ -354,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
/* Recursive argument parser */
static int parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, bool is_return, bool is_kprobe)
+ struct fetch_param *f, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
unsigned long param;
long offset;
char *tmp;
int ret = 0;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
switch (arg[0]) {
case '$':
ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
@@ -445,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
dprm->fetch_size = get_fetch_size_function(t,
dprm->fetch, ftbl);
ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
- is_kprobe);
+ is_kprobe, ftbl);
if (ret)
kfree(dprm);
else {
@@ -503,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf,
/* String length checking wrapper */
int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe)
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
const char *t;
int ret;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
return -ENOSPC;
@@ -533,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
}
parg->offset = *size;
*size += parg->type->size;
- ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
+ is_kprobe, ftbl);
if (ret >= 0 && t != NULL)
ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fbce16d..ab283e146b70 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define FETCH_TYPE_STRING 0
#define FETCH_TYPE_STRSIZE 1
-/*
- * Fetch type information table.
- * It's declared as a weak symbol due to conditional compilation.
- */
-extern __weak const struct fetch_type kprobes_fetch_type_table[];
-extern __weak const struct fetch_type uprobes_fetch_type_table[];
-
#ifdef CONFIG_KPROBE_EVENT
struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
}
extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe);
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl);
extern int traceprobe_conflict_field_name(const char *name,
struct probe_arg *args, int narg);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..419ca37e72c9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -5,8 +5,6 @@
*
*/
#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
@@ -14,122 +12,26 @@
#include "trace.h"
-static struct trace_array *ctx_trace;
-static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
-static int sched_stopped;
-
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
static void
probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
-
- local_irq_restore(flags);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu, pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(current);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_wakeup_trace(ctx_trace, wakee, current,
- flags, pc);
-
- local_irq_restore(flags);
}
static int tracing_sched_register(void)
@@ -197,51 +99,3 @@ void tracing_stop_cmdline_record(void)
{
tracing_stop_sched_switch();
}
-
-/**
- * tracing_start_sched_switch_record - start tracing context switches
- *
- * Turns on context switch tracing for a tracer.
- */
-void tracing_start_sched_switch_record(void)
-{
- if (unlikely(!ctx_trace)) {
- WARN_ON(1);
- return;
- }
-
- tracing_start_sched_switch();
-
- mutex_lock(&sched_register_mutex);
- tracer_enabled++;
- mutex_unlock(&sched_register_mutex);
-}
-
-/**
- * tracing_stop_sched_switch_record - start tracing context switches
- *
- * Turns off context switch tracing for a tracer.
- */
-void tracing_stop_sched_switch_record(void)
-{
- mutex_lock(&sched_register_mutex);
- tracer_enabled--;
- WARN_ON(tracer_enabled < 0);
- mutex_unlock(&sched_register_mutex);
-
- tracing_stop_sched_switch();
-}
-
-/**
- * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
- * @tr: trace array pointer to assign
- *
- * Some tracers might want to record the context switches in their
- * trace. This function lets those tracers assign the trace array
- * to use.
- */
-void tracing_sched_switch_assign_trace(struct trace_array *tr)
-{
- ctx_trace = tr;
-}
-
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..d6e1003724e9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -10,8 +10,6 @@
* Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
@@ -365,6 +363,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
wakeup_current_cpu = cpu;
}
+static void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
static void notrace
probe_wakeup_sched_switch(void *ignore,
struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
tracing_start();
/* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
trace->reset(tr);
tracing_start();
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#endif
#ifdef CONFIG_SCHED_TRACER
+
+struct wakeup_test_data {
+ struct completion is_ready;
+ int go;
+};
+
static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
};
- struct completion *x = data;
+ struct wakeup_test_data *x = data;
sched_setattr(current, &attr);
/* Make it know we have a new prio */
- complete(x);
+ complete(&x->is_ready);
/* now go to sleep and let the test wake us up */
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!x->go) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
- complete(x);
+ complete(&x->is_ready);
+
+ set_current_state(TASK_INTERRUPTIBLE);
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
- /*
- * This will likely be the system top priority
- * task, do short sleeps to let others run.
- */
- msleep(100);
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
+
return 0;
}
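
The loop above exists because a single schedule() can return on any stray wakeup; re-checking the condition on each pass makes the handshake robust. A minimal sketch of the pattern, with 'cond' standing in for the selftest's x->go flag:

	/* Hedged sketch of the sleep/wake handshake used by the selftest.
	 * The waker sets the flag before wake_up_process(), which supplies
	 * the memory barrier that makes the store visible to the waiter.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!READ_ONCE(cond)) {	/* 'cond' is illustrative */
		schedule();		/* may return on an unrelated wakeup */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
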
-
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tr->max_latency;
struct task_struct *p;
- struct completion is_ready;
+ struct wakeup_test_data data;
unsigned long count;
int ret;
- init_completion(&is_ready);
+ memset(&data, 0, sizeof(data));
+
+ init_completion(&data.is_ready);
/* create a -deadline thread */
- p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
+ p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}
/* make sure the thread is running at -deadline policy */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* start the tracing */
ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
msleep(100);
}
- init_completion(&is_ready);
+ init_completion(&data.is_ready);
+
+ data.go = 1;
+	/* the needed memory barrier is in wake_up_process() */
wake_up_process(p);
/* Wait for the task to wake up */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(&tr->trace_buffer, NULL);
- printk("ret = %d\n", ret);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..e694c9f9efa4
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,377 @@
+/*
+ * trace_seq.c
+ *
+ * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The trace_seq is a handy tool that allows you to pass around a descriptor
+ * for a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the trace_seq must be initialized with trace_seq_init().
+ * This will set up the counters within the descriptor. You can call
+ * trace_seq_init() more than once to reset the trace_seq to start
+ * from scratch.
+ *
+ * The buffer size is currently PAGE_SIZE, although it may become dynamic
+ * in the future.
+ *
+ * A write to the buffer will either succeed or fail. That is, unlike
+ * sprintf() there will not be a partial write (well, it may write into
+ * the buffer but it won't update the pointers). This allows users to
+ * try to write something into the trace_seq buffer and, if it fails,
+ * flush it and try again.
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/trace_seq.h>
+
+/* How much buffer is left on the trace_seq? */
+#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
+
+/* How much of the buffer has been written? */
+#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
+
+/*
+ * A trace_seq should also work if it is simply initialized to zeros.
+ */
+static inline void __trace_seq_init(struct trace_seq *s)
+{
+ if (unlikely(!s->seq.size))
+ trace_seq_init(s);
+}
+
+/**
+ * trace_print_seq - move the contents of trace_seq into a seq_file
+ * @m: the seq_file descriptor that is the destination
+ * @s: the trace_seq descriptor that is the source.
+ *
+ * Returns 0 on success and non-zero on error. If it successfully
+ * writes to the seq_file it will reset the trace_seq, otherwise
+ * it does not modify the trace_seq to let the caller try again.
+ */
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ int ret;
+
+ __trace_seq_init(s);
+
+ ret = seq_buf_print_seq(m, &s->seq);
+
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer. This lets the caller try again or
+ * do something else with the contents.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf() is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ unsigned int save_len = s->seq.len;
+ va_list ap;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ va_start(ap, fmt);
+ seq_buf_vprintf(&s->seq, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_bitmask - write a bitmask array in its ASCII representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes an ASCII representation of a bitmask into @s.
+ */
+void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+/**
+ * trace_seq_bprintf - Write the printf string from binary arguments
+ * @s: trace sequence descriptor
+ * @fmt: The format string for the @binary arguments
+ * @binary: The binary arguments for @fmt.
+ *
+ * When recording in a fast path, a printf may be recorded by just
+ * saving the format and the arguments as they were passed to the
+ * function, instead of wasting cycles converting the arguments into
+ * ASCII characters. The arguments are saved in a 32 bit
+ * word array that is defined by the format string constraints.
+ *
+ * This function will take the format and the binary array and finish
+ * the conversion into the ASCII string within the buffer.
+ */
+void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_bprintf(&s->seq, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bprintf);
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+void trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ unsigned int len = strlen(str);
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putmem(&s->seq, str, len);
+}
+EXPORT_SYMBOL_GPL(trace_seq_puts);
+
+/**
+ * trace_seq_putc - trace sequence printing of simple character
+ * @s: trace sequence descriptor
+ * @c: simple character to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple character
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+void trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putc(&s->seq, c);
+}
+EXPORT_SYMBOL_GPL(trace_seq_putc);
+
+/**
+ * trace_seq_putmem - write raw data into the trace_seq buffer
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ */
+void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+{
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putmem(&s->seq, mem, len);
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem);
+
+/**
+ * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to write its hex ASCII representation of
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * This is similar to trace_seq_putmem() except instead of just copying the
+ * raw memory into the buffer, it writes the ASCII representation of it
+ * in hex characters.
+ */
+void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+ unsigned int len)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ /* Each byte is represented by two chars */
+ if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ /* The added spaces can still cause an overflow */
+ seq_buf_putmem_hex(&s->seq, mem, len);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
+
+/**
+ * trace_seq_path - copy a path into the sequence buffer
+ * @s: trace sequence descriptor
+ * @path: path to write into the sequence buffer.
+ *
+ * Write a path name into the sequence buffer.
+ *
+ * Returns 1 if we successfully wrote all of the contents to
+ * the buffer.
+ * Returns 0 if the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_path(struct trace_seq *s, const struct path *path)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return 0;
+
+ __trace_seq_init(s);
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ seq_buf_path(&s->seq, path, "\n");
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_path);
+
+/**
+ * trace_seq_to_user - copy the sequence buffer to user space
+ * @s: trace sequence descriptor
+ * @ubuf: The userspace memory location to copy to
+ * @cnt: The amount to copy
+ *
+ * Copies the sequence buffer into the userspace memory pointed to
+ * by @ubuf. It starts from the last read position (@s->readpos)
+ * and writes up to @cnt characters or until it reaches the end of
+ * the content in the buffer (@s->len), whichever comes first.
+ *
+ * On success, it returns the number of bytes it copied.
+ *
+ * On failure it returns -EBUSY if all of the content in the
+ * sequence has already been read, which includes the case where
+ * there is nothing in the sequence (@s->len == @s->readpos).
+ *
+ * Returns -EFAULT if the copy to userspace fails.
+ */
+int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
+{
+ __trace_seq_init(s);
+ return seq_buf_to_user(&s->seq, ubuf, cnt);
+}
+EXPORT_SYMBOL_GPL(trace_seq_to_user);
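
Putting the new file's API together, a caller typically initializes the descriptor, writes through it, and checks overflow once; a hedged usage sketch (only the trace_seq_* names come from the file above):

	/* Hedged usage sketch for the trace_seq API defined above. */
	static void example_fill(struct trace_seq *s)
	{
		trace_seq_init(s);		/* reset length/readpos counters */

		trace_seq_puts(s, "cpu: ");
		trace_seq_printf(s, "%d\n", raw_smp_processor_id());

		/* A failed write sets s->full; flush and retry if needed. */
		if (trace_seq_has_overflowed(s))
			trace_seq_init(s);
	}
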
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..3f34496244e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -7,13 +7,10 @@
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
-#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/magic.h>
#include <asm/setup.h>
@@ -171,8 +168,7 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- if ((current != &init_task &&
- *(end_of_stack(current)) != STACK_END_MAGIC)) {
+ if (task_stack_end_corrupted(current)) {
print_max_stack();
BUG();
}
@@ -331,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p)
local_irq_enable();
}
-static int trace_lookup_stack(struct seq_file *m, long i)
+static void trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
- return seq_printf(m, "%pS\n", (void *)addr);
+ seq_printf(m, "%pS\n", (void *)addr);
}
static void print_disabled(struct seq_file *m)
@@ -464,7 +460,7 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..6cf935316769 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include "trace_stat.h"
#include "trace.h"
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
static void destroy_session(struct stat_session *session)
{
- debugfs_remove(session->file);
+ tracefs_remove(session->file);
__reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
@@ -276,12 +276,12 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
- if (!d_tracing)
+ if (IS_ERR(d_tracing))
return 0;
- stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ stat_dir = tracefs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'trace_stat' entry\n");
return 0;
}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && tracing_stat_init())
return -ENODEV;
- session->file = debugfs_create_file(session->ts->name, 0644,
+ session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
session, &tracing_stat_fops);
if (!session->file)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, ret, syscall;
+ int i, syscall;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
- ret = trace_seq_printf(s, "%s(", entry->name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE) {
- ret = trace_seq_printf(s, "%s ", entry->types[i]);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", entry->types[i]);
+
/* parameter values */
- ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
}
- ret = trace_seq_putc(s, ')');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
+ trace_seq_putc(s, ')');
end:
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
- int ret;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
if (!entry) {
trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ goto out;
}
if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return TRACE_TYPE_UNHANDLED;
}
- ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ out:
+ return trace_handle_return(s);
}
extern char *__bad_type_size(void);
@@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_enter--;
- rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
@@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-static int __init init_ftrace_syscalls(void)
+void __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return -ENOMEM;
+ return;
}
for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
meta->syscall_nr = i;
syscalls_metadata[i] = meta;
}
-
- return 0;
}
-early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
@@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
@@ -586,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size -= sizeof(u32);
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->event.type, regs, &rctx);
+ sys_data->enter_event->event.type, NULL, &rctx);
if (!rec)
return;
@@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
@@ -659,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size -= sizeof(u32);
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->event.type, regs, &rctx);
+ sys_data->exit_event->event.type, NULL, &rctx);
if (!rec)
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3c9b97e6b1f4..d60fe62ec4fa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string)
DEFINE_FETCH_file_offset(string_size)
/* Fetch type information table */
-const struct fetch_type uprobes_fetch_type_table[] = {
+static const struct fetch_type uprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
- tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
return tu;
error:
@@ -536,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
- is_return, false);
+ is_return, false,
+ uprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -553,8 +553,7 @@ error:
return ret;
fail_address_parse:
- if (inode)
- iput(inode);
+ iput(inode);
pr_info("Failed to parse address or file.\n");
@@ -607,7 +606,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -853,16 +852,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[1], entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -870,14 +867,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
struct probe_arg *parg = &tu->tp.args[i];
if (!parg->type->print(s, parg->name, data + parg->offset, entry))
- goto partial;
+ goto out;
}
- if (trace_seq_puts(s, "\n"))
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
@@ -1010,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
return true;
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
- if (event->hw.tp_target->mm == mm)
+ if (event->hw.target->mm == mm)
return true;
}
@@ -1020,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
static inline bool
uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
{
- return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+ return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
}
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1028,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
bool done;
write_lock(&tu->filter.rwlock);
- if (event->hw.tp_target) {
+ if (event->hw.target) {
list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
- (event->hw.tp_target->flags & PF_EXITING) ||
+ (event->hw.target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
} else {
tu->filter.nr_systemwide--;
@@ -1051,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
int err;
write_lock(&tu->filter.rwlock);
- if (event->hw.tp_target) {
+ if (event->hw.target) {
/*
* event->parent != NULL means copy_process(), we can avoid
* uprobe_apply(). current->mm must be probed and we can rely
@@ -1116,7 +1112,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (hlist_empty(head))
goto out;
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
goto out;
@@ -1292,7 +1288,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
@@ -1326,7 +1322,7 @@ static __init int init_uprobe_trace(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1b1327..975cb49e32bf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,
struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
- struct timespec uptime, ts;
cputime_t utime, stime, utimescaled, stimescaled;
- u64 ac_etime;
+ u64 delta;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
- /* calculate task elapsed time in timespec */
- do_posix_clock_monotonic_gettime(&uptime);
- ts = timespec_sub(uptime, tsk->start_time);
- /* rebase elapsed time to usec (should never be negative) */
- ac_etime = timespec_to_ns(&ts);
- do_div(ac_etime, NSEC_PER_USEC);
- stats->ac_etime = ac_etime;
- stats->ac_btime = get_seconds() - ts.tv_sec;
+ /* calculate task elapsed time in nsec */
+ delta = ktime_get_ns() - tsk->start_time;
+	/* Convert to microseconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime = delta;
+ /* Convert to seconds for btime */
+ do_div(delta, USEC_PER_SEC);
+ stats->ac_btime = get_seconds() - delta;
if (thread_group_leader(tsk)) {
stats->ac_exitcode = tsk->exit_code;
if (tsk->flags & PF_FORKNOEXEC)
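
The two do_div() calls above chain: the nanosecond delta is first reduced to microseconds for ac_etime, and that quotient is reduced again to whole seconds to rebase ac_btime. A quick worked example of the arithmetic (userspace sketch, constants inlined):

	/* Worked example of the ns -> usec -> sec chain used above. */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t delta = 3500000000ULL;	/* 3.5 s task lifetime, in ns */

		delta /= 1000;		/* NSEC_PER_USEC -> 3500000 (ac_etime) */
		printf("ac_etime = %llu us\n", (unsigned long long)delta);

		delta /= 1000000;	/* USEC_PER_SEC -> 3 (btime offset) */
		printf("btime offset = %llu s\n", (unsigned long long)delta);
		return 0;
	}
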
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 394f70b17162..9586b670a5b2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
void user_return_notifier_register(struct user_return_notifier *urn)
{
set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
- hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
+ hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
}
EXPORT_SYMBOL_GPL(user_return_notifier_register);
@@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
void user_return_notifier_unregister(struct user_return_notifier *urn)
{
hlist_del(&urn->link);
- if (hlist_empty(&__get_cpu_var(return_notifier_list)))
+ if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
}
EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
.count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .proc_inum = PROC_USER_INIT_INO,
+ .ns.inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_USER_NS
+ .ns.ops = &userns_operations,
+#endif
+ .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
- ret = proc_alloc_inum(&ns->proc_inum);
+ ret = ns_alloc_inum(&ns->ns);
if (ret) {
kmem_cache_free(user_ns_cachep, ns);
return ret;
}
+ ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
set_cred_user_ns(new, ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
@@ -526,21 +533,21 @@ static void m_stop(struct seq_file *seq, void *v)
return;
}
-struct seq_operations proc_uid_seq_operations = {
+const struct seq_operations proc_uid_seq_operations = {
.start = uid_m_start,
.stop = m_stop,
.next = m_next,
.show = uid_m_show,
};
-struct seq_operations proc_gid_seq_operations = {
+const struct seq_operations proc_gid_seq_operations = {
.start = gid_m_start,
.stop = m_stop,
.next = m_next,
.show = gid_m_show,
};
-struct seq_operations proc_projid_seq_operations = {
+const struct seq_operations proc_projid_seq_operations = {
.start = projid_m_start,
.stop = m_stop,
.next = m_next,
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-
-static DEFINE_MUTEX(id_map_mutex);
-
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
/*
- * The id_map_mutex serializes all writes to any given map.
+ * The userns_state_mutex serializes all writes to any given map.
*
* Any map is only ever written once.
*
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* order and smp_rmb() is guaranteed that we don't have crazy
* architectures returning stale data.
*/
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
ret = -EPERM;
/* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!page)
goto out;
- /* Only allow <= page size writes at the beginning of the file */
+ /* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*ppos = count;
ret = count;
out:
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
if (page)
free_page(page);
return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
- /* Allow mapping to your own filesystem ids */
- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ const struct cred *cred = file->f_cred;
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->fsuid))
+ if (uid_eq(uid, cred->euid))
return true;
} else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, file->f_cred->fsgid))
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
return true;
}
}
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
-static void *userns_get(struct task_struct *task)
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+	/* Verify there is no trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
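
From userspace the file accepts exactly "allow" or "deny" (trailing whitespace tolerated), and per the checks above a write of "deny" fails with -EPERM once gid_map has been written. A hedged usage sketch:

	/* Userspace sketch: deny setgroups() in the current namespace,
	 * as done before writing gid_map. Error handling kept minimal.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/self/setgroups", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "deny", 4) != 4)	/* -EPERM once gid_map is set */
			perror("write");
		close(fd);
		return 0;
	}
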
+
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&userns_state_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
+ mutex_unlock(&userns_state_mutex);
+
+ return allowed;
+}
+
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
+static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
user_ns = get_user_ns(__task_cred(task)->user_ns);
rcu_read_unlock();
- return user_ns;
+ return user_ns ? &user_ns->ns : NULL;
}
-static void userns_put(void *ns)
+static void userns_put(struct ns_common *ns)
{
- put_user_ns(ns);
+ put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, void *ns)
+static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
- struct user_namespace *user_ns = ns;
+ struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
/* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
return commit_creds(cred);
}
-static unsigned int userns_inum(void *ns)
-{
- struct user_namespace *user_ns = ns;
- return user_ns->proc_inum;
-}
-
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
- .inum = userns_inum,
};
static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
if (!ns)
return ERR_PTR(-ENOMEM);
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err) {
kfree(ns);
return ERR_PTR(err);
}
+ ns->ns.ops = &utsns_operations;
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
@@ -84,34 +86,39 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kfree(ns);
}
-static void *utsns_get(struct task_struct *task)
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static struct ns_common *utsns_get(struct task_struct *task)
{
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->uts_ns;
get_uts_ns(ns);
}
- rcu_read_unlock();
+ task_unlock(task);
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void utsns_put(void *ns)
+static void utsns_put(struct ns_common *ns)
{
- put_uts_ns(ns);
+ put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, void *new)
+static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
- struct uts_namespace *ns = new;
+ struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
return 0;
}
-static unsigned int utsns_inum(void *vp)
-{
- struct uts_namespace *ns = vp;
-
- return ns->proc_inum;
-}
-
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
- .inum = utsns_inum,
};
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..2316f50b07a4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/lockdep.h>
-#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
@@ -29,8 +24,33 @@
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
-int watchdog_user_enabled = 1;
+/*
+ * The run state of the lockup detectors is controlled by the content of the
+ * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
+ * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
+ *
+ * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
+ * are variables that are only used as an 'interface' between the parameters
+ * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
+ * 'watchdog_thresh' variable is handled differently because its value is not
+ * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
+ * is equal zero.
+ */
+#define NMI_WATCHDOG_ENABLED_BIT 0
+#define SOFT_WATCHDOG_ENABLED_BIT 1
+#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
+#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#else
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+#endif
+int __read_mostly nmi_watchdog_enabled;
+int __read_mostly soft_watchdog_enabled;
+int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;
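
Concretely, the packed run state is queried with simple bit tests; a small sketch of the checks the rest of this file performs against watchdog_enabled (helper names are illustrative):

	/* Hedged sketch: testing the per-detector bits described above. */
	static inline bool nmi_watchdog_active(unsigned long enabled)
	{
		return enabled & NMI_WATCHDOG_ENABLED;	/* bit 0 */
	}

	static inline bool soft_watchdog_active(unsigned long enabled)
	{
		return enabled & SOFT_WATCHDOG_ENABLED;	/* bit 1 */
	}
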
+
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
@@ -47,6 +67,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -62,6 +83,18 @@ static unsigned long soft_lockup_nmi_warn;
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
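
A typical caller would be early hypervisor or platform setup code running
on the boot CPU. The sketch below is hypothetical (this patch introduces
only the function, not a call site):

    /* hypothetical early-boot caller, before command line parsing */
    static void __init guest_setup_example(void)
    {
        if (running_on_hypervisor_example())  /* hypothetical check */
            hardlockup_detector_disable();
    }
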
static int __init hardlockup_panic_setup(char *str)
{
@@ -70,7 +103,9 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_user_enabled = 0;
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -89,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
-/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
-/* */
+
#ifdef CONFIG_SMP
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
@@ -131,7 +165,7 @@ static int get_softlockup_thresh(void)
*/
static unsigned long get_timestamp(void)
{
- return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
+ return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
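
Shifting by 30 keeps the nanoseconds-to-seconds conversion branch-free:
dividing by 2^30 (1073741824) instead of 10^9 makes the derived timestamp
run about 7% slow, which merely lengthens the effective thresholds
slightly. For example:

    unsigned long long ns = 5ULL * 1000000000ULL;  /* 5 s of nanoseconds */
    unsigned long exact  = ns / 1000000000ULL;     /* 5 */
    unsigned long approx = ns >> 30;               /* 4, since 5e9/2^30 ~= 4.66 */
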
static void set_sample_period(void)
@@ -185,7 +219,7 @@ void touch_nmi_watchdog(void)
* case we shouldn't have to worry about the watchdog
* going off.
*/
- __raw_get_cpu_var(watchdog_nmi_touch) = true;
+ raw_cpu_write(watchdog_nmi_touch, true);
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
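
raw_cpu_write() is the modern spelling of the removed __raw_get_cpu_var()
lvalue idiom: it writes this CPU's instance without any debug checking.
A sketch on a hypothetical per-CPU flag:

    static DEFINE_PER_CPU(bool, example_flag);    /* hypothetical variable */

    static void touch_example(void)
    {
        /* raw_* variant: no preemption sanity checks */
        raw_cpu_write(example_flag, true);
        /* __this_cpu_write() is similar but, on debug kernels, verifies
         * that preemption is disabled */
        __this_cpu_write(example_flag, true);
    }
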
@@ -194,8 +228,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
void touch_softlockup_watchdog_sync(void)
{
- __raw_get_cpu_var(softlockup_touch_sync) = true;
- __raw_get_cpu_var(watchdog_touch_ts) = 0;
+ __this_cpu_write(softlockup_touch_sync, true);
+ __this_cpu_write(watchdog_touch_ts, 0);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -216,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
- /* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
- return now - touch_ts;
-
+ if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ /* Warn about unreasonable delays. */
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
+ return now - touch_ts;
+ }
return 0;
}
@@ -260,9 +295,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -331,8 +368,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups, the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
return HRTIMER_RESTART;
+ }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -345,9 +396,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
}
- printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)
@@ -366,6 +418,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
smp_mb__after_atomic();
}
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -384,7 +437,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
/* kick off the timer for the hardlockup detector */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -404,7 +457,7 @@ static void watchdog_enable(unsigned int cpu)
static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);
@@ -436,6 +489,21 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
+
+ /*
+ * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
+ * failure path. Check for failures that can occur asynchronously -
+ * for example, when CPUs are on-lined - and shut down the hardware
+ * perf event on each CPU accordingly.
+ *
+ * The only non-obvious place this bit can be cleared is through
+ * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
+ * pr_info here would be too noisy as it would result in a message
+ * every few seconds if the hardlockup was disabled but the softlockup
+ * enabled.
+ */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ watchdog_nmi_disable(cpu);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -451,6 +519,10 @@ static int watchdog_nmi_enable(unsigned int cpu)
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
goto out;
@@ -476,6 +548,18 @@ static int watchdog_nmi_enable(unsigned int cpu)
goto out_save;
}
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
/* skip displaying the same error again */
if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
return PTR_ERR(event);
@@ -484,11 +568,14 @@ static int watchdog_nmi_enable(unsigned int cpu)
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
return PTR_ERR(event);
/* success path */
@@ -511,11 +598,42 @@ static void watchdog_nmi_disable(unsigned int cpu)
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
}
- return;
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}
+
+void watchdog_nmi_enable_all(void)
+{
+ int cpu;
+
+ if (!watchdog_user_enabled)
+ return;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ watchdog_nmi_enable(cpu);
+ put_online_cpus();
+}
+
+void watchdog_nmi_disable_all(void)
+{
+ int cpu;
+
+ if (!watchdog_running)
+ return;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ watchdog_nmi_disable(cpu);
+ put_online_cpus();
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
+void watchdog_nmi_enable_all(void) {}
+void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
@@ -531,7 +649,7 @@ static struct smp_hotplug_thread watchdog_threads = {
static void restart_watchdog_hrtimer(void *info)
{
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
int ret;
/*
@@ -546,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info)
HRTIMER_MODE_REL_PINNED);
}
-static void update_timers(int cpu)
+static void update_watchdog(int cpu)
{
/*
* Make sure that perf event counter will adopt to a new
@@ -561,17 +679,17 @@ static void update_timers(int cpu)
watchdog_nmi_enable(cpu);
}
-static void update_timers_all_cpus(void)
+static void update_watchdog_all_cpus(void)
{
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
- update_timers(cpu);
+ update_watchdog(cpu);
put_online_cpus();
}
-static int watchdog_enable_all_cpus(bool sample_period_changed)
+static int watchdog_enable_all_cpus(void)
{
int err = 0;
@@ -581,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed)
pr_err("Failed to create watchdog threads, disabled\n");
else
watchdog_running = 1;
- } else if (sample_period_changed) {
- update_timers_all_cpus();
+ } else {
+ /*
+ * Enable/disable the lockup detectors or
+ * change the sample period 'on the fly'.
+ */
+ update_watchdog_all_cpus();
}
return err;
@@ -600,39 +722,149 @@ static void watchdog_disable_all_cpus(void)
}
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
+ * Update the run state of the lockup detectors.
+ */
+static int proc_watchdog_update(void)
+{
+ int err = 0;
+
+ /*
+ * Watchdog threads won't be started if they are already active.
+ * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
+ * care of this. If those threads are already active, the sample
+ * period will be updated and the lockup detectors will be enabled
+ * or disabled 'on the fly'.
+ */
+ if (watchdog_enabled && watchdog_thresh)
+ err = watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+ return err;
+}
+
+static DEFINE_MUTEX(watchdog_proc_mutex);
+
+/*
+ * common function for the watchdog, nmi_watchdog and soft_watchdog parameters
+ *
+ * caller | table->data points to | 'which' contains the flag(s)
+ * -------------------|-----------------------|-----------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
+ * | | with SOFT_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ */
+static int proc_watchdog_common(int which, struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err, old, new;
+ int *watchdog_param = (int *)table->data;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ /*
+ * If the parameter is being read, return the state of the corresponding
+ * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
+ * run state of the lockup detectors.
+ */
+ if (!write) {
+ *watchdog_param = (watchdog_enabled & which) != 0;
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ } else {
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (err)
+ goto out;
+
+ /*
+ * There is a race window between fetching the current value
+ * from 'watchdog_enabled' and storing the new value. During
+ * this race window, watchdog_nmi_enable() can sneak in and
+ * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
+ * The 'cmpxchg' detects this race and the loop retries.
+ */
+ do {
+ old = watchdog_enabled;
+ /*
+ * If the parameter value is not zero set the
+ * corresponding bit(s), else clear it(them).
+ */
+ if (*watchdog_param)
+ new = old | which;
+ else
+ new = old & ~which;
+ } while (cmpxchg(&watchdog_enabled, old, new) != old);
+
+ /*
+ * Update the run state of the lockup detectors.
+ * Restore 'watchdog_enabled' on failure.
+ */
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_enabled = old;
+ }
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
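
The update above is the standard cmpxchg read-modify-write loop: the new
mask is only committed if 'watchdog_enabled' still holds the snapshotted
value, otherwise the loop re-reads and retries. The bare pattern (a
sketch; compute_new_mask() is a hypothetical stand-in):

    unsigned long old, new;

    do {
        old = watchdog_enabled;          /* snapshot */
        new = compute_new_mask(old);     /* derive the desired value */
    } while (cmpxchg(&watchdog_enabled, old, new) != old);
    /* a mismatch means someone raced us between the read and the store */
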
+
+/*
+ * /proc/sys/kernel/watchdog
+ */
+int proc_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/soft_watchdog
*/
+int proc_soft_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
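
From userspace the three knobs behave like ordinary boolean sysctls. A
minimal (hypothetical) snippet that disables the soft lockup detector:

    #include <stdio.h>

    int main(void)
    {
        /* reaches proc_soft_watchdog() and clears SOFT_WATCHDOG_ENABLED */
        FILE *f = fopen("/proc/sys/kernel/soft_watchdog", "w");

        if (!f)
            return 1;
        fputs("0\n", f);
        return fclose(f) ? 1 : 0;
    }
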
-int proc_dowatchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+/*
+ * /proc/sys/kernel/watchdog_thresh
+ */
+int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old_thresh, old_enabled;
- static DEFINE_MUTEX(watchdog_proc_mutex);
+ int err, old;
mutex_lock(&watchdog_proc_mutex);
- old_thresh = ACCESS_ONCE(watchdog_thresh);
- old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+ old = ACCESS_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
if (err || !write)
goto out;
- set_sample_period();
/*
- * Watchdog threads shouldn't be enabled if they are
- * disabled. The 'watchdog_running' variable check in
- * watchdog_*_all_cpus() function takes care of this.
+ * Update the sample period.
+ * Restore 'watchdog_thresh' on failure.
*/
- if (watchdog_user_enabled && watchdog_thresh)
- err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
- else
- watchdog_disable_all_cpus();
-
- /* Restore old values on failure */
- if (err) {
- watchdog_thresh = old_thresh;
- watchdog_user_enabled = old_enabled;
- }
+ set_sample_period();
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_thresh = old;
out:
mutex_unlock(&watchdog_proc_mutex);
return err;
@@ -643,6 +875,6 @@ void __init lockup_detector_init(void)
{
set_sample_period();
- if (watchdog_user_enabled)
- watchdog_enable_all_cpus(false);
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..586ad91300b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
- struct list_head list; /* PL: list of all workqueues */
+ struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -265,7 +273,6 @@ struct workqueue_struct {
static struct kmem_cache *pwq_cache;
-static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
@@ -289,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* the per-cpu worker pools */
@@ -325,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -758,13 +766,6 @@ static bool too_many_workers(struct worker_pool *pool)
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
- /*
- * nr_idle and idle_list may disagree if idle rebinding is in
- * progress. Never return %true if idle_list is empty.
- */
- if (list_empty(&pool->idle_list))
- return false;
-
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -850,7 +851,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
return NULL;
/*
@@ -874,35 +875,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
*
- * Set @flags in @worker->flags and adjust nr_running accordingly. If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* spin_lock_irq(pool->lock)
*/
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
- bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
- /*
- * If transitioning into NOT_RUNNING, adjust nr_running and
- * wake up an idle worker as necessary if requested by
- * @wakeup.
- */
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- if (wakeup) {
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- wake_up_worker(pool);
- } else
- atomic_dec(&pool->nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -1232,7 +1220,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
pwq_activate_delayed_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, get_work_color(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1548,7 @@ static void worker_enter_idle(struct worker *worker)
(worker->hentry.next || worker->hentry.pprev)))
return;
- /* can't use worker_set_flags(), also called from start_worker() */
+ /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1602,11 +1590,11 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
{
struct worker *worker;
- worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1658,9 @@ static void worker_detach_from_pool(struct worker *worker,
detach_completion = pool->detach_completion;
mutex_unlock(&pool->attach_mutex);
+ /* clear leftover flags without pool->lock after it is detached */
+ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
if (detach_completion)
complete(detach_completion);
}
@@ -1678,8 +1669,7 @@ static void worker_detach_from_pool(struct worker *worker,
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
- * Create a new worker which is attached to @pool. The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1688,7 @@ static struct worker *create_worker(struct worker_pool *pool)
if (id < 0)
goto fail;
- worker = alloc_worker();
+ worker = alloc_worker(pool->node);
if (!worker)
goto fail;
@@ -1724,6 +1714,13 @@ static struct worker *create_worker(struct worker_pool *pool)
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
+ /* start the newly created worker */
+ spin_lock_irq(&pool->lock);
+ worker->pool->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+ spin_unlock_irq(&pool->lock);
+
return worker;
fail:
@@ -1734,44 +1731,6 @@ fail:
}
/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
- worker->pool->nr_workers++;
- worker_enter_idle(worker);
- wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
- }
-
- return worker ? 0 : -ENOMEM;
-}
-
-/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
@@ -1854,8 +1813,8 @@ static void pool_mayday_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
+ spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -1868,8 +1827,8 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock(&pool->lock);
- spin_unlock_irq(&wq_mayday_lock);
+ spin_unlock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
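
The hunk above inverts the nesting so that pool->lock is the outer lock
and wq_mayday_lock the inner one, which matches the order the rescuer
uses further down and so avoids an ABBA deadlock. Schematically:

    spin_lock_irq(&pool->lock);     /* outer: pool state */
    spin_lock(&wq_mayday_lock);     /* inner: wq->maydays list */
    /* queue mayday requests for works stuck on this pool */
    spin_unlock(&wq_mayday_lock);
    spin_unlock_irq(&pool->lock);
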
@@ -1891,17 +1850,11 @@ static void pool_mayday_timeout(unsigned long __pool)
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
*/
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
- if (!need_to_create_worker(pool))
- return false;
restart:
spin_unlock_irq(&pool->lock);
@@ -1909,23 +1862,10 @@ restart:
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- if (WARN_ON_ONCE(need_to_create_worker(pool)))
- goto restart;
- return true;
- }
-
- if (!need_to_create_worker(pool))
+ if (create_worker(pool) || !need_to_create_worker(pool))
break;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(CREATE_COOLDOWN);
+ schedule_timeout_interruptible(CREATE_COOLDOWN);
if (!need_to_create_worker(pool))
break;
@@ -1933,9 +1873,13 @@ restart:
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
+ /*
+ * This is necessary even after a new worker was just successfully
+ * created as @pool->lock was dropped and the new worker might have
+ * already become busy.
+ */
if (need_to_create_worker(pool))
goto restart;
- return true;
}
/**
@@ -1955,16 +1899,14 @@ restart:
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
- * %false if the pool don't need management and the caller can safely start
- * processing works, %true indicates that the function released pool->lock
- * and reacquired it to perform some management function and that the
- * conditions that the caller verified while holding the lock before
- * calling the function might no longer be true.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- bool ret = false;
/*
* Anyone who successfully grabs manager_arb wins the arbitration
@@ -1977,12 +1919,14 @@ static bool manage_workers(struct worker *worker)
* actual management, the pool may stall indefinitely.
*/
if (!mutex_trylock(&pool->manager_arb))
- return ret;
+ return false;
+ pool->manager = worker;
- ret |= maybe_create_worker(pool);
+ maybe_create_worker(pool);
+ pool->manager = NULL;
mutex_unlock(&pool->manager_arb);
- return ret;
+ return true;
}
/**
@@ -2020,13 +1964,8 @@ __acquires(&pool->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
- /*
- * Ensure we're on the correct CPU. DISASSOCIATED test is
- * necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated pool.
- */
- WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(pool->flags & POOL_DISASSOCIATED) &&
+ /* ensure we're on the correct CPU */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
@@ -2052,17 +1991,22 @@ __acquires(&pool->lock)
list_del_init(&work->entry);
/*
- * CPU intensive works don't participate in concurrency
- * management. They're the scheduler's responsibility.
+ * CPU intensive works don't participate in concurrency management.
+ * They're the scheduler's responsibility. This takes @worker out
+ * of concurrency management and the next code block will chain
+ * execution of the pending work items.
*/
if (unlikely(cpu_intensive))
- worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Unbound pool isn't concurrency managed and work items should be
- * executed ASAP. Wake up another worker if necessary.
+ * Wake up another worker if necessary. The condition is always
+ * false for normal per-cpu workers since nr_running would always
+ * be >= 1 at this point. This is used to chain execution of the
+ * pending work items for WORKER_NOT_RUNNING workers such as the
+ * UNBOUND and CPU_INTENSIVE ones.
*/
- if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
/*
@@ -2101,9 +2045,10 @@ __acquires(&pool->lock)
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
- * stop_machine.
+ * stop_machine. At the same time, report a quiescent RCU state so
+ * the same condition doesn't freeze RCU.
*/
- cond_resched();
+ cond_resched_rcu_qs();
spin_lock_irq(&pool->lock);
@@ -2218,7 +2163,7 @@ recheck:
}
} while (keep_working(pool));
- worker_set_flags(worker, WORKER_PREP, false);
+ worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
@@ -2305,35 +2250,51 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+ WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
- process_scheduled_works(rescuer);
- spin_unlock_irq(&pool->lock);
-
- worker_detach_from_pool(rescuer, pool);
+ if (!list_empty(scheduled)) {
+ process_scheduled_works(rescuer);
- spin_lock_irq(&pool->lock);
+ /*
+ * The above execution of rescued work items could
+ * have created more to rescue through
+ * pwq_activate_first_delayed() or chained
+ * queueing. Let's put @pwq back on mayday list so
+ * that such back-to-back work items, which may be
+ * being used to relieve memory pressure, don't
+ * incur MAYDAY_INTERVAL delay in between.
+ */
+ if (need_to_create_worker(pool)) {
+ spin_lock(&wq_mayday_lock);
+ get_pwq(pwq);
+ list_move_tail(&pwq->mayday_node, &wq->maydays);
+ spin_unlock(&wq_mayday_lock);
+ }
+ }
/*
* Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're holding its lock.
+ * go away while we're still attached to it.
*/
put_pwq(pwq);
/*
- * Leave this pool. If keep_working() is %true, notify a
+ * Leave this pool. If need_more_worker() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
rescuer->pool = NULL;
- spin_unlock(&pool->lock);
- spin_lock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&wq_mayday_lock);
}
spin_unlock_irq(&wq_mayday_lock);
@@ -2353,6 +2314,7 @@ repeat:
struct wq_barrier {
struct work_struct work;
struct completion done;
+ struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
@@ -2401,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ barr->task = current;
/*
* If @target is currently being executed, schedule the
@@ -2778,19 +2741,57 @@ bool flush_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(flush_work);
+struct cwt_wait {
+ wait_queue_t wait;
+ struct work_struct *work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+ if (cwait->work != key)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
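
cwt_wakefn() turns the shared waitqueue into a keyed one: wakeups carry
the work item as the key, so only the waiter registered for that exact
item consumes the (exclusive) wakeup. The two sides pair up as sketched
below, mirroring __cancel_work_timer() further down:

    /* wait side: sleep until @work stops being canceled */
    prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
                              TASK_UNINTERRUPTIBLE);
    if (work_is_canceling(work))
        schedule();
    finish_wait(&cancel_waitq, &cwait.wait);

    /* wake side: wake at most one waiter whose cwait.work == work */
    __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
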
+
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+ static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(work, is_dwork, &flags);
/*
- * If someone else is canceling, wait for the same event it
- * would be waiting for before retrying.
+ * If someone else is already canceling, wait for it to
+ * finish. flush_work() doesn't work for PREEMPT_NONE
+ * because we may get scheduled between @work's completion
+ * and the other canceling task resuming and clearing
+ * CANCELING - flush_work() will return false immediately
+ * as @work is no longer busy, try_to_grab_pending() will
+ * return -ENOENT as @work is still being canceled and the
+ * other canceling task won't be able to clear CANCELING as
+ * we're hogging the CPU.
+ *
+ * Let's wait for completion using a waitqueue. As this
+ * may lead to the thundering herd problem, use a custom
+ * wake function which matches @work along with exclusive
+ * wait and wakeup.
*/
- if (unlikely(ret == -ENOENT))
- flush_work(work);
+ if (unlikely(ret == -ENOENT)) {
+ struct cwt_wait cwait;
+
+ init_wait(&cwait.wait);
+ cwait.wait.func = cwt_wakefn;
+ cwait.work = work;
+
+ prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (work_is_canceling(work))
+ schedule();
+ finish_wait(&cancel_waitq, &cwait.wait);
+ }
} while (unlikely(ret < 0));
/* tell other tasks trying to grab @work to back off */
@@ -2799,6 +2800,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
flush_work(work);
clear_work_data(work);
+
+ /*
+ * Paired with prepare_to_wait() above so that either
+ * waitqueue_active() is visible here or !work_is_canceling() is
+ * visible there.
+ */
+ smp_mb();
+ if (waitqueue_active(&cancel_waitq))
+ __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
return ret;
}
@@ -2991,324 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
- * following attributes.
- *
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- * id RO int : the associated pool ID
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- */
-struct wq_device {
- struct workqueue_struct *wq;
- struct device dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
- struct device_attribute *attr, const char *buf,
- size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) != 1 || val <= 0)
- return -EINVAL;
-
- workqueue_set_max_active(wq, val);
- return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
- &dev_attr_per_cpu.attr,
- &dev_attr_max_active.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- rcu_read_lock_sched();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
-
- return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
- struct workqueue_attrs *attrs;
-
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!attrs)
- return NULL;
-
- mutex_lock(&wq->mutex);
- copy_workqueue_attrs(attrs, wq->unbound_attrs);
- mutex_unlock(&wq->mutex);
- return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
- ret = apply_workqueue_attrs(wq, attrs);
- else
- ret = -EINVAL;
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
- mutex_unlock(&wq->mutex);
-
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = cpumask_parse(buf, attrs->cpumask);
- if (!ret)
- ret = apply_workqueue_attrs(wq, attrs);
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int v, ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
- ret = apply_workqueue_attrs(wq, attrs);
- }
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
- __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
- __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
- __ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
- .name = "workqueue",
- .dev_groups = wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
- return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev;
- int ret;
-
- /*
- * Adjusting max_active or creating new pwqs by applyting
- * attributes breaks ordering guarantee. Disallow exposing ordered
- * workqueues.
- */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
- return -EINVAL;
-
- wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
- if (!wq_dev)
- return -ENOMEM;
-
- wq_dev->wq = wq;
- wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
- wq_dev->dev.release = wq_device_release;
-
- /*
- * unbound_attrs are created separately. Suppress uevent until
- * everything is ready.
- */
- dev_set_uevent_suppress(&wq_dev->dev, true);
-
- ret = device_register(&wq_dev->dev);
- if (ret) {
- kfree(wq_dev);
- wq->wq_dev = NULL;
- return ret;
- }
-
- if (wq->flags & WQ_UNBOUND) {
- struct device_attribute *attr;
-
- for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
- ret = device_create_file(&wq_dev->dev, attr);
- if (ret) {
- device_unregister(&wq_dev->dev);
- wq->wq_dev = NULL;
- return ret;
- }
- }
- }
-
- dev_set_uevent_suppress(&wq_dev->dev, false);
- kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
- return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev = wq->wq_dev;
-
- if (!wq->wq_dev)
- return;
-
- wq->wq_dev = NULL;
- device_unregister(&wq_dev->dev);
-}
-#else /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
-#endif /* CONFIG_SYSFS */
-
/**
* free_workqueue_attrs - free a workqueue_attrs
* @attrs: workqueue_attrs to free
@@ -3427,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
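
Deferring the kfree() through call_rcu_sched() is what lets the sysrq
dump walk the workqueues list under rcu_read_lock_sched() without taking
wq_pool_mutex. The teardown sequence, in outline (see destroy_workqueue()
below):

    mutex_lock(&wq_pool_mutex);
    list_del_rcu(&wq->list);     /* unlink; sched-RCU readers may still
                                  * be iterating the list */
    mutex_unlock(&wq_pool_mutex);
    /* free only after every in-flight read-side section has finished */
    call_rcu_sched(&wq->rcu, rcu_free_wq);
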
+
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3458,7 +3165,7 @@ static void put_unbound_pool(struct worker_pool *pool)
return;
/* sanity checks */
- if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ if (WARN_ON(!(pool->cpu < 0)) ||
WARN_ON(!list_empty(&pool->worklist)))
return;
@@ -3524,7 +3231,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
if (wqattrs_equal(pool->attrs, attrs)) {
pool->refcnt++;
- goto out_unlock;
+ return pool;
}
}
@@ -3557,12 +3264,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
goto fail;
/* create and start the initial worker */
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
goto fail;
/* install */
hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
+
return pool;
fail:
if (pool)
@@ -3591,11 +3298,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
- /*
- * Unlink @pwq. Synchronization against wq->mutex isn't strictly
- * necessary on release but do it anyway. It's easier to verify
- * and consistent with the linking path.
- */
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
@@ -3609,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
/*
* If we're the last pwq going away, @wq is already dead and no one
- * is gonna access it anymore. Free it.
+ * is gonna access it anymore. Schedule RCU free.
*/
- if (is_last) {
- free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq);
- }
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
}
/**
@@ -3692,10 +3392,7 @@ static void link_pwq(struct pool_workqueue *pwq)
if (!list_empty(&pwq->pwqs_node))
return;
- /*
- * Set the matching work_color. This is synchronized with
- * wq->mutex to avoid confusing flush_workqueue().
- */
+ /* set the matching work_color */
pwq->work_color = wq->work_color;
/* sync max_active to the current setting */
@@ -3832,7 +3529,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;
- pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +3777,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
- tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+ tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
@@ -4122,7 +3819,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- rescuer = alloc_worker();
+ rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
goto err_destroy;
@@ -4154,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
- list_add(&wq->list, &workqueues);
+ list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
@@ -4210,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
* flushing is complete in case freeze races us.
*/
mutex_lock(&wq_pool_mutex);
- list_del_init(&wq->list);
+ list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
workqueue_sysfs_unregister(wq);
- if (wq->rescuer) {
+ if (wq->rescuer)
kthread_stop(wq->rescuer->task);
- kfree(wq->rescuer);
- wq->rescuer = NULL;
- }
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
- * free the pwqs and wq.
+ * schedule RCU free.
*/
- free_percpu(wq->cpu_pwqs);
- kfree(wq);
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
@@ -4448,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
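
The actual sysrq hookup lives outside this file; a hypothetical handler,
assuming the usual sysrq_key_op interface, would look roughly like:

    static void sysrq_handle_show_wq(int key)
    {
        show_workqueue_state();
    }

    static struct sysrq_key_op sysrq_show_wq_op = {
        .handler    = sysrq_handle_show_wq,
        .help_msg   = "show-workqueues",      /* key binding is an assumption */
        .action_msg = "Show workqueue state",
    };
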
+
/*
* CPU hotplug.
*
@@ -4470,8 +4323,6 @@ static void wq_unbind_fn(struct work_struct *work)
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
- WARN_ON_ONCE(cpu != smp_processor_id());
-
mutex_lock(&pool->attach_mutex);
spin_lock_irq(&pool->lock);
@@ -4543,6 +4394,7 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
@@ -4632,7 +4484,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4644,15 +4496,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_pool(pool, pi) {
mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu) {
- spin_lock_irq(&pool->lock);
- pool->flags &= ~POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
-
+ if (pool->cpu == cpu)
rebind_workers(pool);
- } else if (pool->cpu < 0) {
+ else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- }
mutex_unlock(&pool->attach_mutex);
}
@@ -4851,15 +4698,328 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with the WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ mutex_lock(&wq->mutex);
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ mutex_unlock(&wq->mutex);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+ ret = apply_workqueue_attrs(wq, attrs);
+ else
+ ret = -EINVAL;
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq->unbound_attrs->cpumask));
+ mutex_unlock(&wq->mutex);
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs(wq, attrs);
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+ return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
+ /*
+ * Adjusting max_active or creating new pwqs by applying
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
+ */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
+
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ dev_set_uevent_suppress(&wq_dev->dev, false);
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
int node, cpu;
- /* determine NUMA pwq table len - highest node id + 1 */
- for_each_node(node)
- wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
if (num_possible_nodes() <= 1)
return;
@@ -4876,7 +5036,7 @@ static void __init wq_numa_init(void)
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
- tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
@@ -4936,7 +5096,7 @@ static int __init init_workqueues(void)
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
- BUG_ON(create_and_start_worker(pool) < 0);
+ BUG_ON(!create_worker(pool));
}
}