diff options
Diffstat (limited to 'kernel')
243 files changed, 29098 insertions, 12085 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b6246ce9..0f8f8b0bc1bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,19 +3,20 @@ # obj-y = fork.o exec_domain.o panic.o \ - cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + cpu.o exit.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o nsproxy.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o smpboot.o + +obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_irq_work.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif # cond_syscall is currently not LTO compatible @@ -27,6 +28,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -58,7 +60,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o 
auditfilter.o @@ -87,6 +88,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ @@ -105,27 +107,11 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into @@ -159,7 +145,7 @@ endif kernel/system_certificates.o: $(obj)/x509_certificate_list quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") targets += $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list diff --git a/kernel/acct.c b/kernel/acct.c index 808a86ff229d..74963d192c5d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include 
<linux/pid_namespace.h> +#include <linux/fs_pin.h> /* * These constants control the amount of freespace that suspend and @@ -75,172 +76,186 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + struct fs_pin pin; + atomic_long_t count; + struct rcu_head rcu; + struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; + struct work_struct work; + struct completion done; }; -static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); +static void do_acct_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. 
- */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); - return res; + return acct->active; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void acct_put(struct bsd_acct_struct *p) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); + if (atomic_long_dec_and_test(&p->count)) + kfree_rcu(p, rcu); +} + +static inline struct bsd_acct_struct *to_acct(struct fs_pin *p) +{ + return p ? 
container_of(p, struct bsd_acct_struct, pin) : NULL; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) +{ + struct bsd_acct_struct *res; +again: + smp_rmb(); + rcu_read_lock(); + res = to_acct(ACCESS_ONCE(ns->bacct)); + if (!res) { + rcu_read_unlock(); + return NULL; } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; - list_add(&acct->list, &acct_list); + if (!atomic_long_inc_not_zero(&res->count)) { + rcu_read_unlock(); + cpu_relax(); + goto again; } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); - spin_lock(&acct_lock); + rcu_read_unlock(); + mutex_lock(&res->lock); + if (res != to_acct(ACCESS_ONCE(ns->bacct))) { + mutex_unlock(&res->lock); + acct_put(res); + goto again; } + return res; +} + +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct = to_acct(pin); + mutex_lock(&acct->lock); + do_acct_process(acct); + schedule_work(&acct->work); + wait_for_completion(&acct->done); + cmpxchg(&acct->ns->bacct, pin, NULL); + mutex_unlock(&acct->lock); + pin_remove(pin); + acct_put(acct); +} + +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); } static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct vfsmount *mnt, *internal; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct; + struct fs_pin *old; + int err; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, 
O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } - if (!file->f_op->write) { + if (!(file->f_mode & FMODE_CAN_WRITE)) { + kfree(acct); filp_close(file, NULL); return -EIO; } - - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); } - - spin_lock(&acct_lock); - if (ns->bacct == NULL) { - ns->bacct = acct; - acct = NULL; + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; } - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); + file->f_path.mnt = internal; + + atomic_long_set(&acct->count, 1); + init_fs_pin(&acct->pin, acct_pin_kill); + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); + rcu_read_lock(); + old = xchg(&ns->bacct, &acct->pin); + mutex_unlock(&acct->lock); + pin_kill(old); + mnt_drop_write(mnt); + mntput(mnt); return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -261,80 +276,25 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + 
mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + rcu_read_lock(); + pin_kill(task_active_pid_ns(current)->bacct); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. 
- */ -void acct_auto_close(struct super_block *sb) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + rcu_read_lock(); + pin_kill(ns->bacct); } /* @@ -376,7 +336,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +349,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -420,7 +380,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -429,8 +389,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -448,120 +409,116 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. 
- */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = &current->signal->pacct; - acct_t ac; - mm_segment_t fs; - unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; + u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; + run_time = ktime_get_ns(); + run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); +#if ACCT_VERSION == 3 + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 
+ (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; - /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_btime = get_seconds() - elapsed; #if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 - /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; -#endif -#if ACCT_VERSION==3 - ac.ac_pid = task_tgid_nr_ns(current, ns); - rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); - rcu_read_unlock(); + ac->ac_ahz = AHZ; #endif spin_lock_irq(&current->sighand->siglock); tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; + ac->ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(&current->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + struct file *file = acct->file; /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * Accounting records are not subject to resource limits. */ - if (!file_start_write_trylock(file)) - goto out; + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + /* - * Kernel segment override to datasegment and write it - * to the accounting file. + * First check to see if there is enough free_space to continue + * the process accounting system. 
*/ - fs = get_fs(); - set_fs(KERNEL_DS); + if (!check_free_space(acct)) + goto out; + + fill_ac(&ac); + /* we really need to bite the bullet and change layout */ + ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); + ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 + /* backward-compatible 16 bit fields */ + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; +#endif +#if ACCT_VERSION == 3 + { + struct pid_namespace *ns = acct->ns; + + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), + ns); + rcu_read_unlock(); + } +#endif /* - * Accounting records are not subject to resource limits. + * Get freeze protection. If the fs is frozen, just skip the write + * as we could deadlock the system otherwise. */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } @@ -578,6 +535,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); vma = current->mm->mmap; while (vma) { @@ -609,34 +567,20 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(&current->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common 
fastpath: - */ - if (!acct || !acct->file) - return; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - return; + for ( ; ns; ns = ns->parent) { + struct bsd_acct_struct *acct = acct_get(ns); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + acct_put(acct); + } } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -649,6 +593,10 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent. */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + if (ns->bacct) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } diff --git a/kernel/async.c b/kernel/async.c index 61f023ce0228..4c3773c0bf63 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain ktime_t uninitialized_var(starttime), 
delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } @@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain endtime = ktime_get(); delta = ktime_sub(endtime, starttime); - printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), (long long)ktime_to_ns(delta) >> 10); } diff --git a/kernel/audit.c b/kernel/audit.c index d5a1220c8620..ab5745ddf962 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -501,7 +501,6 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { struct sk_buff *skb; - DECLARE_WAITQUEUE(wait, current); flush_hold_queue(); @@ -516,16 +515,8 @@ static int kauditd_thread(void *dummy) audit_printk_skb(skb); continue; } - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } @@ -1110,7 +1101,7 @@ static void audit_receive(struct sk_buff *skb) } /* Run custom bind function on netlink socket group connect or bind requests. 
*/ -static int audit_bind(int group) +static int audit_bind(struct net *net, int group) { if (!capable(CAP_AUDIT_READ)) return -EPERM; @@ -1669,7 +1660,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) audit_log_format(ab, " %s=", prefix); CAP_FOR_EACH_U32(i) { audit_log_format(ab, "%08x", - cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + cap->cap[CAP_LAST_U32 - i]); } } diff --git a/kernel/audit.h b/kernel/audit.h index 28067c50fd04..d641f9bb3ed0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -24,12 +24,6 @@ #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> -/* 0 = no checking - 1 = put_count checking - 2 = verbose put_count checking -*/ -#define AUDIT_DEBUG 0 - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -74,9 +68,8 @@ struct audit_cap_data { }; }; -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). +/* When fs/namei.c:getname() is called, we store the pointer in name and bump + * the refcnt in the associated filename struct. * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ @@ -86,7 +79,6 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - bool name_put; /* call __putname()? 
*/ unsigned long ino; dev_t dev; @@ -208,11 +200,6 @@ struct audit_context { }; int fds[2]; struct audit_proctitle proctitle; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif }; extern u32 audit_ever_enabled; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 415072c8e875..71fd1f289885 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -175,9 +175,9 @@ static void insert_hash(struct audit_chunk *chunk) struct fsnotify_mark *entry = &chunk->mark; struct list_head *list; - if (!entry->i.inode) + if (!entry->inode) return; - list = chunk_hash(entry->i.inode); + list = chunk_hash(entry->inode); list_add_rcu(&chunk->hash, list); } @@ -189,7 +189,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { + if (p->mark.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -232,7 +232,7 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { + if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); if (new) free_chunk(new); @@ -259,7 +259,7 @@ static void untag_chunk(struct node *p) goto Fallback; fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -387,7 +387,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { + if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); fsnotify_put_mark(old_entry); @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } fsnotify_duplicate_mark(chunk_entry, old_entry); - if 
(fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -612,7 +612,7 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.i.inode; + struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 81c94d739e3f..72e1660a79a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -121,7 +121,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -175,7 +175,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b89f7f95d84..84c74d08c62b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; -#if AUDIT_DEBUG == 2 - if (context->put_count + context->ino_count != context->name_count) { - int i = 0; - - pr_err("%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d ino_count=%d" - " [NOT freeing]\n", __FILE__, __LINE__, - context->serial, context->major, context->in_syscall, - 
context->name_count, context->put_count, - context->ino_count); - list_for_each_entry(n, &context->names_list, list) { - pr_err("names[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } - dump_stack(); - return; - } -#endif -#if AUDIT_DEBUG - context->put_count = 0; - context->ino_count = 0; -#endif - list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); - if (n->name && n->name_put) - final_putname(n->name); + if (n->name) + putname(n->name); if (n->should_free) kfree(n); } @@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif return aname; } @@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) + if (n->name->uptr == uptr) { + n->name->refcnt++; return n->name; + } } return NULL; } @@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): ignoring getname(%p)\n", - __FILE__, __LINE__, context->serial, name); - dump_stack(); -#endif + if (!context->in_syscall) return; - } - -#if AUDIT_DEBUG - /* The filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) @@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; - n->name_put = true; name->aname = n; + name->refcnt++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } -/* audit_putname - intercept a putname request - * @name: name to intercept and delay for putname - * - * If we have stored the name from getname in the audit context, - * then we delay the putname until syscall exit. 
- * Called from include/linux/fs.h:putname(). - */ -void audit_putname(struct filename *name) -{ - struct audit_context *context = current->audit_context; - - BUG_ON(!context); - if (!name->aname || !context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): final_putname(%p)\n", - __FILE__, __LINE__, context->serial, name); - if (context->name_count) { - struct audit_names *n; - int i = 0; - - list_for_each_entry(n, &context->names_list, list) - pr_err("name[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } -#endif - final_putname(name); - } -#if AUDIT_DEBUG - else { - ++context->put_count; - if (context->put_count > context->name_count) { - pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" - " name_count=%d put_count=%d\n", - __FILE__, __LINE__, - context->serial, context->major, - context->in_syscall, name->name, - context->name_count, context->put_count); - dump_stack(); - } - } -#endif -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, if (!name) goto out_alloc; -#if AUDIT_DEBUG - /* The struct filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. @@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - if (!n->name || strcmp(n->name->name, name->name)) + if (n->ino) { + /* valid inode number, use that for the comparison */ + if (n->ino != inode->i_ino || + n->dev != inode->i_sb->s_dev) + continue; + } else if (n->name) { + /* inode number has not been set, check the name */ + if (strcmp(n->name->name, name->name)) + continue; + } else + /* no inode and no name (?!) ... this is odd ... 
*/ continue; /* match the correct record type */ @@ -1882,44 +1810,11 @@ out_alloc: n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; - /* unfortunately, while we may have a path name to record with the - * inode, we can't always rely on the string lasting until the end of - * the syscall so we need to create our own copy, it may fail due to - * memory allocation issues, but we do our best */ if (name) { - /* we can't use getname_kernel() due to size limits */ - size_t len = strlen(name->name) + 1; - struct filename *new = __getname(); - - if (unlikely(!new)) - goto out; - - if (len <= (PATH_MAX - sizeof(*new))) { - new->name = (char *)(new) + sizeof(*new); - new->separate = false; - } else if (len <= PATH_MAX) { - /* this looks odd, but is due to final_putname() */ - struct filename *new2; - - new2 = kmalloc(sizeof(*new2), GFP_KERNEL); - if (unlikely(!new2)) { - __putname(new); - goto out; - } - new2->name = (char *)new; - new2->separate = true; - new = new2; - } else { - /* we should never get here, but let's be safe */ - __putname(new); - goto out; - } - strlcpy((char *)new->name, name->name, len); - new->uptr = NULL; - new->aname = n; - n->name = new; - n->name_put = true; + n->name = name; + name->refcnt++; } + out: if (parent) { n->name_len = n->name ? 
parent_len(n->name->name) : AUDIT_NAME_FULL; @@ -1934,6 +1829,11 @@ out: audit_copy_inode(n, dentry, inode); } +void __audit_file(const struct file *file) +{ + __audit_inode(NULL, file->f_path.dentry, 0); +} + /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent @@ -1965,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent, /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name || n->type != AUDIT_TYPE_PARENT) + if (!n->name || + (n->type != AUDIT_TYPE_PARENT && + n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name->name, n->name_len)) { + if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && + !audit_compare_dname_path(dname, + n->name->name, n->name_len)) { + if (n->type == AUDIT_TYPE_UNKNOWN) + n->type = AUDIT_TYPE_PARENT; found_parent = n; break; } @@ -1978,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent, /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ - if (!n->name || n->type != type) - continue; - - /* if we found a parent, make sure this one is a child of it */ - if (found_parent && (n->name != found_parent->name)) + if (!n->name || + (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (!strcmp(dname, n->name->name) || @@ -1990,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent, found_parent ? 
found_parent->name_len : AUDIT_NAME_FULL)) { + if (n->type == AUDIT_TYPE_UNKNOWN) + n->type = type; found_child = n; break; } @@ -2014,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - /* don't call __putname() */ - found_child->name_put = false; + found_child->name->refcnt++; } } + if (inode) audit_copy_inode(found_child, dentry, inode); else @@ -2400,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = current->audit_context; struct cpu_vfs_cap_data vcaps; - struct dentry *dentry; ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) @@ -2410,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - dentry = dget(bprm->file->f_dentry); - get_vfs_caps_from_disk(dentry, &vcaps); - dput(dentry); + get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; @@ -2434,7 +2335,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * @new: the new credentials * @old: the old (current) credentials * - * Record the aguments userspace sent to sys_capset for later printing by the + * Record the arguments userspace sent to sys_capset for later printing by the * audit system if applicable */ void __audit_log_capset(const struct cred *new, const struct cred *old) diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> -#include <linux/page_cgroup.h> #include <linux/log2.h> #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, 
__MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 000000000000..e6983be12bd3 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1,2 @@ +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 000000000000..8a6616583f38 --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,156 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/mm.h> + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size, array_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* check round_up into zero and u32 overflow */ + if (elem_size == 0 || + attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + return ERR_PTR(-ENOMEM); + + array_size = sizeof(*array) + attr->max_entries * elem_size; + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from 
syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elements already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static const struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list array_type __read_mostly = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&array_type); + return 0; +} +late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 000000000000..4139a0f8b558 --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,674 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. 
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ + +#include <linux/filter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/moduleloader.h> +#include <asm/unaligned.h> +#include <linux/bpf.h> + +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + + return NULL; +} + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog_aux *aux; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->aux = aux; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | 
__GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->aux from fp_old around in the new + * reallocated structure. + */ + fp_old->aux = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->aux); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + +#ifdef CONFIG_BPF_JIT +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_memfree(hdr); +} +#endif /* CONFIG_BPF_JIT */ + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. 
+ */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +/** + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions + * + * Decode and execute eBPF instructions. + */ +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... */ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + 
[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX 
| BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, + }; + void *ptr; + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. 
*/ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = DST; + DST = do_div(tmp, SRC); + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + tmp = DST; + DST = do_div(tmp, IMM); + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) + return 0; + do_div(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + do_div(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: 
+ DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg 
*/ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /*
If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} + +/** + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via BPF_PROG_RUN() macro + */ +void bpf_prog_select_runtime(struct bpf_prog *fp) +{ + fp->bpf_func = (void *) __bpf_prog_run; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); + +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_prog_aux *aux; + + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); +} + +/* Free internal BPF program */ +void bpf_prog_free(struct bpf_prog *fp) +{ + struct bpf_prog_aux *aux = fp->aux; + + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); +} +EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
+ */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000000..83c209d9b17a --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,367 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. 
+ * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + /* prevent zero size kmalloc and check for u32 overflow */ + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + goto free_htab; + + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 
hash, key_size; + + /* Must be called with rcu_read_lock. */ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct 
htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, 
key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. 
Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static const struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_type __read_mostly = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&htab_type); + return 0; +} +late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000000..bd7f5988ed9c --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,113 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/rcupdate.h> +#include <linux/random.h> +#include <linux/smp.h> + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. 
+ */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +const struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +const struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +const struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, 
u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..3bae6c591914 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,621 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/bpf.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/license.h> +#include <linux/filter.h> +#include <linux/version.h> + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int 
map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value, *ptr; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; 
+ if (!ptr) + goto free_value; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, 
key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->type = type; + return 0; + } + } + + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* fixup insn->imm field of bpf_call instructions: + * if (insn->imm == BPF_FUNC_map_lookup_elem) + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; + * else if (insn->imm == BPF_FUNC_map_update_elem) + * insn->imm = bpf_map_update_elem - __bpf_call_base; + * else ... 
+ * + * this function is called after eBPF program passed verification + */ +static void fixup_bpf_calls(struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + int i; + + for (i = 0; i < prog->len; i++) { + struct bpf_insn *insn = &prog->insnsi[i]; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + /* we reach here when program has bpf_call instructions + * and it passed bpf_check(), means that + * ops->get_func_proto must have been supplied, check it + */ + BUG_ON(!prog->aux->ops->get_func_proto); + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + BUG_ON(!fn->func); + insn->imm = fn->func - __bpf_call_base; + } + } +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} +EXPORT_SYMBOL_GPL(bpf_prog_put); + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + 
atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} +EXPORT_SYMBOL_GPL(bpf_prog_get); + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD kern_version + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + err = bpf_check(&prog, attr); + if (err < 0) + goto free_used_maps; + + /* fixup BPF_CALL->imm field */ + fixup_bpf_calls(prog); + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union 
bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..47dcd3aa6e23 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,2146 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU 
General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/netlink.h> +#include <linux/file.h> +#include <linux/vmalloc.h> + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all paths through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less than 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. 
+ * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. 
+ * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns either pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. 
+ */ + +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; +}; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifer state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ 
+struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ + struct verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) +{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + 
env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + } + verbose("\n"); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == 
BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void 
mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand can be read */ + if (regs[regno].type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + /* 
save register state */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + state->regs[value_regno]; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { + /* regular write of data into stack */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + u8 *slot_type; + int i; + + slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + + if (slot_type[0] == STACK_SPILL) { + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < BPF_REG_SIZE; i++) { + if (slot_type[i] != STACK_SPILL) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; + return 0; + } else { + for (i = 0; i < size; i++) { + if (slot_type[i] != STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ 
+ if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if 
((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_DONTCARE) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + 
} + + if (arg_type == ARG_ANYTHING) + return 0; + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. 
Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; 
+ /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = regs[insn->src_reg]; + } else { + 
regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose("BPF_JMP uses reserved fields\n"); + 
return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch targer cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 
insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + return true; + default: + return false; + } +} + +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ 
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (!may_access_skb(env->prog->type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + +/* non-recursive DFS pseudo code + * 1 procedure DFS-iterative(G,v): + * 2 label v as discovered + * 3 let S be a stack + * 4 S.push(v) + * 5 while S is not empty + * 6 t <- S.pop() + * 7 if t is what we're looking for: + * 8 return t + * 9 for all edges e in G.adjacentEdges(t) do + * 10 if edge e is already labelled + * 11 continue with the next edge + * 12 w <- G.adjacentVertex(t,e) + * 13 if vertex w is not discovered and not explored + * 14 label e as tree-edge + * 15 label w as discovered + * 16 S.push(w) + * 17 continue at 5 + * 18 else if vertex w is discovered + * 19 label e as back-edge + * 20 else + * 21 // vertex w is explored + * 22 label e as forward- or cross-edge + * 23 label t as explored 
+ * 24 S.pop() + * + * convention: + * 0x10 - discovered + * 0x11 - discovered and fall-through edge labelled + * 0x12 - discovered and fall-through and branch edges labelled + * 0x20 - explored + */ + +enum { + DISCOVERED = 0x10, + EXPLORED = 0x20, + FALLTHROUGH = 1, + BRANCH = 2, +}; + +#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) + +static int *insn_stack; /* stack of insns to process */ +static int cur_stack; /* current stack index */ +static int *insn_state; + +/* t, w, e - match pseudo-code above: + * t - index of current instruction + * w - next instruction + * e - edge + */ +static int push_insn(int t, int w, int e, struct verifier_env *env) +{ + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) + return 0; + + if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) + return 0; + + if (w < 0 || w >= env->prog->len) { + verbose("jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + if (e == BRANCH) + /* mark branch target for state pruning */ + env->explored_states[w] = STATE_LIST_MARK; + + if (insn_state[w] == 0) { + /* tree-edge */ + insn_state[t] = DISCOVERED | e; + insn_state[w] = DISCOVERED; + if (cur_stack >= env->prog->len) + return -E2BIG; + insn_stack[cur_stack++] = w; + return 1; + } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose("back-edge from insn %d to %d\n", t, w); + return -EINVAL; + } else if (insn_state[w] == EXPLORED) { + /* forward- or cross-edge */ + insn_state[t] = DISCOVERED | e; + } else { + verbose("insn state internal bug\n"); + return -EFAULT; + } + return 0; +} + +/* non-recursive depth-first-search to detect loops in BPF program + * loop == back-edge in directed graph + */ +static int check_cfg(struct verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret = 0; + int i, t; + + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_state) + return -ENOMEM; + + insn_stack = kcalloc(insn_cnt, 
sizeof(int), GFP_KERNEL); + if (!insn_stack) { + kfree(insn_state); + return -ENOMEM; + } + + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ + insn_stack[0] = 0; /* 0 is the first instruction */ + cur_stack = 1; + +peek_stack: + if (cur_stack == 0) + goto check_state; + t = insn_stack[cur_stack - 1]; + + if (BPF_CLASS(insns[t].code) == BPF_JMP) { + u8 opcode = BPF_OP(insns[t].code); + + if (opcode == BPF_EXIT) { + goto mark_explored; + } else if (opcode == BPF_CALL) { + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } else if (opcode == BPF_JA) { + if (BPF_SRC(insns[t].code) != BPF_K) { + ret = -EINVAL; + goto err_free; + } + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, + FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + /* tell verifier to check for equivalent states + * after every call and jump + */ + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; + } else { + /* conditional jump with two edges */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + } else { + /* all other non-branch instructions with single + * fall-through edge + */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + +mark_explored: + insn_state[t] = EXPLORED; + if (cur_stack-- <= 0) { + verbose("pop stack internal bug\n"); + ret = -EFAULT; + goto err_free; + } + goto peek_stack; + +check_state: + for (i = 0; i < insn_cnt; i++) { + if (insn_state[i] != EXPLORED) { + verbose("unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + } + ret = 0; /* cfg looks good */ + +err_free: + kfree(insn_state); + kfree(insn_stack); + 
return ret; +} + +/* compare two verifier states + * + * all states stored in state_list are known to be valid, since + * verifier reached 'bpf_exit' instruction through them + * + * this function is called when verifier exploring different branches of + * execution popped from the state stack. If it sees an old state that has + * more strict register state and more strict stack state then this execution + * branch doesn't need to be explored further, since verifier already + * concluded that more strict state leads to valid finish. + * + * Therefore two states are equivalent if register state is more conservative + * and explored stack state is more conservative than the current one. + * Example: + * explored current + * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) + * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) + * + * In other words if current stack state (one being explored) has more + * valid slots than old one that already passed validation, it means + * the verifier can stop exploring and conclude that current state is valid too + * + * Similarly with registers. 
If explored state has register type as invalid + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (memcmp(&old->regs[i], &cur->regs[i], + sizeof(old->regs[0])) != 0) { + /* registers differ: still acceptable when old was + * NOT_INIT, or when old was UNKNOWN_VALUE and cur is + * anything but NOT_INIT + */ + if (old->regs[i].type == NOT_INIT || + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) + continue; + return false; + } + } + + for (i = 0; i < MAX_BPF_STACK; i++) { + /* slots the explored state marked invalid are ignored */ + if (old->stack_slot_type[i] == STACK_INVALID) + continue; + if (old->stack_slot_type[i] != cur->stack_slot_type[i]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has STACK_MISC -> + * these verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + /* spilled_regs has one entry per BPF_REG_SIZE bytes of stack, + * so compare it only on the first byte of each such group + */ + if (i % BPF_REG_SIZE) + continue; + if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + sizeof(old->spilled_regs[0]))) + /* when explored and current stack slot types are + * the same, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * but current path has stored: + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * such verifier states are not equivalent. 
+ * return false to continue verification of this path + */ + return false; + else + continue; + } + return true; +} + +static int is_state_visited(struct verifier_env *env, int insn_idx) +{ + struct verifier_state_list *new_sl; + struct verifier_state_list *sl; + + sl = env->explored_states[insn_idx]; + if (!sl) + /* this 'insn_idx' instruction wasn't marked, so we will not + * be doing state search here + */ + return 0; + + while (sl != STATE_LIST_MARK) { + if (states_equal(&sl->state, &env->cur_state)) + /* reached equivalent register/stack state, + * prune the search + */ + return 1; + sl = sl->next; + } + + /* there were no equivalent states, remember current one. + * technically the current state is not proven to be safe yet, + * but it will either reach bpf_exit (which means it's safe) or + * it will be rejected. Since there are no loops, we won't be + * seeing this 'insn_idx' instruction again on the way to bpf_exit + */ + new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + if (!new_sl) + return -ENOMEM; + + /* add new state to the head of linked list */ + memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + new_sl->next = env->explored_states[insn_idx]; + env->explored_states[insn_idx] = new_sl; + return 0; +} + +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt %d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. 
Proccessed %d insn\n", + insn_processed); + return -E2BIG; + } + + err = is_state_visited(env, insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (log_level) { + if (do_print_state) + verbose("\nfrom %d to %d: safe\n", + prev_insn_idx, insn_idx); + else + verbose("%d: safe\n", insn_idx); + } + goto process_bpf_exit; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + if (BPF_SIZE(insn->code) != BPF_W) { + insn_idx++; + continue; + } + + if (insn->imm == 0) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. 
+ */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + 
insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convention is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + +process_bpf_exit: + /* this path is finished (bpf_exit or pruned as safe): + * continue from the insn index returned by pop_stack(); + * a negative value leaves the loop and do_check() returns 0 + */ + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + /* extra increment on top of the common one below: + * this insn takes two slots (see the insn[1] checks + * in replace_map_fd_with_map_ptr()) + */ + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + /* the second slot must exist and have all fields + * other than imm zeroed + */ + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + 
verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. 
+ */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + +/* fix up relative jump offsets of 'prog' after the insn at index 'pos' + * was expanded into 1 + delta insns. 'prog' is the already expanded + * program: insns that used to follow 'pos' now sit delta slots further + * down. BPF_CALL and BPF_EXIT carry no jump offset and are skipped. + */ +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* only jumps crossing the expanded region need fixing: + * - forward jump located before 'pos' whose target is past + * 'pos': the target moved down by delta + * - backward jump located after the inserted insns whose + * pre-expansion target was at index <= pos: the jump moved + * down by delta but its target did not. Its stale target + * i + off + 1 is therefore delta too far, so both bounds + * are compared against pos + delta. Insns inside + * (pos, pos + delta] are the freshly inserted ones and are + * left alone. + */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } 
+ + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + /* shift the insns that followed the load down by cnt - 1 slots. + * The regions overlap, hence memmove. Source and destination must + * index the same bpf_insn array (insnsi), so that the pointer + * arithmetic is scaled by sizeof(struct bpf_insn) on both sides. + */ + memmove(new_prog->insnsi + i + cnt, new_prog->insnsi + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + +/* free every state list hanging off env->explored_states[] and then the + * array itself. Each list is terminated by STATE_LIST_MARK; NULL entries + * have no list. + */ +static void free_states(struct verifier_env *env) +{ + struct verifier_state_list *sl, *sln; + int i; + + if (!env->explored_states) + return; + + for (i = 0; i < env->prog->len; i++) { + sl = env->explored_states[i]; + + if (sl) + while (sl != STATE_LIST_MARK) { + sln = sl->next; + kfree(sl); + sl = sln; + } + } + + kfree(env->explored_states); +} + +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) +{ + char __user *log_ubuf = NULL; + struct verifier_env *env; + int ret = -EINVAL; + + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->prog = *prog; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + 
if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct verifier_state_list *), + GFP_USER); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!env->prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(env->prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + env->prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + if (!env->prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. 
+ */ + release_maps(env); + *prog = env->prog; + kfree(env); + mutex_unlock(&bpf_verifier_lock); + return ret; +} diff --git a/kernel/capability.c b/kernel/capability.c index a5cf13c018ce..45432b54d5c6 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) } __setup("no_file_caps", file_caps_disable); +#ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * @@ -258,6 +259,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) i++; } + effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + new = prepare_creds(); if (!new) return -ENOMEM; @@ -382,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); +#endif /* CONFIG_MULTIUSER */ + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check @@ -408,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, EXPORT_SYMBOL(file_ns_capable); /** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. 
- * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - -/** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70776aec2562..469dd547770c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. + */ +static bool cgroup_legacy_files_on_dfl; + /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG - | (1 << debug_cgrp_id) -#endif - ; +static unsigned int cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -180,18 +182,18 @@ static u64 css_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; -static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct 
idr *idr, void *ptr, int start, int end, @@ -275,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Use ->child_subsys_mask. + */ while (cgroup_parent(cgrp) && !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); @@ -282,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). 
+ */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -327,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) return false; } -static int cgroup_is_releasable(const struct cgroup *cgrp) -{ - const int bits = - (1 << CGRP_RELEASABLE) | - (1 << CGRP_NOTIFY_ON_RELEASE); - return (cgrp->flags & bits) == bits; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -390,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/* the list of cgroups eligible for automatic release. 
Protected by - * release_list_lock */ -static LIST_HEAD(release_list); -static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); -static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); /* @@ -494,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset, bool taskexit) +static void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -520,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links)) { cgroup_update_populated(cgrp, false); - if (notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } + check_for_release(cgrp); } kfree(link); @@ -533,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset, bool taskexit) +static void put_css_set(struct css_set *cset) { /* * Ensure that the refcount doesn't hit zero while any readers @@ -544,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit) return; down_write(&css_set_rwsem); - put_css_set_locked(cset, taskexit); + put_css_set_locked(cset); up_write(&css_set_rwsem); } @@ -965,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't - * (usually) take cgroup_mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cgroup_exit(), - * when a task in a notify_on_release cgroup exits. 
Then cgroup_mutex - * is taken, and if the cgroup count is zero, a usermode call made - * to the release agent with the name of the cgroup (path relative to - * the root of cgroup file system) as the argument. - * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at @@ -1031,12 +1045,81 @@ static void cgroup_get(struct cgroup *cgrp) css_get(&cgrp->self); } +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } /** + * cgroup_calc_child_subsys_mask - calculate child_subsys_mask + * @cgrp: the target cgroup + * @subtree_control: the new subtree_control mask to consider + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function calculates which subsystems need to be enabled if + * @subtree_control is to be applied to @cgrp. The returned mask is always + * a superset of @subtree_control and follows the usual hierarchy rules. + */ +static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned int subtree_control) +{ + struct cgroup *parent = cgroup_parent(cgrp); + unsigned int cur_ss_mask = subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + if (!cgroup_on_dfl(cgrp)) + return cur_ss_mask; + + while (true) { + unsigned int new_ss_mask = cur_ss_mask; + + for_each_subsys(ss, ssid) + if (cur_ss_mask & (1 << ssid)) + new_ss_mask |= ss->depends_on; + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. 
+ */ + if (parent) + new_ss_mask &= parent->child_subsys_mask; + else + new_ss_mask &= cgrp->root->subsys_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + return cur_ss_mask; +} + +/** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * Update @cgrp->child_subsys_mask according to the current + * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + cgrp->child_subsys_mask = + cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); +} + +/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1091,7 +1174,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ - cgroup_get(cgrp); + if (!cgroup_tryget(cgrp)) + return NULL; kernfs_break_active_protection(kn); mutex_lock(&cgroup_mutex); @@ -1208,12 +1292,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + src_root->cgrp.subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(&src_root->cgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) { + dst_root->cgrp.subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + } if (ss->bind) ss->bind(css); @@ -1233,8 +1320,6 @@ static int cgroup_show_options(struct seq_file *seq, for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) - seq_puts(seq, ",sane_behavior"); if (root->flags & 
CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1353,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) bool all_ss = false, one_ss = false; unsigned int mask = -1U; struct cgroup_subsys *ss; + int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS @@ -1277,6 +1363,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + if (!*token) return -EINVAL; if (!strcmp(token, "none")) { @@ -1361,37 +1449,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* Consistency checks */ - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || - opts->cpuset_clone_children || opts->release_agent || - opts->name) { - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + if (nr_opts != 1) { + pr_err("sane_behavior: no other mount options allowed\n"); return -EINVAL; } - } else { - /* - * If the 'all' option was specified select all the - * subsystems, otherwise if 'none', 'name=' and a subsystem - * name options were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So - * all empty hierarchies must have a name). 
- */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + return 0; } /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. @@ -1399,7 +1483,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; - /* Can't specify "none" and some subsystems */ if (opts->subsys_mask && opts->none) return -EINVAL; @@ -1414,8 +1497,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) struct cgroup_sb_opts opts; unsigned int added_mask, removed_mask; - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: remount is not allowed\n"); + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); return -EINVAL; } @@ -1434,11 +1517,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || + if ((opts.flags ^ root->flags) || (opts.name && strcmp(opts.name, root->name))) { pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", - root->flags & CGRP_ROOT_OPTION_MASK, root->name); + opts.flags, opts.name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; 
} @@ -1527,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; @@ -1537,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } static void init_cgroup_root(struct cgroup_root *root, @@ -1563,6 +1645,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1573,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -1600,7 +1684,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (root == &cgrp_dfl_root) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(root_cgrp, base_files, true); if (ret) goto destroy_root; @@ -1638,7 +1727,7 @@ destroy_root: exit_root_id: cgroup_exit_root_id(root); cancel_ref: - percpu_ref_cancel_init(&root_cgrp->self.refcnt); + percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1672,7 +1761,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; /* look for a matching existing root */ - if (!opts.subsys_mask && !opts.none && !opts.name) { + if (opts.flags & 
CGRP_ROOT_SANE_BEHAVIOR) { cgrp_dfl_root_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); @@ -1730,15 +1819,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto out_unlock; - } else { - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - } - } + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); /* * We want to reuse @root whose lifetime is governed by its @@ -1827,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) * * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self) || + if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else @@ -1993,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * task. As trading it for new_cset is protected by cgroup_mutex, * we're safe to drop it here; it will be freed under RCU. 
*/ - set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set_locked(old_cset, false); + put_css_set_locked(old_cset); } /** @@ -2015,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); - put_css_set_locked(cset, false); + put_css_set_locked(cset); } up_write(&css_set_rwsem); } @@ -2109,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); - put_css_set(src_cset, false); - put_css_set(dst_cset, false); + put_css_set(src_cset); + put_css_set(dst_cset); continue; } @@ -2119,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (list_empty(&dst_cset->mg_preload_node)) list_add(&dst_cset->mg_preload_node, &csets); else - put_css_set(dst_cset, false); + put_css_set(dst_cset); } list_splice_tail(&csets, preloaded_csets); @@ -2457,9 +2538,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_puts(seq, "0\n"); return 0; } @@ -2496,7 +2575,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); return 0; } @@ -2505,7 +2584,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } @@ -2611,6 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; + unsigned int 
css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2650,50 +2730,27 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { - if (cgrp->child_subsys_mask & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } - /* - * Because css offlining is asynchronous, userland - * might try to re-enable the same controller while - * the previous instance is still around. In such - * cases, wait till it's gone using offline_waitq. - */ - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { - if (!(cgrp->child_subsys_mask & (1 << ssid))) { + if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { - if (child->child_subsys_mask & (1 << ssid)) { + if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } @@ -2707,7 +2764,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2773,122 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Create csses for enables and update child_subsys_mask. This - * changes cgroup_e_css() results which in turn makes the - * subsequent cgroup_update_dfl_csses() associate all tasks in the - * subtree to the updated csses. + * Update subsys masks and calculate what needs to be done. More + * subsystems than specified may need to be enabled or disabled + * depending on subsystem dependencies. + */ + old_sc = cgrp->subtree_control; + old_ss = cgrp->child_subsys_mask; + new_sc = (old_sc | enable) & ~disable; + new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); + + css_enable = ~old_ss & new_ss; + css_disable = old_ss & ~new_ss; + enable |= css_enable; + disable |= css_disable; + + /* + * Because css offlining is asynchronous, userland might try to + * re-enable the same controller while the previous instance is + * still around. In such cases, wait till it's gone using + * offline_waitq. + */ + for_each_subsys(ss, ssid) { + if (!(css_enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); + + if (!cgroup_css(child, ss)) + continue; + + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); + + return restart_syscall(); + } + } + + cgrp->subtree_control = new_sc; + cgrp->child_subsys_mask = new_ss; + + /* + * Create new csses or make the existing ones visible. A css is + * created invisible if it's being implicitly enabled through + * dependency. An invisible css is made visible when the userland + * explicitly enables it. 
*/ for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) continue; cgroup_for_each_live_child(child, cgrp) { - ret = create_css(child, ss); + if (css_enable & (1 << ssid)) + ret = create_css(child, ss, + cgrp->subtree_control & (1 << ssid)); + else + ret = cgroup_populate_dir(child, 1 << ssid); if (ret) goto err_undo_css; } } - cgrp->child_subsys_mask |= enable; - cgrp->child_subsys_mask &= ~disable; - + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ ret = cgroup_update_dfl_csses(cgrp); if (ret) goto err_undo_css; - /* all tasks are now migrated away from the old csses, kill them */ + /* + * All tasks are migrated out of disabled csses. Kill or hide + * them. A css is hidden when the userland requests it to be + * disabled while other subsystems are still depending on it. The + * css must not actively control resources and be in the vanilla + * state if it's made visible again later. Controllers which may + * be depended upon should provide ->css_reset() for this purpose. + */ for_each_subsys(ss, ssid) { if (!(disable & (1 << ssid))) continue; - cgroup_for_each_live_child(child, cgrp) - kill_css(cgroup_css(child, ss)); + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + + if (css_disable & (1 << ssid)) { + kill_css(css); + } else { + cgroup_clear_dir(child, 1 << ssid); + if (ss->css_reset) + ss->css_reset(css); + } + } + } + + /* + * The effective csses of all the descendants (excluding @cgrp) may + * have changed. Subsystems can optionally subscribe to this event + * by implementing ->css_e_css_changed() which is invoked if any of + * the effective csses seen from the css's cgroup may have changed. 
+ */ + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); + struct cgroup_subsys_state *css; + + if (!ss->css_e_css_changed || !this_css) + continue; + + css_for_each_descendant_pre(css, this_css) + if (css != this_css) + ss->css_e_css_changed(css); } kernfs_activate(cgrp->kn); @@ -2755,8 +2898,8 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->child_subsys_mask &= ~enable; - cgrp->child_subsys_mask |= disable; + cgrp->subtree_control = old_sc; + cgrp->child_subsys_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -2764,8 +2907,14 @@ err_undo_css: cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); - if (css) + + if (!css) + continue; + + if (css_enable & (1 << ssid)) kill_css(css); + else + cgroup_clear_dir(child, 1 << ssid); } } goto out_unlock; @@ -2878,9 +3027,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * This isn't a proper migration and its usefulness is very - * limited. Disallow if sane_behavior. + * limited. Disallow on the default hierarchy. */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return -EPERM; /* @@ -2928,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, - NULL, false, key); + NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -2964,9 +3113,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; - if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; @@ -3024,6 +3173,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; + + /* revert flags set by cgroup core while adding @cfts */ + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); } } @@ -3109,7 +3261,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -3135,6 +3287,49 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. + */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_ONLY_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. 
+ */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + /* + * If legacy_flies_on_dfl, we want to show the legacy files on the + * dfl hierarchy but iff the target subsystem hasn't been updated + * for the dfl hierarchy yet. + */ + if (!cgroup_legacy_files_on_dfl || + ss->dfl_cftypes != ss->legacy_cftypes) { + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + } + + return cgroup_add_cftypes(ss, cfts); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3611,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* @@ -3699,8 +3891,9 @@ after: * * All this extra complexity was caused by the original implementation * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if - * sane_behavior so that no such expectation exists in the new interface. + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. * * Scrambling is done by swapping every two consecutive bits, which is * non-identity one-to-one mapping which disturbs sort order sufficiently. 
@@ -3715,7 +3908,7 @@ static pid_t pid_fry(pid_t pid) static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return pid_fry(pid); else return pid; @@ -3818,7 +4011,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) sort(array, length, sizeof(pid_t), fried_cmppid, NULL); else sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3827,7 +4020,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l = cgroup_pidlist_find_create(cgrp, type); if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } @@ -4004,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) static int cgroup_pidlist_show(struct seq_file *s, void *v) { - return seq_printf(s, "%d\n", *(int *)v); + seq_printf(s, "%d\n", *(int *)v); + + return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, @@ -4016,7 +4210,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else @@ -4040,7 +4233,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, return 0; } -static struct cftype cgroup_base_files[] = { +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -4052,46 +4246,52 @@ static struct cftype cgroup_base_files[] = { .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.clone_children", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_clone_children_read, - .write_u64 = 
cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_root_controllers_show, }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_populated_show, }, + { } /* terminate */ +}; - /* - * Historical crazy stuff. These don't have "cgroup." prefix and - * don't exist if sane_behavior. If you're depending on these, be - * prepared to be burned. - */ +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, { .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, @@ -4102,13 +4302,12 @@ static struct cftype cgroup_base_files[] = { }, { .name = "notify_on_release", - .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = 
"release_agent", - .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, @@ -4173,19 +4372,26 @@ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - if (css->ss) { + percpu_ref_exit(&css->refcnt); + + if (ss) { /* css free path */ + int id = css->id; + if (css->parent) css_put(css->parent); - css->ss->css_free(css); + ss->css_free(css); + cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); + cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { /* @@ -4231,11 +4437,22 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ - cgroup_idr_remove(&ss->css_idr, css->id); + cgroup_idr_replace(&ss->css_idr, NULL, css->id); + if (ss->css_released) + ss->css_released(css); } else { /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. + */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } mutex_unlock(&cgroup_mutex); @@ -4314,12 +4531,14 @@ static void offline_css(struct cgroup_subsys_state *css) * create_css - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css + * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. 
On success, the new - * css is online and installed in @cgrp with all interface files created. - * Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp with all interface files created if + * @visible. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4334,7 +4553,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4343,9 +4562,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_percpu_ref; css->id = err; - err = cgroup_populate_dir(cgrp, 1 << ss->id); - if (err) - goto err_free_id; + if (visible) { + err = cgroup_populate_dir(cgrp, 1 << ss->id); + if (err) + goto err_free_id; + } /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); @@ -4372,7 +4593,7 @@ err_list_del: err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: - percpu_ref_cancel_init(&css->refcnt); + percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return err; @@ -4385,8 +4606,14 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 
+ */ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; @@ -4399,7 +4626,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; @@ -4455,14 +4682,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (cgroup_on_dfl(cgrp)) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(cgrp, base_files, true); if (ret) goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss); + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); if (ret) goto out_destroy; } @@ -4470,10 +4703,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * On the default hierarchy, a child doesn't automatically inherit - * child_subsys_mask from the parent. Each is configured manually. + * subtree_control from the parent. Each is configured manually. 
*/ - if (!cgroup_on_dfl(cgrp)) - cgrp->child_subsys_mask = parent->child_subsys_mask; + if (!cgroup_on_dfl(cgrp)) { + cgrp->subtree_control = parent->subtree_control; + cgroup_refresh_child_subsys_mask(cgrp); + } kernfs_activate(kn); @@ -4483,7 +4718,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: - percpu_ref_cancel_init(&cgrp->self.refcnt); + percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4621,19 +4856,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* CSS_ONLINE is clear, remove from ->release_list for the last time */ - raw_spin_lock(&release_list_lock); - if (!list_empty(&cgrp->release_list)) - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); check_for_release(cgroup_parent(cgrp)); /* put the base reference */ @@ -4650,23 +4878,10 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgrp = cgroup_kn_lock_live(kn); if (!cgrp) return 0; - cgroup_get(cgrp); /* for @kn->priv clearing */ ret = cgroup_destroy_locked(cgrp); cgroup_kn_unlock(kn); - - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. 
- */ - if (!ret) - RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - - cgroup_put(cgrp); return ret; } @@ -4736,8 +4951,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = - { .flags = CGRP_ROOT_SANE_BEHAVIOR }; + static struct cgroup_sb_opts __initdata opts; struct cgroup_subsys *ss; int i; @@ -4775,7 +4989,8 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); mutex_lock(&cgroup_mutex); @@ -4807,10 +5022,26 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (!ss->disabled) { - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + if (ss->disabled) + continue; + + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + + if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) + ss->dfl_cftypes = ss->legacy_cftypes; + + if (!ss->dfl_cftypes) + cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + + if (ss->dfl_cftypes == ss->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); + } else { + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4857,12 +5088,9 @@ core_initcall(cgroup_wq_init); * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. 
*/ - -/* TODO: Use a proper seq_file iterator */ -int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *path; int retval; struct cgroup_root *root; @@ -4872,14 +5100,6 @@ int proc_cgroup_show(struct seq_file *m, void *v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = 0; - mutex_lock(&cgroup_mutex); down_read(&css_set_rwsem); @@ -4909,11 +5129,10 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_putc(m, '\n'); } + retval = 0; out_unlock: up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - put_task_struct(tsk); -out_free: kfree(buf); out: return retval; @@ -4984,7 +5203,7 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * This may race against cgroup_enable_task_cg_links(). As that + * This may race against cgroup_enable_task_cg_lists(). As that * function sets use_task_css_set_links before grabbing * tasklist_lock and we just went through tasklist_lock to add * @child, it's guaranteed that either we see the set @@ -4999,7 +5218,7 @@ void cgroup_post_fork(struct task_struct *child) * when implementing operations which need to migrate all tasks of * a cgroup to another. * - * Note that if we lose to cgroup_enable_task_cg_links(), @child + * Note that if we lose to cgroup_enable_task_cg_lists(), @child * will remain in init_css_set. This is safe because all tasks are * in the init_css_set before cg_links is enabled and there's no * operation which transfers all tasks out of init_css_set. 
@@ -5083,30 +5302,14 @@ void cgroup_exit(struct task_struct *tsk) } if (put_cset) - put_css_set(cset, true); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && - !css_has_online_children(&cgrp->self)) { - /* - * Control Group is currently removeable. If it's not - * already queued for a userspace notification, queue - * it now - */ - int need_schedule_work = 0; - - raw_spin_lock(&release_list_lock); - if (!cgroup_is_dead(cgrp) && - list_empty(&cgrp->release_list)) { - list_add(&cgrp->release_list, &release_list); - need_schedule_work = 1; - } - raw_spin_unlock(&release_list_lock); - if (need_schedule_work) - schedule_work(&release_agent_work); - } + if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); } /* @@ -5134,52 +5337,39 @@ static void check_for_release(struct cgroup *cgrp) */ static void cgroup_release_agent(struct work_struct *work) { - BUG_ON(work != &release_agent_work); + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL, *path; + char *argv[3], *envp[3]; + mutex_lock(&cgroup_mutex); - raw_spin_lock(&release_list_lock); - while (!list_empty(&release_list)) { - char *argv[3], *envp[3]; - int i; - char *pathbuf = NULL, *agentbuf = NULL, *path; - struct cgroup *cgrp = list_entry(release_list.next, - struct cgroup, - release_list); - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathbuf) - goto continue_free; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); - if (!path) - goto continue_free; - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!agentbuf) - goto continue_free; - - i = 0; - argv[i++] = agentbuf; - argv[i++] = path; - argv[i] = NULL; - - i = 0; - /* minimal command environment 
*/ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - /* Drop the lock while we invoke the usermode helper, - * since the exec could involve hitting disk and hence - * be a slow process */ - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - mutex_lock(&cgroup_mutex); - continue_free: - kfree(pathbuf); - kfree(agentbuf); - raw_spin_lock(&release_list_lock); - } - raw_spin_unlock(&release_list_lock); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) + goto out; + + argv[0] = agentbuf; + argv[1] = path; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); } static int __init cgroup_disable(char *str) @@ -5205,6 +5395,14 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_set_legacy_files_on_dfl(char *str) +{ + printk("cgroup: using legacy files on the default hierarchy\n"); + cgroup_legacy_files_on_dfl = true; + return 0; +} +__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5231,7 +5429,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See cgroup_rmdir() for details. + * protected for this access. See css_release_work_fn() for details. 
*/ cgrp = rcu_dereference(kn->priv); if (cgrp) @@ -5255,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&ss->css_idr, id); + return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } #ifdef CONFIG_CGROUP_DEBUG @@ -5359,7 +5557,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); + return (!cgroup_has_tasks(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); } static struct cftype debug_files[] = { @@ -5399,6 +5598,6 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .base_cftypes = debug_files, + .legacy_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a79e40f9d700..92b98cc0ee76 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = { .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, - .base_cftypes = files, + .legacy_cftypes = files, }; diff --git a/kernel/compat.c b/kernel/compat.c index 633394f442f8..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); - if (ret) { + if (ret == -ERESTART_RESTARTBLOCK) { rmtp = restart->nanosleep.compat_rmtp; if (rmtp && compat_put_timespec(&rmt, rmtp)) @@ -256,9 +256,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); - if (ret) { - struct restart_block *restart - = &current_thread_info()->restart_block;
+ /* + * hrtimer_nanosleep() can only return 0 or + * -ERESTART_RESTARTBLOCK here because: + * + * - we call it with HRTIMER_MODE_REL and therefor exclude the + * -ERESTARTNOHAND return path. + * + * - we supply the rmtp argument from the task stack (due to + * the necessary compat conversion. So the update cannot + * fail, which excludes the -EFAULT return path as well. If + * it fails nevertheless we have a bigger problem and wont + * reach this place anymore. + * + * - if the return value is 0, we do not have to update rmtp + * because there is no remaining time. + * + * We check for -ERESTART_RESTARTBLOCK nevertheless if the + * core implementation decides to return random nonsense. + */ + if (ret == -ERESTART_RESTARTBLOCK) { + struct restart_block *restart = &current->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -266,7 +284,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } - return ret; } @@ -842,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 000000000000..c2de56ab0fce --- /dev/null +++ b/kernel/configs/tiny.config @@ -0,0 +1,4 @@ +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_KERNEL_XZ=y +CONFIG_OPTIMIZE_INLINING=y +CONFIG_SLOB=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..72d59a1a6eb6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) } /** - * context_tracking_user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode.
+ * context_tracking_enter - Inform the context tracking that the CPU is going + * enter user or guest space mode. * * This function must be called right before we switch from the kernel - * to userspace, when it's guaranteed the remaining kernel instructions - * to execute won't use any RCU read side critical section because this - * function sets RCU in extended quiescent state. + * to user or guest space, when it's guaranteed the remaining kernel + * instructions to execute won't use any RCU read side critical section + * because this function sets RCU in extended quiescent state. */ -void context_tracking_user_enter(void) +void context_tracking_enter(enum ctx_state state) { unsigned long flags; @@ -75,9 +75,8 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { - trace_user_enter(0); /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -85,7 +84,10 @@ void context_tracking_user_enter(void) * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency * on the tick. */ - vtime_user_enter(current); + if (state == CONTEXT_USER) { + trace_user_enter(0); + vtime_user_enter(current); + } rcu_user_enter(); } /* @@ -101,64 +103,32 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. 
*/ - __this_cpu_write(context_tracking.state, IN_USER); + __this_cpu_write(context_tracking.state, state); } local_irq_restore(flags); } -NOKPROBE_SYMBOL(context_tracking_user_enter); +NOKPROBE_SYMBOL(context_tracking_enter); +EXPORT_SYMBOL_GPL(context_tracking_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) +void context_tracking_user_enter(void) { - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); + context_tracking_enter(CONTEXT_USER); } -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ +NOKPROBE_SYMBOL(context_tracking_user_enter); /** - * context_tracking_user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. 
* - * This function must be called after we entered the kernel from userspace - * before any use of RCU read side critical section. This potentially include - * any high level kernel code like syscalls, exceptions, signal handling, etc... + * This function must be called after we entered the kernel from user or + * guest space before any use of RCU read side critical section. This + * potentially include any high level kernel code like syscalls, exceptions, + * signal handling, etc... * * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_user_exit(void) +void context_tracking_exit(enum ctx_state state) { unsigned long flags; @@ -169,20 +139,29 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == IN_USER) { + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform * RCU core about that (ie: we may need the tick again). 
*/ rcu_user_exit(); - vtime_user_exit(current); - trace_user_exit(0); + if (state == CONTEXT_USER) { + vtime_user_exit(current); + trace_user_exit(0); + } } - __this_cpu_write(context_tracking.state, IN_KERNEL); + __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_exit); +EXPORT_SYMBOL_GPL(context_tracking_exit); + +void context_tracking_user_exit(void) +{ + context_tracking_exit(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_exit); /** diff --git a/kernel/cpu.c b/kernel/cpu.c index a343bde710b1..94bbe4695232 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include <linux/gfp.h> #include <linux/suspend.h> #include <linux/lockdep.h> +#include <linux/tick.h> #include <trace/events/power.h> #include "smpboot.h" @@ -58,20 +59,23 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ + /* wait queue to wake up the active_writer */ + wait_queue_head_t wq; + /* verifies that no writer will get active while readers are active */ + struct mutex lock; /* * Also blocks the new readers during * an ongoing cpu hotplug operation. 
*/ - int refcount; + atomic_t refcount; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } cpu_hotplug = { .active_writer = NULL, + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), - .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif @@ -79,9 +83,12 @@ static struct { /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire_tryread() \ + lock_map_acquire_tryread(&cpu_hotplug.dep_map) #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); @@ -89,24 +96,38 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); - } EXPORT_SYMBOL_GPL(get_online_cpus); +bool try_get_online_cpus(void) +{ + if (cpu_hotplug.active_writer == current) + return true; + if (!mutex_trylock(&cpu_hotplug.lock)) + return false; + cpuhp_lock_acquire_tryread(); + atomic_inc(&cpu_hotplug.refcount); + mutex_unlock(&cpu_hotplug.lock); + return true; +} +EXPORT_SYMBOL_GPL(try_get_online_cpus); + void put_online_cpus(void) { + int refcount; + if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); - if (WARN_ON(!cpu_hotplug.refcount)) - cpu_hotplug.refcount++; /* try to fix things up */ + refcount = atomic_dec_return(&cpu_hotplug.refcount); + if (WARN_ON(refcount < 0)) /* try to fix things up */ + atomic_inc(&cpu_hotplug.refcount); + + if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) + wake_up(&cpu_hotplug.wq); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); 
cpuhp_lock_release(); } @@ -136,17 +157,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); */ void cpu_hotplug_begin(void) { - cpu_hotplug.active_writer = current; + DEFINE_WAIT(wait); + cpu_hotplug.active_writer = current; cpuhp_lock_acquire(); + for (;;) { mutex_lock(&cpu_hotplug.lock); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); + if (likely(!atomic_read(&cpu_hotplug.refcount))) + break; mutex_unlock(&cpu_hotplug.lock); schedule(); } + finish_wait(&cpu_hotplug.wq, &wait); } void cpu_hotplug_done(void) @@ -274,21 +298,28 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int cpu) +static inline void check_for_tasks(int dead_cpu) { - struct task_struct *p; - cputime_t utime, stime; + struct task_struct *g, *p; - write_lock_irq(&tasklist_lock); - for_each_process(p) { - task_cputime(p, &utime, &stime); - if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (utime || stime)) - pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); - } - write_unlock_irq(&tasklist_lock); + read_lock_irq(&tasklist_lock); + do_each_thread(g, p) { + if (!p->on_rq) + continue; + /* + * We do the check with unlocked task_rq(p)->lock. + * Order the reading to do not warn about a task, + * which was running on this cpu in the past, and + * it's just been woken on another cpu. 
+ */ + rmb(); + if (task_cpu(p) != dead_cpu) + continue; + + pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", + p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); + } while_each_thread(g, p); + read_unlock_irq(&tasklist_lock); } struct take_cpu_down_param { @@ -308,6 +339,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Give up timekeeping duties */ + tick_handover_do_timer(); /* Park the stopper thread */ kthread_park(current); return 0; @@ -378,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!idle_cpu(cpu)) + while (!per_cpu(cpu_dead_idle, cpu)) cpu_relax(); + smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ + per_cpu(cpu_dead_idle, cpu) = false; + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ + tick_cleanup_dead_cpu(cpu); cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); @@ -416,6 +453,37 @@ out: EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ +/* + * Unpark per-CPU smpboot kthreads at CPU-online time. 
+ */ +static int smpboot_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_ONLINE: + smpboot_unpark_threads(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block smpboot_thread_notifier = { + .notifier_call = smpboot_thread_call, + .priority = CPU_PRI_SMPBOOT, +}; + +void __cpuinit smpboot_thread_init(void) +{ + register_cpu_notifier(&smpboot_thread_notifier); +} + /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { @@ -455,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - /* Wake the per cpu threads */ - smpboot_unpark_threads(cpu); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a4164720a..ee14e3a35a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierachy: + * + * The user-configured masks are always the same with effective masks. 
+ */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; /* * This is old Memory Nodes tasks took on. @@ -222,34 +248,34 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global mutexes guarding cpuset structures - cpuset_mutex - * and callback_mutex. The latter may nest inside the former. We also - * require taking task_lock() when dereferencing a task's cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this + * comment. * - * A task must hold both mutexes to modify cpusets. If a task holds + * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_mutex and be able to + * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_mutex to query cpusets. - * Once it is ready to make the changes, it takes callback_mutex, blocking + * callback routines can briefly acquire callback_lock to query cpusets. + * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. 
* * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex + * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_mutex, then it has read-only + * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_mutex across + * The cpuset_common_file_read() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -258,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. @@ -303,13 +329,13 @@ static struct file_system_type cpuset_fs_type = { * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) cs = parent_cs(cs); - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); + cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } /* @@ -321,31 +347,32 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. 
*/ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) + while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); + nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cpuset_mutex held + * Call with callback_lock or cpuset_mutex held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) { if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; + task_set_spread_page(tsk); else - tsk->flags &= ~PF_SPREAD_PAGE; + task_clear_spread_page(tsk); + if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; + task_set_spread_slab(tsk); else - tsk->flags &= ~PF_SPREAD_SLAB; + task_clear_spread_slab(tsk); } /* @@ -376,13 +403,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { - kfree(trial); - return NULL; - } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) + goto free_cpus; + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; + +free_cpus: + free_cpumask_var(trial->cpus_allowed); +free_cs: + kfree(trial); + return NULL; } /** @@ -391,6 +425,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->effective_cpus); free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -436,9 +471,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* We must be a subset of our parent cpuset */ + /* On legacy 
hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!is_cpuset_subset(trial, par)) + if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) goto out; /* @@ -471,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) goto out; } + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. + */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + ret = 0; out: rcu_read_unlock(); @@ -480,11 +525,11 @@ out: #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void @@ -503,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) - continue; - /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -580,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -589,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -601,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.cpus_allowed); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -624,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -705,7 +754,8 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpumask_or(dp, dp, b->cpus_allowed); + cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -718,6 +768,7 @@ restart: BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* @@ -757,7 +808,7 @@ static void rebuild_sched_domains_locked(void) * passing doms with offlined cpu to partition_sched_domains(). * Anyways, hotplug work item will rebuild sched domains. 
*/ - if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) goto out; /* Generate domain masks and attrs */ @@ -781,45 +832,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - * if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ - while (cpumask_empty(cs->cpus_allowed)) - cs = parent_cs(cs); - return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - * if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ - while (nodes_empty(cs->mems_allowed)) - cs = parent_cs(cs); - return cs; -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +842,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) */ static void update_tasks_cpumask(struct cpuset *cs) { - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); } /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated. * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed. 
* * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; + bool need_rebuild_sched_domains = false; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs. + */ + if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent->effective_cpus); + + /* Skip the whole subtree if the cpumask remains the same. */ + if (cpumask_equal(new_cpus, cp->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + spin_lock_irq(&callback_lock); + cpumask_copy(cp->effective_cpus, new_cpus); + spin_unlock_irq(&callback_lock); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); + update_tasks_cpumask(cp); + /* + * If the effective cpumask of any non-empty cpuset is changed, + * we need to rebuild sched domains. 
+ */ + if (!cpumask_empty(cp->cpus_allowed) && + is_sched_load_balance(cp)) + need_rebuild_sched_domains = true; + rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); + + if (need_rebuild_sched_domains) + rebuild_sched_domains_locked(); } /** @@ -889,7 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -908,7 +946,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, + top_cpuset.cpus_allowed)) return -EINVAL; } @@ -920,16 +959,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - is_load_balanced = is_sched_load_balance(trialcs); - - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); - update_tasks_cpumask_hier(cs, true); - - if (is_load_balanced) - rebuild_sched_domains_locked(); + /* use trialcs->cpus_allowed as a temp variable */ + update_cpumasks_hier(cs, trialcs->cpus_allowed); return 0; } @@ -951,15 +986,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; - struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &tsk->mems_allowed); + guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); rcu_read_unlock(); } @@ -1028,13 +1061,12 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cpuset *mems_cs = 
effective_nodemask_cpuset(cs); struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - guarantee_online_mems(mems_cs, &newmems); + guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1109,52 @@ static void update_tasks_nodemask(struct cpuset *cs) } /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hierarchy, effective_mems will be the same as mems_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some MEMs.
+ */ + if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) + *new_mems = parent->effective_mems; + + /* Skip the whole subtree if the nodemask remains the same. */ + if (nodes_equal(*new_mems, cp->effective_mems)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + spin_lock_irq(&callback_lock); + cp->effective_mems = *new_mems; + spin_unlock_irq(&callback_lock); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); + update_tasks_nodemask(cp); rcu_read_lock(); @@ -1123,7 +1171,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. 
@@ -1156,8 +1204,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_MEMORY])) { - retval = -EINVAL; + top_cpuset.mems_allowed)) { + retval = -EINVAL; goto done; } } @@ -1170,11 +1218,12 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); - update_tasks_nodemask_hier(cs, true); + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } @@ -1262,9 +1311,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); @@ -1389,27 +1438,15 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, mutex_lock(&cpuset_mutex); - /* - * We allow to move tasks into an empty cpuset if sane_behavior - * flag is set. - */ + /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_sane_behavior(css->cgroup) && + if (!cgroup_on_dfl(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; cgroup_taskset_for_each(task, tset) { - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads.
This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - ret = -EINVAL; - if (task->flags & PF_NO_SETAFFINITY) + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) @@ -1452,8 +1489,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *leader = cgroup_taskset_first(tset); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1461,9 +1496,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cpus_cs, cpus_attach); + guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, tset) { /* @@ -1480,11 +1515,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_to = cs->mems_allowed; + cpuset_attach_nodemask_to = cs->effective_mems; mm = get_task_mm(leader); if (mm) { - struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* @@ -1495,7 +1528,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * mm from. 
*/ if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); } mmput(mm); @@ -1516,6 +1549,8 @@ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -1678,35 +1713,28 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; - ssize_t count; - char *buf, *s; int ret = 0; - count = seq_get_buf(sf, &buf); - s = buf; - - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: - s += cpulist_scnprintf(s, count, cs->cpus_allowed); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); break; case FILE_MEMLIST: - s += nodelist_scnprintf(s, count, cs->mems_allowed); + seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); + break; + case FILE_EFFECTIVE_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); + break; + case FILE_EFFECTIVE_MEMLIST: + seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; default: ret = -EINVAL; - goto out_unlock; } - if (s < buf + count - 1) { - *s++ = '\n'; - seq_commit(sf, s - buf); - } else { - seq_commit(sf, -1); - } -out_unlock: - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); return ret; } @@ -1779,6 +1807,18 @@ static struct cftype files[] = { }, { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, @@ -1869,18 +1909,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return 
ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) + goto free_cpus; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); + cpumask_clear(cs->effective_cpus); + nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; + +free_cpus: + free_cpumask_var(cs->cpus_allowed); +free_cs: + kfree(cs); + return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1951,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); + spin_lock_irq(&callback_lock); + if (cgroup_on_dfl(cs->css.cgroup)) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + } + spin_unlock_irq(&callback_lock); + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1928,10 +1983,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - mutex_unlock(&callback_mutex); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); return 0; @@ -1962,20 +2019,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); kfree(cs); } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ + mutex_lock(&cpuset_mutex); + spin_lock_irq(&callback_lock); + + if (cgroup_on_dfl(root_css->cgroup)) { + cpumask_copy(top_cpuset.cpus_allowed, 
cpu_possible_mask); + top_cpuset.mems_allowed = node_possible_map; + } else { + cpumask_copy(top_cpuset.cpus_allowed, + top_cpuset.effective_cpus); + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + + spin_unlock_irq(&callback_lock); + mutex_unlock(&cpuset_mutex); +} + struct cgroup_subsys cpuset_cgrp_subsys = { - .css_alloc = cpuset_css_alloc, - .css_online = cpuset_css_online, - .css_offline = cpuset_css_offline, - .css_free = cpuset_css_free, - .can_attach = cpuset_can_attach, - .cancel_attach = cpuset_cancel_attach, - .attach = cpuset_attach, - .base_cftypes = files, - .early_init = 1, + .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, + .css_free = cpuset_css_free, + .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, + .attach = cpuset_attach, + .bind = cpuset_bind, + .legacy_cftypes = files, + .early_init = 1, }; /** @@ -1990,9 +2067,13 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); + cpumask_setall(top_cpuset.effective_cpus); + nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2116,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + spin_unlock_irq(&callback_lock); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor.
+ */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + update_tasks_cpumask(cs); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; + spin_unlock_irq(&callback_lock); + + if (cpus_updated) + update_tasks_cpumask(cs); + if (mems_updated) + update_tasks_nodemask(cs); +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -2045,11 +2186,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void cpuset_hotplug_update_tasks(struct cpuset *cs) { - static cpumask_t off_cpus; - static nodemask_t off_mems; - bool is_empty; - bool sane = cgroup_sane_behavior(cs->css.cgroup); - + static cpumask_t new_cpus; + static nodemask_t new_mems; + bool cpus_updated; + bool mems_updated; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2204,20 @@ retry: goto retry; } - cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); - nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); + 
cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); - /* - * If sane_behavior flag is set, we need to update tasks' cpumask - * for empty cpuset to take on ancestor's cpumask. Otherwise, don't - * call update_tasks_cpumask() if the cpuset becomes empty, as - * the tasks in it will be migrated to an ancestor. - */ - if ((sane && cpumask_empty(cs->cpus_allowed)) || - (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs); - - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' nodemask - * for empty cpuset to take on ancestor's nodemask. Otherwise, don't - * call update_tasks_nodemask() if the cpuset becomes empty, as - * the tasks in it will be migratd to an ancestor. - */ - if ((sane && nodes_empty(cs->mems_allowed)) || - (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs); + cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); + mems_updated = !nodes_equal(new_mems, cs->effective_mems); - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); + if (cgroup_on_dfl(cs->css.cgroup)) + hotplug_update_tasks(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + else + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); mutex_unlock(&cpuset_mutex); - - /* - * If sane_behavior flag is set, we'll keep tasks in empty cpusets. - * - * Otherwise move tasks to the nearest ancestor with execution - * resources. This is full cgroup operation which will - * also call back into cpuset. Should be done outside any lock. 
- */ - if (!sane && is_empty) - remove_tasks_in_empty_cpuset(cs); } /** @@ -2132,6 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; + bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); mutex_lock(&cpuset_mutex); @@ -2139,22 +2249,26 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; - cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); - mutex_unlock(&callback_mutex); + spin_lock_irq(&callback_lock); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); + spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = new_mems; - mutex_unlock(&callback_mutex); + spin_lock_irq(&callback_lock); + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; + spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } @@ -2228,6 +2342,9 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; + cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); + top_cpuset.effective_mems = node_states[N_MEMORY]; + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); } @@ -2244,23 +2361,19 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, 
struct cpumask *pmask) { - struct cpuset *cpus_cs; + unsigned long flags; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - guarantee_online_cpus(cpus_cs, pmask); + guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - struct cpuset *cpus_cs; - rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* @@ -2282,7 +2395,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) */ } -void cpuset_init_current_mems_allowed(void) +void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } @@ -2299,15 +2412,14 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *mems_cs; nodemask_t mask; + unsigned long flags; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &mask); + guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return mask; } @@ -2326,7 +2438,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. 
*/ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) @@ -2337,44 +2449,28 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest - * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE - * flag, yes. + * If we're in interrupt, yes, we can always allocate. If @node is set in + * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this + * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, + * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall - * cpusets, and never sleeps. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed as is marked TIF_MEMDIE. 
* GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The + * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. + * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, @@ -2391,20 +2487,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. - * - * Rule: - * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +int __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? 
*/ + unsigned long flags; - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + if (in_interrupt()) return 1; - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; /* @@ -2420,55 +2511,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return allowed; } -/* - * cpuset_node_allowed_hardwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If the task has been OOM killed and has access to memory reserves as - * specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_node_allowed_softwall() variant, above, - * this variant requires that the node be in the current task's - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. 
- */ -int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page @@ -2544,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } -#define CPUSET_NODELIST_LEN (256) - /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed * @tsk: pointer to task_struct of some task. @@ -2555,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { - /* Statically allocated to prevent using excess stack. */ - static char cpuset_nodelist[CPUSET_NODELIST_LEN]; - static DEFINE_SPINLOCK(cpuset_buffer_lock); struct cgroup *cgrp; - spin_lock(&cpuset_buffer_lock); rcu_read_lock(); cgrp = task_cs(tsk)->css.cgroup; - nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, - tsk->mems_allowed); pr_info("%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%s\n", cpuset_nodelist); + pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); rcu_read_unlock(); - spin_unlock(&cpuset_buffer_lock); } /* @@ -2617,10 +2661,9 @@ void __cpuset_memory_pressure_bump(void) * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. 
*/ -int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *p; struct cgroup_subsys_state *css; int retval; @@ -2630,24 +2673,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); if (!p) - goto out_put_task; + goto out_free; seq_puts(m, p); seq_putc(m, '\n'); retval = 0; -out_put_task: - put_task_struct(tsk); out_free: kfree(buf); out: @@ -2658,10 +2693,8 @@ out: /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_puts(m, "Mems_allowed:\t"); - seq_nodemask(m, &task->mems_allowed); - seq_puts(m, "\n"); - seq_puts(m, "Mems_allowed_list:\t"); - seq_nodemask_list(m, &task->mems_allowed); - seq_puts(m, "\n"); + seq_printf(m, "Mems_allowed:\t%*pb\n", + nodemask_pr_args(&task->mems_allowed)); + seq_printf(m, "Mems_allowed_list:\t%*pbl\n", + nodemask_pr_args(&task->mems_allowed)); } diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index c766ee54c0b1..b64e238b553b 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,6 +18,7 @@ unsigned long saved_max_pfn; * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 
*/ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL_GPL(elfcorehdr_addr); /* * stores the size of elf header of crash image diff --git a/kernel/cred.c b/kernel/cred.c index e0573a43c7df..ec1c07667ec1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,6 +29,9 @@ static struct kmem_cache *cred_jar; +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + /* * The initial credentials for the initial task */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..0874e2edd275 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -27,6 +27,9 @@ * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ + +#define pr_fmt(fmt) "KGDB: " fmt + #include <linux/pid_namespace.h> #include <linux/clocksource.h> #include <linux/serial_core.h> @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) return err; err = kgdb_arch_remove_breakpoint(&tmp); if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); + pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", + addr); return err; } @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", - kgdb_break[i].bpt_addr); + pr_info("BP install failed: %lx\n", + kgdb_break[i].bpt_addr); continue; } @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) continue; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", - kgdb_break[i].bpt_addr); + pr_info("BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) goto setundefined; error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) - 
printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + pr_err("breakpoint remove failed: %lx\n", kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) if (print_wait) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); + pr_crit("waiting... or $3#33 for KDB\n"); #else - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + pr_crit("Waiting for remote debugger\n"); #endif } return 1; @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); dbg_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); + pr_crit("re-enter error: breakpoint removed %lx\n", addr); WARN_ON_ONCE(1); return 1; @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) panic("Recursive entry to debugger"); } - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + pr_crit("re-enter exception: ALL breakpoints killed\n"); #ifdef CONFIG_KGDB_KDB /* Allow kdb to debug itself one level */ return 0; @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int cpu; int trace_on = 0; int online_cpus = num_online_cpus(); + u64 time_left; kgdb_info[ks->cpu].enter_kgdb++; kgdb_info[ks->cpu].exception_state |= exception_state; @@ -595,9 +598,13 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + - atomic_read(&slaves_in_kgdb)) != online_cpus) + time_left = loops_per_jiffy * HZ; + while (kgdb_do_roundup && --time_left && + (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != + online_cpus) cpu_relax(); + if (!time_left) + pr_crit("Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -689,6 +696,14 @@ 
kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); + /* + * Avoid entering the debugger if we were triggered due to an oops + * but panic_timeout indicates the system should automatically + * reboot on panic. We don't want to get stuck waiting for input + * on such systems, especially if its "just" an oops. + */ + if (signo != SIGTRAP && panic_timeout) + return 1; memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); @@ -795,15 +810,15 @@ static struct console kgdbcons = { static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + pr_crit("ERROR: No KGDB I/O module available\n"); return; } if (!kgdb_connected) { #ifdef CONFIG_KGDB_KDB if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); + pr_crit("KGDB or $3#33 for KDB\n"); #else - printk(KERN_CRIT "Entering KGDB\n"); + pr_crit("Entering KGDB\n"); #endif } @@ -821,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self, unsigned long val, void *data) { + /* + * Avoid entering the debugger if we were triggered due to a panic + * We don't want to get stuck waiting for input from user in such case. + * panic_timeout indicates the system should automatically + * reboot on panic. 
+ */ + if (panic_timeout) + return NOTIFY_DONE; + if (dbg_kdb_mode) kdb_printf("PANIC: %s\n", (char *)data); kgdb_breakpoint(); @@ -945,7 +969,7 @@ static void kgdb_initial_breakpoint(void) { kgdb_break_asap = 0; - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + pr_crit("Waiting for connection from remote gdb...\n"); kgdb_breakpoint(); } @@ -964,8 +988,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) if (dbg_io_ops) { spin_unlock(&kgdb_registration_lock); - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); + pr_err("Another I/O driver is already registered with KGDB\n"); return -EBUSY; } @@ -981,8 +1004,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_dbg_io_ops->name); + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); @@ -1017,8 +1039,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 70a504601dc3..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) bp->bph_length = 1; if ((argc + 1) != nextarg) { - if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0) bp->bp_type = BP_ACCESS_WATCHPOINT; - else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) bp->bp_type = BP_WRITE_WATCHPOINT; - else if 
(strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0) bp->bp_type = BP_HARDWARE_BREAKPOINT; else return KDB_ARGCOUNT; @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) bp->bp_free = 1; - kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", - "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", - "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", - "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bc", kdb_bc, "<bpnum>", - "Clear Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("be", kdb_bc, "<bpnum>", - "Enable Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bd", kdb_bc, "<bpnum>", - "Disable Breakpoint", 0, KDB_REPEAT_NONE); - - kdb_register_repeat("ss", kdb_ss, "", - "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_flags("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); + kdb_register_flags("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + kdb_register_flags("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, + KDB_ENABLE_FLOW_CTRL); + + kdb_register_flags("ss", kdb_ss, "", + "Single Step", 1, + KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. 
*/ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); } + /* set CATASTROPHIC if the system contains unresponsive processors */ + for_each_online_cpu(i) + if (!kgdb_info[i].enter_kgdb) + KDB_FLAG_SET(CATASTROPHIC); if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { KDB_STATE_CLEAR(SSBPT); KDB_STATE_CLEAR(DOING_SS); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..fc1ef736253c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -439,7 +439,7 @@ poll_again: * substituted for %d, %x or %o in the prompt. */ -char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); @@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } -int vkdb_printf(const char *fmt, va_list ap) +int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; int linecount; @@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + /* + * This was a interactive search (using '/' at more + * prompt) and it has completed. Clear the flag. + */ + kdb_grepping_flag = 0; /* * at this point the string is a full line and * should be printed, up to the null. @@ -691,19 +697,20 @@ kdb_printit: * Write to all consoles. 
*/ retlen = strlen(kdb_buffer); + cp = (char *) printk_skip_level(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { - gdbstub_msg_write(kdb_buffer, retlen); + gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen; - cp = kdb_buffer; + len = retlen - (cp - kdb_buffer); + cp2 = cp; while (len--) { - dbg_io_ops->write_char(*cp); - cp++; + dbg_io_ops->write_char(*cp2); + cp2++; } } while (c) { - c->write(c, kdb_buffer, retlen); + c->write(c, cp, retlen - (cp - kdb_buffer)); touch_nmi_watchdog(); c = c->next; } @@ -711,7 +718,10 @@ kdb_printit: if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; - printk(KERN_INFO "%s", kdb_buffer); + if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK) + printk("%s", kdb_buffer); + else + pr_info("%s", kdb_buffer); } if (KDB_STATE(PAGER)) { @@ -794,11 +804,23 @@ kdb_printit: kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '/' && !kdb_grepping_flag) { + kdb_printf("\r"); + kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, + kdbgetenv("SEARCHPROMPT") ?: "search> "); + *strchrnul(kdb_grep_string, '\n') = '\0'; + kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; + suspend_grep = 1; /* for this recursion */ } else if (buf1[0] && buf1[0] != '\n') { /* user hit something other than enter */ suspend_grep = 1; /* for this recursion */ - kdb_printf("\nOnly 'q' or 'Q' are processed at more " - "prompt, input ignored\n"); + if (buf1[0] != '/') + kdb_printf( + "\nOnly 'q', 'Q' or '/' are processed at " + "more prompt, input ignored\n"); + else + kdb_printf("\n'/' cannot be used during | " + "grep filtering, input ignored\n"); } else if (kdb_grepping_flag) { /* user hit enter */ suspend_grep = 1; /* for this recursion */ @@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...) 
int r; va_start(ap, fmt); - r = vkdb_printf(fmt, ap); + r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); va_end(ap); return r; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2f7c760305ca..4121345498e0 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -12,6 +12,7 @@ */ #include <linux/ctype.h> +#include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kmsg_dump.h> @@ -23,6 +24,7 @@ #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -42,8 +44,13 @@ #include <linux/slab.h> #include "kdb_private.h" -#define GREP_LEN 256 -char kdb_grep_string[GREP_LEN]; +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "kdb." + +static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; +module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); + +char kdb_grep_string[KDB_GREP_STRLEN]; int kdb_grepping_flag; EXPORT_SYMBOL(kdb_grepping_flag); int kdb_grep_leading; @@ -121,6 +128,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(BADLENGTH, "Invalid length field"), KDBMSG(NOBP, "No Breakpoint exists"), KDBMSG(BADADDR, "Invalid address"), + KDBMSG(NOPERM, "Permission denied"), }; #undef KDBMSG @@ -188,6 +196,26 @@ struct task_struct *kdb_curr_task(int cpu) } /* + * Check whether the flags of the current command and the permissions + * of the kdb console has allow a command to be run. 
+ */ +static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, + bool no_args) +{ + /* permissions comes from userspace so needs massaging slightly */ + permissions &= KDB_ENABLE_MASK; + permissions |= KDB_ENABLE_ALWAYS_SAFE; + + /* some commands change group when launched with no arguments */ + if (no_args) + permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; + + flags |= KDB_ENABLE_ALL; + + return permissions & flags; +} + +/* * kdbgetenv - This function will return the character string value of * an environment variable. * Parameters: @@ -476,6 +504,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, kdb_symtab_t symtab; /* + * If the enable flags prohibit both arbitrary memory access + * and flow control then there are no reasonable grounds to + * provide symbol lookup. + */ + if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, + kdb_cmd_enabled, false)) + return KDB_NOPERM; + + /* * Process arguments which follow the following syntax: * * symbol | numeric-address [+/- numeric-offset] @@ -641,8 +678,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = 0; if (s->usable) - kdb_register(s->name, kdb_exec_defcmd, - s->usage, s->help, 0); + /* macros are always safe because when executed each + * internal command re-enters kdb_parse() and is + * safety checked individually. + */ + kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, + s->help, 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } if (!s->usable) @@ -827,7 +869,7 @@ static void parse_grep(const char *str) len = strlen(cp); if (!len) return; - if (len >= GREP_LEN) { + if (len >= KDB_GREP_STRLEN) { kdb_printf("search string too long\n"); return; } @@ -872,13 +914,12 @@ int kdb_parse(const char *cmdstr) char *cp; char *cpp, quoted; kdbtab_t *tp; - int i, escaped, ignore_errors = 0, check_grep; + int i, escaped, ignore_errors = 0, check_grep = 0; /* * First tokenize the command string. 
*/ cp = (char *)cmdstr; - kdb_grepping_flag = check_grep = 0; if (KDB_FLAG(CMD_INTERRUPT)) { /* Previous command was interrupted, newline must not @@ -1003,25 +1044,22 @@ int kdb_parse(const char *cmdstr) if (i < kdb_max_commands) { int result; + + if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + return KDB_NOPERM; + KDB_STATE_SET(CMD); result = (*tp->cmd_func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - switch (tp->cmd_repeat) { - case KDB_REPEAT_NONE: - argc = 0; - if (argv[0]) - *(argv[0]) = '\0'; - break; - case KDB_REPEAT_NO_ARGS: - argc = 1; - if (argv[1]) - *(argv[1]) = '\0'; - break; - case KDB_REPEAT_WITH_ARGS: - break; - } + + if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + return result; + + argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + if (argv[argc]) + *(argv[argc]) = '\0'; return result; } @@ -1207,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); - kdb_dumpregs(regs); break; case KDB_REASON_SSTEP: case KDB_REASON_BREAK: @@ -1241,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ kdb_nextline = 1; KDB_STATE_CLEAR(SUPPRESS); + kdb_grepping_flag = 0; + /* ensure the old search does not leak into '/' commands */ + kdb_grep_string[0] = '\0'; cmdbuf = cmd_cur; *cmdbuf = '\0'; @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) */ static int kdb_sr(int argc, const char **argv) { + bool check_mask = + !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); + if (argc != 1) return KDB_ARGCOUNT; + kdb_trap_printk++; - __handle_sysrq(*argv[1], false); + __handle_sysrq(*argv[1], check_mask); kdb_trap_printk--; return 0; @@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef 
CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) { state = 'F'; /* cpu is offline */ + } else if (!kgdb_info[i].enter_kgdb) { + state = 'D'; /* cpu is online but unresponsive */ } else { state = ' '; /* cpu is responding to kdb */ if (kdb_task_state_char(KDB_TSK(i)) == 'I') @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) return 0; if (!kt->cmd_name) continue; + if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + continue; if (strlen(kt->cmd_usage) > 20) space = "\n "; kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, @@ -2472,7 +2520,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) static void kdb_sysinfo(struct sysinfo *val) { struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); memset(val, 0, sizeof(*val)); val->uptime = uptime.tv_sec; val->loads[0] = avenrun[0]; @@ -2535,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv) #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" "Buffers: %8lu kB\n", - val.totalram, val.freeram, val.bufferram); + K(val.totalram), K(val.freeram), K(val.bufferram)); return 0; } @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) } /* - * kdb_register_repeat - This function is used to register a kernel + * kdb_register_flags - This function is used to register a kernel * debugger command. 
* Inputs: * cmd Command name @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) * zero for success, one if a duplicate command. */ #define kdb_command_extend 50 /* arbitrary */ -int kdb_register_repeat(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_repeat_t repeat) +int kdb_register_flags(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_cmdflags_t flags) { int i; kdbtab_t *kp; @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, kp->cmd_func = func; kp->cmd_usage = usage; kp->cmd_help = help; - kp->cmd_flags = 0; kp->cmd_minlen = minlen; - kp->cmd_repeat = repeat; + kp->cmd_flags = flags; return 0; } -EXPORT_SYMBOL_GPL(kdb_register_repeat); +EXPORT_SYMBOL_GPL(kdb_register_flags); /* * kdb_register - Compatibility register function for commands that do * not need to specify a repeat state. Equivalent to - * kdb_register_repeat with KDB_REPEAT_NONE. + * kdb_register_flags with flags set to 0. * Inputs: * cmd Command name * func Function to execute the command @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, char *help, short minlen) { - return kdb_register_repeat(cmd, func, usage, help, minlen, - KDB_REPEAT_NONE); + return kdb_register_flags(cmd, func, usage, help, minlen, 0); } EXPORT_SYMBOL_GPL(kdb_register); @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) for_each_kdbcmd(kp, i) kp->cmd_name = NULL; - kdb_register_repeat("md", kdb_md, "<vaddr>", + kdb_register_flags("md", kdb_md, "<vaddr>", "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, - KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", - "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", - "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mds", kdb_md, "<vaddr>", - "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", - "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("go", kdb_go, "[<vaddr>]", - "Continue Execution", 1, KDB_REPEAT_NONE); - kdb_register_repeat("rd", kdb_rd, "", - "Display Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", - "Modify Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ef", kdb_ef, "<vaddr>", - "Display exception frame", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", - "Stack traceback", 1, KDB_REPEAT_NONE); - kdb_register_repeat("btp", kdb_bt, "<pid>", - "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", - "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btt", kdb_bt, "<vaddr>", + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, + KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); + kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, + KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); + kdb_register_flags("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, + KDB_ENABLE_REG_WRITE | 
KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("rd", kdb_rd, "", + "Display Registers", 0, + KDB_ENABLE_REG_READ); + kdb_register_flags("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, + KDB_ENABLE_REG_WRITE); + kdb_register_flags("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, + KDB_ENABLE_MEM_READ); + kdb_register_flags("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, - KDB_REPEAT_NONE); - kdb_register_repeat("env", kdb_env, "", - "Show environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("set", kdb_set, "", - "Set environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("help", kdb_help, "", - "Display Help Message", 1, KDB_REPEAT_NONE); - kdb_register_repeat("?", kdb_help, "", - "Display Help Message", 0, KDB_REPEAT_NONE); - kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", - "Switch to new cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", - "Display active task list", 0, KDB_REPEAT_NONE); - kdb_register_repeat("pid", kdb_pid, "<pidnum>", - "Switch to another task", 0, KDB_REPEAT_NONE); - kdb_register_repeat("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, KDB_REPEAT_NONE); + KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); + kdb_register_flags("env", kdb_env, "", + "Show environment variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("set", kdb_set, "", + "Set environment 
variables", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("help", kdb_help, "", + "Display Help Message", 1, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("?", kdb_help, "", + "Display Help Message", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, + KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); + kdb_register_flags("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, 0); + kdb_register_flags("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, + KDB_ENABLE_INSPECT); + kdb_register_flags("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, + KDB_ENABLE_REBOOT); #if defined(CONFIG_MODULES) - kdb_register_repeat("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, KDB_REPEAT_NONE); + kdb_register_flags("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, + KDB_ENABLE_INSPECT); #endif #if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_repeat("sr", kdb_sr, "<key>", - "Magic SysRq key", 0, KDB_REPEAT_NONE); + kdb_register_flags("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif #if defined(CONFIG_PRINTK) - kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, KDB_REPEAT_NONE); + kdb_register_flags("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, + KDB_ENABLE_ALWAYS_SAFE); #endif if (arch_kgdb_ops.enable_nmi) { - kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", - "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); - } - kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", - "Send a signal to a process", 0, KDB_REPEAT_NONE); - kdb_register_repeat("summary", kdb_summary, "", - "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] 
[<cpu>]", - "Display per_cpu variables", 3, KDB_REPEAT_NONE); - kdb_register_repeat("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, KDB_REPEAT_NONE); + kdb_register_flags("disable_nmi", kdb_disable_nmi, "", + "Disable NMI entry to KDB", 0, + KDB_ENABLE_ALWAYS_SAFE); + } + kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, + KDB_ENABLE_SIGNAL); + kdb_register_flags("summary", kdb_summary, "", + "Summarize the system", 4, + KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", + "Display per_cpu variables", 3, + KDB_ENABLE_MEM_READ); + kdb_register_flags("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, + KDB_ENABLE_ALWAYS_SAFE); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..75014d7f4568 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -172,10 +172,9 @@ typedef struct _kdbtab { kdb_func_t cmd_func; /* Function to execute command */ char *cmd_usage; /* Usage String for this command */ char *cmd_help; /* Help message for this command */ - short cmd_flags; /* Parsing flags */ short cmd_minlen; /* Minimum legal # command * chars required */ - kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? 
*/ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ } kdbtab_t; extern int kdb_bt(int, const char **); /* KDB display back trace */ @@ -197,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, /* Miscellaneous functions and data areas */ extern int kdb_grepping_flag; +#define KDB_GREPPING_FLAG_SEARCH 0x8000 extern char kdb_grep_string[]; +#define KDB_GREP_STRLEN 256 extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; @@ -210,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -extern char *kdb_getstr(char *, size_t, char *); +extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 54996b71e66d..ef90b04d783f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk) } /* - * Start accounting for a delay statistic using - * its starting timestamp (@start) + * Finish delay accounting for a statistic using its timestamps (@start), + * accumalator (@total) and @count */ - -static inline void delayacct_start(struct timespec *start) +static void delayacct_end(u64 *start, u64 *total, u32 *count) { - do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) -{ - struct timespec ts; - s64 ns; + s64 ns = ktime_get_ns() - *start; unsigned long flags; - do_posix_clock_monotonic_gettime(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(¤t->delays->lock, 
flags); - *total += ns; - (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + if (ns > 0) { + spin_lock_irqsave(¤t->delays->lock, flags); + *total += ns; + (*count)++; + spin_unlock_irqrestore(¤t->delays->lock, flags); + } } void __delayacct_blkio_start(void) { - delayacct_start(¤t->delays->blkio_start); + current->delays->blkio_start = ktime_get_ns(); } void __delayacct_blkio_end(void) @@ -89,35 +72,29 @@ void __delayacct_blkio_end(void) if (current->delays->flags & DELAYACCT_PF_SWAPIN) /* Swapin block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->swapin_delay, ¤t->delays->swapin_count); else /* Other block I/O */ delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; + unsigned long long t2, t3; + unsigned long flags, t1; + s64 tmp; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); - cputime_to_timespec(utime + stime, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_run_real_total; + tmp += cputime_to_nsecs(utime + stime); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - tmp = (s64)d->cpu_scaled_run_real_total; task_cputime_scaled(tsk, &utimescaled, &stimescaled); - cputime_to_timespec(utimescaled + stimescaled, &ts); - tmp += timespec_to_ns(&ts); + tmp = (s64)d->cpu_scaled_run_real_total; + tmp += cputime_to_nsecs(utimescaled + stimescaled); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 
0 : tmp; @@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) void __delayacct_freepages_start(void) { - delayacct_start(¤t->delays->freepages_start); + current->delays->freepages_start = ktime_get_ns(); } void __delayacct_freepages_end(void) { delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_end, ¤t->delays->freepages_delay, ¤t->delays->freepages_count); } diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 103f5d147b2f..2925188f50ea 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = -pg +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif obj-y := core.o ring_buffer.o callchain.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..d659487254d5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -52,7 +52,7 @@ static void release_callchain_buffers(void) struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); + RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) int cpu; struct callchain_cpus_entries *entries; - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) static void put_callchain_entry(int rctx) { - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b17ac1b0c2a..81aa3a4ece9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -34,18 +34,23 @@ #include <linux/syscalls.h> 
#include <linux/anon_inodes.h> #include <linux/kernel_stat.h> +#include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> -#include <linux/cgroup.h> #include <linux/module.h> #include <linux/mman.h> +#include <linux/compat.h> +#include <linux/bpf.h> +#include <linux/filter.h> #include "internal.h" #include <asm/irq_regs.h> +static struct workqueue_struct *perf_wq; + struct remote_function_call { struct task_struct *p; int (*func)(void *info); @@ -119,6 +124,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define EVENT_OWNER_KERNEL ((void *) -1) + +static bool is_kernel_event(struct perf_event *event) +{ + return event->owner == EVENT_OWNER_KERNEL; +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -143,7 +155,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -239,7 +251,7 @@ static void perf_duration_warn(struct irq_work *w) u64 avg_local_sample_len; u64 local_samples_len; - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; printk_ratelimited(KERN_WARNING @@ -261,10 +273,10 @@ void perf_sample_event_took(u64 sample_len_ns) return; /* decay the counter by 1 average sample */ - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; local_samples_len += sample_len_ns; - __get_cpu_var(running_sample_length) = local_samples_len; 
+ __this_cpu_write(running_sample_length, local_samples_len); /* * note: this will be biased artifically low until we have @@ -317,6 +329,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -341,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info __percpu *info; -}; - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_css(task, perf_event_cgrp_id), - struct perf_cgroup, css); -} - static inline bool perf_cgroup_match(struct perf_event *event) { @@ -391,14 +382,9 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - static inline void perf_detach_cgroup(struct perf_event *event) { - perf_put_cgroup(event); + css_put(&event->cgrp->css); event->cgrp = NULL; } @@ -609,7 +595,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_dentry, + css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -867,22 +853,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, 
rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. */ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -890,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx->task_ctx_data); + kfree(ctx); +} + static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -897,17 +902,105 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); + call_rcu(&ctx->rcu_head, free_ctx); } } -static void 
unclone_ctx(struct perf_event_context *ctx) +/* + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and + * perf_pmu_migrate_context() we need some magic. + * + * Those places that change perf_event::ctx will hold both + * perf_event_ctx::mutex of the 'old' and 'new' ctx value. + * + * Lock ordering is by mutex address. There is one other site where + * perf_event_context::mutex nests and that is put_event(). But remember that + * that is a parent<->child context relation, and migration does not affect + * children, therefore these two orderings should not interact. + * + * The change in perf_event::ctx does not affect children (as claimed above) + * because the sys_perf_event_open() case will install a new event and break + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only + * concerned with cpuctx and that doesn't have children. + * + * The places that change perf_event::ctx will issue: + * + * perf_remove_from_context(); + * synchronize_rcu(); + * perf_install_in_context(); + * + * to affect the change. The remove_from_context() + synchronize_rcu() should + * quiesce the event, after which we can install it in the new location. This + * means that only external vectors (perf_fops, prctl) can perturb the event + * while in transit. Therefore all such accessors should also acquire + * perf_event_context::mutex to serialize against this. + * + * However; because event->ctx can change while we're waiting to acquire + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() + * function. 
+ * + * Lock order: + * task_struct::perf_event_mutex + * perf_event_context::mutex + * perf_event_context::lock + * perf_event::child_mutex; + * perf_event::mmap_mutex + * mmap_sem + */ +static struct perf_event_context * +perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; + struct perf_event_context *ctx; + +again: + rcu_read_lock(); + ctx = ACCESS_ONCE(event->ctx); + if (!atomic_inc_not_zero(&ctx->refcount)) { + rcu_read_unlock(); + goto again; } + rcu_read_unlock(); + + mutex_lock_nested(&ctx->mutex, nesting); + if (event->ctx != ctx) { + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + + return ctx; +} + +static inline struct perf_event_context * +perf_event_ctx_lock(struct perf_event *event) +{ + return perf_event_ctx_lock_nested(event, 0); +} + +static void perf_event_ctx_unlock(struct perf_event *event, + struct perf_event_context *ctx) +{ + mutex_unlock(&ctx->mutex); + put_ctx(ctx); +} + +/* + * This must be done under the ctx->lock, such as to serialize against + * context_equiv(), therefore we cannot call put_ctx() since that might end up + * calling scheduler related locks and ctx->lock nests inside those. 
+ */ +static __must_check struct perf_event_context * +unclone_ctx(struct perf_event_context *ctx) +{ + struct perf_event_context *parent_ctx = ctx->parent_ctx; + + lockdep_assert_held(&ctx->lock); + + if (parent_ctx) + ctx->parent_ctx = NULL; ctx->generation++; + + return parent_ctx; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,12 +1229,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; - if (has_branch_stack(event)) - ctx->nr_branch_stack++; - list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1260,6 +1348,8 @@ static void perf_group_attach(struct perf_event *event) if (group_leader == event) return; + WARN_ON_ONCE(group_leader->ctx != event->ctx); + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && !is_software_event(event)) group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; @@ -1281,6 +1371,10 @@ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * We can have double detach due to exit/hot-unplug + close. */ @@ -1301,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } - if (has_branch_stack(event)) - ctx->nr_branch_stack--; - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -1365,6 +1456,8 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; + + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: @@ -1374,6 +1467,45 @@ out: perf_event__header_size(tmp); } +/* + * User event without the task. 
+ */ +static bool is_orphaned_event(struct perf_event *event) +{ + return event && !is_kernel_event(event) && !event->owner; +} + +/* + * Event has a parent but parent's task finished and it's + * alive only because of children holding refference. + */ +static bool is_orphaned_child(struct perf_event *event) +{ + return is_orphaned_event(event->parent); +} + +static void orphans_remove_work(struct work_struct *work); + +static void schedule_orphans_remove(struct perf_event_context *ctx) +{ + if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) + return; + + if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { + get_ctx(ctx); + ctx->orphans_remove_sched = true; + } +} + +static int __init perf_workqueue_init(void) +{ + perf_wq = create_singlethread_workqueue("perf"); + WARN(!perf_wq, "failed to create perf workqueue\n"); + return perf_wq ? 0 : -1; +} + +core_initcall(perf_workqueue_init); + static inline int event_filter_match(struct perf_event *event) { @@ -1388,6 +1520,10 @@ event_sched_out(struct perf_event *event, { u64 tstamp = perf_event_time(event); u64 delta; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * An event which could not be activated because of * filter mismatch still needs to have its timings @@ -1417,12 +1553,16 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + perf_pmu_enable(event->pmu); } @@ -1505,8 +1645,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group if (!task) { /* - * Per cpu events are removed via an smp call and - * the removal is always successful. + * Per cpu events are removed via an smp call. 
The removal can + * fail if the CPU is currently offline, but in that case we + * already called __perf_remove_from_context from + * perf_event_exit_cpu. */ cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; @@ -1523,6 +1665,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -1590,7 +1737,7 @@ int __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -void perf_event_disable(struct perf_event *event) +static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1631,6 +1778,19 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } + +/* + * Strictly speaking kernel users cannot create groups and therefore this + * interface does not need the perf_event_ctx_lock() magic. 
+ */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_disable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, @@ -1671,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1705,6 +1866,12 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -1712,19 +1879,19 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + out: perf_pmu_enable(event->pmu); @@ -1966,6 +2133,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -2086,7 +2258,7 @@ unlock: * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. 
*/ -void perf_event_enable(struct perf_event *event) +static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -2142,9 +2314,21 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } + +/* + * See perf_event_disable(); + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_enable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_enable); -int perf_event_refresh(struct perf_event *event, int refresh) +static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -2153,10 +2337,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) return -EINVAL; atomic_add(refresh, &event->event_limit); - perf_event_enable(event); + _perf_event_enable(event); return 0; } + +/* + * See perf_event_disable() + */ +int perf_event_refresh(struct perf_event *event, int refresh) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_refresh(event, refresh); + perf_event_ctx_unlock(event, ctx); + + return ret; +} EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, @@ -2199,6 +2398,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { + lockdep_assert_held(&ctx1->lock); + lockdep_assert_held(&ctx2->lock); + /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; @@ -2320,7 +2522,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. 
*/ - if (!parent || !next_parent) + if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { @@ -2344,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; + + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2362,6 +2567,56 @@ unlock: } } +void perf_sched_cb_dec(struct pmu *pmu) +{ + this_cpu_dec(perf_sched_cb_usages); +} + +void perf_sched_cb_inc(struct pmu *pmu) +{ + this_cpu_inc(perf_sched_cb_usages); +} + +/* + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when the context switch callback is enabled. + */ +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (prev == next) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->sched_task) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2381,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2389,7 +2647,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. 
* cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_out(task, next); } @@ -2537,70 +2795,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); -} - -/* - * When sampling the branck stack in system-wide, it may be necessary - * to flush the stack on context switch. This happens when the branch - * stack does not tag its entries with the pid of the current task. - * Otherwise it becomes impossible to associate a branch entry with a - * task. This ambiguity is more likely to appear when the branch stack - * supports priv level filtering and the user sets it to monitor only - * at the user level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch stack with - * branch from multiple tasks. Flushing may mean dropping the existing - * entries or stashing them somewhere in the PMU specific code layer. - * - * This function provides the context switch callback to the lower code - * layer. It is invoked ONLY when there is at least one system-wide context - * with at least one active event using taken branch sampling. 
- */ -static void perf_branch_stack_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* no need to flush branch stack if not changing task */ - if (prev == task) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * check if the context has at least one - * event using PERF_SAMPLE_BRANCH_STACK - */ - if (cpuctx->ctx.nr_branch_stack > 0 - && pmu->flush_branch_stack) { - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - - perf_pmu_disable(pmu); - - pmu->flush_branch_stack(); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); } /* @@ -2632,12 +2826,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, * to check if we have to switch in PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); - /* check for system-wide branch_stack events */ - if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) - perf_branch_stack_sched_in(prev, task); + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2830,25 +3023,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. 
- */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -2872,8 +3058,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -2891,9 +3075,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = &__get_cpu_var(rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -2901,14 +3084,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -2932,6 +3109,7 @@ static int event_enable_on_exec(struct perf_event *event, */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { + struct perf_event_context *clone_ctx = NULL; struct perf_event *event; unsigned long flags; int enabled = 0; @@ -2963,7 +3141,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * Unclone this context if we enabled any event. 
*/ if (enabled) - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); raw_spin_unlock(&ctx->lock); @@ -2973,6 +3151,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); + + if (clone_ctx) + put_ctx(clone_ctx); } void perf_event_exec(void) @@ -3023,7 +3204,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) @@ -3063,10 +3247,12 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); + INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3122,12 +3308,15 @@ errout: * Returns a matching context with refcount and pincount. 
*/ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, + struct perf_event *event) { - struct perf_event_context *ctx; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; + void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; + int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ @@ -3155,18 +3344,39 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (ctxn < 0) goto errout; + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + if (!task_ctx_data) { + err = -ENOMEM; + goto errout; + } + } + retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; + + if (task_ctx_data && !ctx->task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } raw_spin_unlock_irqrestore(&ctx->lock, flags); + + if (clone_ctx) + put_ctx(clone_ctx); } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; + if (task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } + err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3193,13 +3403,16 @@ retry: } } + kfree(task_ctx_data); return ctx; errout: + kfree(task_ctx_data); return ERR_PTR(err); } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3209,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); @@ -3221,10 +3434,6 @@ static void 
unaccount_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } @@ -3252,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) unaccount_event_cpu(event, event->cpu); } +/* + * The following implement mutual exclusion of events on "exclusive" pmus + * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled + * at a time, so we disallow creating events that might conflict, namely: + * + * 1) cpu-wide events in the presence of per-task events, + * 2) per-task events in the presence of cpu-wide events, + * 3) two matching events on the same context. + * + * The former two cases are handled in the allocation path (perf_event_alloc(), + * __free_event()), the latter -- before the first perf_install_in_context(). + */ +static int exclusive_event_init(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return 0; + + /* + * Prevent co-existence of per-task and cpu-wide events on the + * same exclusive pmu. + * + * Negative pmu::exclusive_cnt means there are cpu-wide + * events on this "exclusive" pmu, positive means there are + * per-task events. + * + * Since this is called in perf_event_alloc() path, event::ctx + * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK + * to mean "per-task event", because unlike other attach states it + * never gets cleared. 
+ */ + if (event->attach_state & PERF_ATTACH_TASK) { + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) + return -EBUSY; + } else { + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) + return -EBUSY; + } + + return 0; +} + +static void exclusive_event_destroy(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return; + + /* see comment in exclusive_event_init() */ + if (event->attach_state & PERF_ATTACH_TASK) + atomic_dec(&pmu->exclusive_cnt); + else + atomic_inc(&pmu->exclusive_cnt); +} + +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) +{ + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + (e1->cpu == e2->cpu || + e1->cpu == -1 || + e2->cpu == -1)) + return true; + return false; +} + +/* Called under the same ctx::mutex as perf_install_in_context() */ +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *iter_event; + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return true; + + list_for_each_entry(iter_event, &ctx->event_list, event_entry) { + if (exclusive_event_match(iter_event, event)) + return false; + } + + return true; +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3265,8 +3559,10 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) + if (event->pmu) { + exclusive_event_destroy(event); module_put(event->pmu->module); + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -3312,16 +3608,12 @@ static void free_event(struct perf_event *event) } /* - * Called when the last reference to the file is gone. + * Remove user event from the owner task. 
*/ -static void put_event(struct perf_event *event) +static void perf_remove_from_owner(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; struct task_struct *owner; - if (!atomic_long_dec_and_test(&event->refcount)) - return; - rcu_read_lock(); owner = ACCESS_ONCE(event->owner); /* @@ -3342,7 +3634,16 @@ static void put_event(struct perf_event *event) rcu_read_unlock(); if (owner) { - mutex_lock(&owner->perf_event_mutex); + /* + * If we're here through perf_event_exit_task() we're already + * holding ctx->mutex which would be an inversion wrt. the + * normal lock order. + * + * However we can safely take this lock because its the child + * ctx->mutex. + */ + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); + /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex @@ -3354,8 +3655,21 @@ static void put_event(struct perf_event *event) mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } +} + +/* + * Called when the last reference to the file is gone. + */ +static void put_event(struct perf_event *event) +{ + struct perf_event_context *ctx; + + if (!atomic_long_dec_and_test(&event->refcount)) + return; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: * @@ -3368,9 +3682,10 @@ static void put_event(struct perf_event *event) * the last filedesc died, so there is no possibility * to trigger the AB-BA case. */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); + perf_event_ctx_unlock(event, ctx); _free_event(event); } @@ -3388,6 +3703,42 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +/* + * Remove all orphanes events from the context. 
+ */ +static void orphans_remove_work(struct work_struct *work) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + + ctx = container_of(work, struct perf_event_context, + orphans_remove.work); + + mutex_lock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + struct perf_event *parent_event = event->parent; + + if (!is_orphaned_child(event)) + continue; + + perf_remove_from_context(event, true); + + mutex_lock(&parent_event->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent_event->child_mutex); + + free_event(event); + put_event(parent_event); + } + + raw_spin_lock_irq(&ctx->lock); + ctx->orphans_remove_sched = false; + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -3418,12 +3769,13 @@ static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; struct perf_event_context *ctx = leader->ctx; - u64 values[5]; + int n = 0, size = 0, ret; u64 count, enabled, running; + u64 values[5]; + + lockdep_assert_held(&ctx->mutex); - mutex_lock(&ctx->mutex); count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; @@ -3438,7 +3790,7 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - goto unlock; + return -EFAULT; ret = size; @@ -3452,14 +3804,11 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; + return -EFAULT; } ret += size; } -unlock: - mutex_unlock(&ctx->mutex); return ret; } @@ -3485,6 +3834,19 @@ static int perf_event_read_one(struct perf_event *event, return n * sizeof(u64); } +static bool 
is_event_hup(struct perf_event *event) +{ + bool no_children; + + if (event->state != PERF_EVENT_STATE_EXIT) + return false; + + mutex_lock(&event->child_mutex); + no_children = list_empty(&event->child_list); + mutex_unlock(&event->child_mutex); + return no_children; +} + /* * Read the performance event - simple non blocking version for now */ @@ -3518,15 +3880,26 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = perf_read_hw(event, buf, count); + perf_event_ctx_unlock(event, ctx); - return perf_read_hw(event, buf, count); + return ret; } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLL_HUP; + unsigned int events = POLLHUP; + + poll_wait(file, &event->waitq, wait); + + if (is_event_hup(event)) + return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise @@ -3537,13 +3910,10 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); - - poll_wait(file, &event->waitq, wait); - return events; } -static void perf_event_reset(struct perf_event *event) +static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); local64_set(&event->count, 0); @@ -3562,6 +3932,7 @@ static void perf_event_for_each_child(struct perf_event *event, struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) @@ -3575,14 +3946,13 @@ static void perf_event_for_each(struct perf_event *event, struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + lockdep_assert_held(&ctx->mutex); 
+ event = event->group_leader; perf_event_for_each_child(event, func); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); - mutex_unlock(&ctx->mutex); } static int perf_event_period(struct perf_event *event, u64 __user *arg) @@ -3651,26 +4021,26 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { - struct perf_event *event = file->private_data; void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; + func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; + func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: - func = perf_event_reset; + func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); + return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); @@ -3705,6 +4075,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -3717,13 +4090,50 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + long ret; + + ctx = perf_event_ctx_lock(event); + ret = 
_perf_ioctl(event, cmd, arg); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + case _IOC_NR(PERF_EVENT_IOC_ID): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); + list_for_each_entry(event, &current->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_enable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(&current->perf_event_mutex); return 0; @@ -3731,11 +4141,15 @@ int perf_event_task_enable(void) int perf_event_task_disable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); + list_for_each_entry(event, &current->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_disable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(&current->perf_event_mutex); return 0; @@ -3780,12 +4194,15 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } -void 
__weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage( + struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } @@ -3835,7 +4252,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - arch_perf_update_userpage(userpg, now); + arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; @@ -3946,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } -static struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -3961,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct ring_buffer *rb) { if (!atomic_dec_and_test(&rb->refcount)) return; @@ -3977,6 +4394,12 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + + if (vma->vm_pgoff) + atomic_inc(&event->rb->aux_mmap_count); + + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); } /* @@ -3996,6 +4419,23 @@ static void perf_mmap_close(struct vm_area_struct *vma) int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + if (event->pmu->event_unmapped) + event->pmu->event_unmapped(event); + + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. 
+ */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4069,7 +4509,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4080,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4098,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
+ */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4110,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4136,6 +4632,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4145,7 +4643,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4159,35 +4656,46 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; - } + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); + if (!rb) { + ret = -ENOMEM; + goto unlock; + } - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - ring_buffer_attach(event, rb); + ring_buffer_attach(event, rb); - perf_event_init_userpage(event); - perf_event_update_userpage(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); + if (!ret) + rb->aux_mmap_locked = extra; + } unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -4197,6 +4705,9 @@ unlock: vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); + return ret; } @@ -4222,7 +4733,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; @@ -4248,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. 
+ */ if (event->pending_disable) { event->pending_disable = 0; @@ -4258,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* @@ -4296,22 +4817,29 @@ perf_output_sample_regs(struct perf_output_handle *handle, } } -static void perf_sample_regs_user(struct perf_regs_user *regs_user, - struct pt_regs *regs) +static void perf_sample_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) { - if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - regs_user->abi = perf_reg_abi(current); + } else if (current->mm) { + perf_get_regs_user(regs_user, regs, regs_user_copy); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; } } +static void perf_sample_regs_intr(struct perf_regs *regs_intr, + struct pt_regs *regs) +{ + regs_intr->regs = regs; + regs_intr->abi = perf_reg_abi(current); +} + + /* * Get remaining task size from user stack pointer. * @@ -4423,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -4693,6 +5221,23 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); + if (sample_type & PERF_SAMPLE_REGS_INTR) { + u64 abi = data->regs_intr.abi; + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). 
+ */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_intr; + + perf_output_sample_regs(handle, + data->regs_intr.regs, + mask); + } + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4758,12 +5303,14 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + perf_sample_regs_user(&data->regs_user, regs, + &data->regs_user_copy); + if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); - perf_sample_regs_user(&data->regs_user, regs); - if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); @@ -4779,15 +5326,11 @@ void perf_prepare_sample(struct perf_event_header *header, * in case new sample type is added, because we could eat * up the rest of the sample size. */ - struct perf_regs_user *uregs = &data->regs_user; u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); - if (!uregs->abi) - perf_sample_regs_user(uregs, regs); - stack_size = perf_sample_ustack_size(stack_size, header->size, - uregs->regs); + data->regs_user.regs); /* * If there is something to dump, add space for the dump @@ -4800,6 +5343,21 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_intr(&data->regs_intr, regs); + + if (data->regs_intr.regs) { + u64 mask = event->attr.sample_regs_intr; + + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -4971,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + 
task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5004,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5266,6 +5826,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { + if (vma->vm_ops && vma->vm_ops->name) { + name = (char *) vma->vm_ops->name(vma); + if (name) + goto cpy_name; + } + name = (char *)arch_vma_name(vma); if (name) goto cpy_name; @@ -5353,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ @@ -5374,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -5394,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + 
struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. */ @@ -5665,7 +6303,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; @@ -5682,9 +6320,11 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } @@ -5692,26 +6332,35 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); inline void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = 
perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } @@ -5721,7 +6370,7 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_add(struct perf_event *event, int flags) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; @@ -5743,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) } hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); return 0; } @@ -5777,7 +6427,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) if (!hlist) return; - rcu_assign_pointer(swhash->swevent_hlist, NULL); + RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } @@ -5903,22 +6553,17 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6036,8 +6681,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, 
}; static inline void perf_tp_register(void) @@ -6068,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6083,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -6221,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6257,14 +6952,14 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = 
perf_swevent_event_idx, }; /* @@ -6297,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6337,14 +7033,14 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -6374,7 +7070,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* @@ -6582,12 +7278,10 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -6618,6 +7312,7 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -6660,6 +7355,31 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) +{ + struct perf_event_context *ctx = NULL; + int ret; + + if (!try_module_get(pmu->module)) + return -ENODEV; + + if (event->group_leader != event) { + ctx = perf_event_ctx_lock(event->group_leader); + BUG_ON(!ctx); + } + + event->pmu = pmu; + ret = pmu->event_init(event); + + if (ctx) + perf_event_ctx_unlock(event->group_leader, ctx); + + if (ret) + module_put(pmu->module); + + 
return ret; +} + struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; @@ -6672,24 +7392,14 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; @@ -6710,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } @@ -6752,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, - void *context) + void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; @@ -6807,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. 
*/ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif + event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -6845,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto err_ns; + if (!has_branch_stack(event)) + event->attr.branch_sample_type = 0; + + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_ns; + } + pmu = perf_init_event(event); if (!pmu) goto err_ns; @@ -6853,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + err = exclusive_event_init(event); + if (err) + goto err_pmu; + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_pmu; + goto err_per_task; } } return event; +err_per_task: + exclusive_event_destroy(event); + err_pmu: if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); kfree(event); @@ -6994,6 +7718,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + ret = perf_reg_validate(attr->sample_regs_intr); out: return ret; @@ -7028,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. 
+ */ + if (output_event->clock != event->clock) + goto out; + + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7051,6 +7790,52 @@ out: return ret; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7066,7 +7851,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *uninitialized_var(gctx); struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -7075,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, int move_group = 0; int err; int f_flags = O_RDWR; + int cgroup_fd = -1; /* for future expandability... 
*/ if (flags & ~PERF_FLAG_ALL) @@ -7140,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (flags & PERF_FLAG_PID_CGROUP) + cgroup_fd = pid; + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); + NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_cpus; } - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) { - __free_event(event); - goto err_cpus; - } - } - if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -ENOTSUPP; @@ -7170,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7196,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event->cpu); + ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } + if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { + err = -EBUSY; + goto err_context; + } + if (task) { put_task_struct(task); task = NULL; @@ -7219,12 +8011,29 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. 
+ */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) @@ -7252,43 +8061,75 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader, false); + gctx = group_leader->ctx; /* - * Removing from the context ends up with disabled - * event. What we want here is event in the initial - * startup state, ready to be add into new context. + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. */ - perf_event__state_init(group_leader); + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + perf_remove_from_context(group_leader, false); + list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling, false); - perf_event__state_init(sibling); put_ctx(gctx); } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); + } else { + mutex_lock(&ctx->mutex); } WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); if (move_group) { + /* + * Wait for everybody to stop referencing the events through + * the old lists, before installing it on new lists. + */ synchronize_rcu(); - perf_install_in_context(ctx, group_leader, event->cpu); - get_ctx(ctx); + + /* + * Install the group siblings before the group leader. + * + * Because a group leader will try and install the entire group + * (through the sibling list, which is still in-tact), we can + * end up with siblings installed in the wrong context. + * + * By installing siblings first we NO-OP because they're not + * reachable through the group lists. 
+ */ list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, event->cpu); + perf_event__state_init(sibling); + perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); + perf_install_in_context(ctx, group_leader, group_leader->cpu); + get_ctx(ctx); + } + + if (!exclusive_event_installable(event, ctx)) { + err = -EBUSY; + mutex_unlock(&ctx->mutex); + fput(event_file); + goto err_context; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); + + if (move_group) { + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -7354,15 +8195,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, */ event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); + overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } + /* Mark owner so we could distinguish it from user events. 
*/ + event->owner = EVENT_OWNER_KERNEL; + account_event(event); - ctx = find_get_context(event->pmu, task, cpu); + ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; @@ -7370,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (!exclusive_event_installable(event, ctx)) { + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); + err = -EBUSY; + goto err_free; + } + perf_install_in_context(ctx, event, cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7393,7 +8245,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; - mutex_lock(&src_ctx->mutex); + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event, false); @@ -7401,11 +8257,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) put_ctx(src_ctx); list_add(&event->migrate_entry, &events); } - mutex_unlock(&src_ctx->mutex); + /* + * Wait for the events to quiesce before re-instating them. + */ synchronize_rcu(); - mutex_lock(&dst_ctx->mutex); + /* + * Re-instate events in 2 passes. + * + * Skip over group leaders and only install siblings on this first + * pass, siblings will not get enabled without a leader, however a + * leader will enable its siblings, even if those are still on the old + * context. 
+ */ + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + if (event->group_leader == event) + continue; + + list_del(&event->migrate_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + + /* + * Once all the siblings are setup properly, install the group leaders + * to make it go. + */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) @@ -7415,6 +8296,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) get_ctx(dst_ctx); } mutex_unlock(&dst_ctx->mutex); + mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); @@ -7447,6 +8329,12 @@ static void sync_child_event(struct perf_event *child_event, mutex_unlock(&parent_event->child_mutex); /* + * Make sure user/parent get notified, that we just + * lost one event. + */ + perf_event_wakeup(parent_event); + + /* * Release the parent event, if this was the last * reference to it. */ @@ -7480,13 +8368,16 @@ __perf_event_exit_task(struct perf_event *child_event, if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); + } else { + child_event->state = PERF_EVENT_STATE_EXIT; + perf_event_wakeup(child_event); } } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7513,28 +8404,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) child->perf_event_ctxp[ctxn] = NULL; /* - * In order to avoid freeing: child_ctx->parent_ctx->task - * under perf_event_context::lock, grab another reference. 
- */ - parent_ctx = child_ctx->parent_ctx; - if (parent_ctx) - get_ctx(parent_ctx); - - /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the events from it. */ - unclone_ctx(child_ctx); + clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Now that we no longer hold perf_event_context::lock, drop - * our extra child_ctx->parent_ctx reference. - */ - if (parent_ctx) - put_ctx(parent_ctx); + if (clone_ctx) + put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we @@ -7604,14 +8483,19 @@ static void perf_free_event(struct perf_event *event, put_event(parent); + raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* - * free an unexposed, unused context as created by inheritance by + * Free an unexposed, unused context as created by inheritance by * perf_event_init_task below, used by fork() in case of fail. + * + * Not all locks are strictly required, but take them anyway to be nice and + * help out with the lockdep assertions. 
*/ void perf_event_free_task(struct task_struct *task) { @@ -7663,6 +8547,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { + enum perf_event_active_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; @@ -7679,11 +8564,12 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL, NULL); + NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; - if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + if (is_orphaned_event(parent_event) || + !atomic_long_inc_not_zero(&parent_event->refcount)) { free_event(child_event); return NULL; } @@ -7695,7 +8581,7 @@ inherit_event(struct perf_event *parent_event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; @@ -7804,7 +8690,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -7911,8 +8797,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; @@ -7926,7 +8814,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } 
} @@ -7947,22 +8835,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = false }; + struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); @@ -8073,6 +8950,18 @@ void __init perf_event_init(void) != 1024); } +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s\n", pmu_attr->event_str); + + return 0; +} + static int __init perf_event_sysfs_init(void) { struct pmu *pmu; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if 
(!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -35,6 +36,20 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + local_t aux_head; + local_t aux_nest; + local_t aux_wakeup; + unsigned long aux_pgoff; + int aux_nr_pages; + int aux_overwrite; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event 
*event, + pgoff_t pgoff, int nr_pages, long watermark, int flags); +extern void rb_free_aux(struct ring_buffer *rb); +extern struct ring_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} + +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -13,12 +13,13 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> +#include <linux/poll.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLL_IN); + atomic_set(&handle->rb->poll, POLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); @@ -242,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +/* + * This is called before hardware starts writing to the AUX area to + * obtain an output handle and make sure there's room in the buffer. + * When the capture completes, call perf_aux_output_end() to commit + * the recorded data to the buffer. 
+ * + * The ordering is similar to that of perf_output_{begin,end}, with + * the exception of (B), which should be taken care of by the pmu + * driver, since ordering rules will differ depending on hardware. + */ +void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *output_event = event; + unsigned long aux_head, aux_tail; + struct ring_buffer *rb; + + if (output_event->parent) + output_event = output_event->parent; + + /* + * Since this will typically be open across pmu::add/pmu::del, we + * grab ring_buffer's refcount instead of holding rcu read lock + * to make sure it doesn't disappear under us. + */ + rb = ring_buffer_get(output_event); + if (!rb) + return NULL; + + if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + goto err; + + /* + * Nesting is not supported for AUX area, make sure nested + * writers are caught early + */ + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + goto err_put; + + aux_head = local_read(&rb->aux_head); + + handle->rb = rb; + handle->event = event; + handle->head = aux_head; + handle->size = 0; + + /* + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. 
+ */ + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } + } + + return handle->rb->aux_priv; + +err_put: + rb_free_aux(rb); + +err: + ring_buffer_put(rb); + handle->event = NULL; + + return NULL; +} + +/* + * Commit the data written by hardware into the ring buffer by adjusting + * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the + * pmu driver's responsibility to observe ordering rules of the hardware, + * so that all the data is externally visible before this is called. 
+ */ +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + u64 flags = 0; + + if (truncated) + flags |= PERF_AUX_FLAG_TRUNCATED; + + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } + + if (size || flags) { + /* + * Only send RECORD_AUX if we have something useful to communicate + */ + + perf_event_aux_event(handle->event, aux_head, size, flags); + } + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } + handle->event = NULL; + + local_set(&rb->aux_nest, 0); + rb_free_aux(rb); + ring_buffer_put(rb); +} + +/* + * Skip over a given number of bytes in the AUX buffer, due to, for example, + * hardware's alignment constraints. 
+ */ +int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + + if (size > handle->size) + return -ENOSPC; + + local_add(size, &rb->aux_head); + + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + + handle->head = aux_head; + handle->size -= size; + + return 0; +} + +void *perf_get_aux(struct perf_output_handle *handle) +{ + /* this is only valid between perf_aux_output_begin and *_end */ + if (!handle->event) + return NULL; + + return handle->rb->aux_priv; +} + +#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) + +static struct page *rb_alloc_aux_page(int node, int order) +{ + struct page *page; + + if (order > MAX_ORDER) + order = MAX_ORDER; + + do { + page = alloc_pages_node(node, PERF_AUX_GFP, order); + } while (!page && order--); + + if (page && order) { + /* + * Communicate the allocation size to the driver + */ + split_page(page, order); + SetPagePrivate(page); + set_page_private(page, order); + } + + return page; +} + +static void rb_free_aux_page(struct ring_buffer *rb, int idx) +{ + struct page *page = virt_to_page(rb->aux_pages[idx]); + + ClearPagePrivate(page); + page->mapping = NULL; + __free_page(page); +} + +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, long watermark, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? 
-1 : cpu_to_node(event->cpu); + int ret = -ENOMEM, max_order = 0; + + if (!has_aux(event)) + return -ENOTSUPP; + + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); + + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; + + max_order--; + } + } + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { + struct page *page; + int last, order; + + order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); + page = rb_alloc_aux_page(node, order); + if (!page) + goto out; + + for (last = rb->aux_nr_pages + (1 << page_private(page)); + last > rb->aux_nr_pages; rb->aux_nr_pages++) + rb->aux_pages[rb->aux_nr_pages] = page_address(page++); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. 
+ */ + atomic_set(&rb->aux_refcount, 1); + + rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -339,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. 
*/ if (pgoff > data_page_nr(rb)) @@ -415,3 +719,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* above AUX space */ + if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..cb346f26a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -186,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - 
__SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); @@ -723,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), @@ -754,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; @@ -1639,7 +1640,6 @@ bool uprobe_deny_signal(void) if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 83d4382f5699..6873bb3e6b7e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -20,145 +20,10 @@ #include <linux/types.h> #include <linux/fs_struct.h> - -static void default_handler(int, struct pt_regs *); - -static struct exec_domain *exec_domains = &default_exec_domain; -static DEFINE_RWLOCK(exec_domains_lock); - - -static unsigned long ident_map[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -struct exec_domain default_exec_domain = { - .name = "Linux", /* name */ - .handler = 
default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ - .pers_high = 0, /* PER_LINUX personality. */ - .signal_map = ident_map, /* Identity map signals. */ - .signal_invmap = ident_map, /* - both ways. */ -}; - - -static void -default_handler(int segment, struct pt_regs *regp) -{ - set_personality(0); - - if (current_thread_info()->exec_domain->handler != default_handler) - current_thread_info()->exec_domain->handler(segment, regp); - else - send_sig(SIGSEGV, current, 1); -} - -static struct exec_domain * -lookup_exec_domain(unsigned int personality) -{ - unsigned int pers = personality(personality); - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } - -#ifdef CONFIG_MODULES - read_unlock(&exec_domains_lock); - request_module("personality-%d", pers); - read_lock(&exec_domains_lock); - - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } -#endif - - ep = &default_exec_domain; -out: - read_unlock(&exec_domains_lock); - return ep; -} - -int -register_exec_domain(struct exec_domain *ep) -{ - struct exec_domain *tmp; - int err = -EBUSY; - - if (ep == NULL) - return -EINVAL; - - if (ep->next != NULL) - return -EBUSY; - - write_lock(&exec_domains_lock); - for (tmp = exec_domains; tmp; tmp = tmp->next) { - if (tmp == ep) - goto out; - } - - ep->next = exec_domains; - exec_domains = ep; - err = 0; - -out: - write_unlock(&exec_domains_lock); - return err; -} -EXPORT_SYMBOL(register_exec_domain); - -int -unregister_exec_domain(struct exec_domain *ep) -{ - struct exec_domain **epp; - - epp = &exec_domains; - write_lock(&exec_domains_lock); - for (epp = &exec_domains; *epp; epp = &(*epp)->next) { - if (ep == *epp) - goto unregister; - } - write_unlock(&exec_domains_lock); - return 
-EINVAL; - -unregister: - *epp = ep->next; - ep->next = NULL; - write_unlock(&exec_domains_lock); - return 0; -} -EXPORT_SYMBOL(unregister_exec_domain); - -int __set_personality(unsigned int personality) -{ - struct exec_domain *oep = current_thread_info()->exec_domain; - - current_thread_info()->exec_domain = lookup_exec_domain(personality); - current->personality = personality; - module_put(oep->module); - - return 0; -} -EXPORT_SYMBOL(__set_personality); - #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) { - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) - seq_printf(m, "%d-%d\t%-16s\t[%s]\n", - ep->pers_low, ep->pers_high, ep->name, - module_name(ep->module)); - read_unlock(&exec_domains_lock); + seq_puts(m, "0-0\tLinux \t[kernel]\n"); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..22fcc05dec40 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. 
- */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. + */ + task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -151,7 +149,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +166,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +190,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader 
!= p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -213,27 +212,6 @@ repeat: } /* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -241,7 +219,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +273,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +384,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. 
*/ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +404,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -455,6 +435,46 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); +} + +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + reaper = find_alive_thread(father); + if (reaper) { + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; } /* @@ -464,58 +484,36 @@ static void exit_mm(struct task_struct * tsk) * child_subreaper for its children (like a service manager) * 3. 
give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; + struct task_struct *thread, *reaper; - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; + thread = find_alive_thread(father); + if (thread) return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { - struct task_struct *reaper; + if (father->signal->has_child_subreaper) { /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. 
*/ - for (reaper = father->real_parent; - reaper != &init_task; + for (reaper = father; + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { - if (!(thread->flags & PF_EXITING)) - return reaper; - } while_each_thread(reaper, thread); + thread = find_alive_thread(reaper); + if (thread) + return thread; } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -524,15 +522,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ @@ -543,48 +533,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, dead); - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - do { + /* Can drop and reacquire tasklist_lock */ + reaper = find_child_reaper(father); + if (list_empty(&father->children)) + return; + + reaper = find_new_reaper(father, reaper); + list_for_each_entry(p, &father->children, sibling) { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); + } + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, dead); } + list_splice_tail_init(&father->children, &reaper->children); } /* @@ -594,18 +589,12 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. 
Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -623,15 +612,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } #ifdef CONFIG_DEBUG_STACK_USAGE @@ -648,9 +640,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -663,6 +654,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -691,8 +683,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. 
The futex code uses * this flag just to verify whether the pi state @@ -716,9 +707,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -765,13 +756,12 @@ void do_exit(long code) cgroup_exit(tsk); - module_put(task_thread_info(tsk)->exec_domain->module); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -811,6 +801,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed @@ -836,7 +827,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -846,7 +836,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +858,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. 
*/ @@ -976,8 +966,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -991,6 +980,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1000,21 +991,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - - traced = ptrace_reparented(p); /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * We own this thread, nobody else can reap it. */ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + read_unlock(&tasklist_lock); + sched_annotate_sleep(); + + /* + * Check thread_group_leader() to exclude the traced sub-threads. + */ + if (state == EXIT_DEAD && thread_group_leader(p)) { + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1026,21 +1021,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. 
* * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(&current->sighand->siglock); + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1063,15 +1058,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->real_parent->sighand->siglock); + write_sequnlock(&psig->stats_lock); + spin_unlock_irq(&current->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we its state == DEAD/TRACE. - */ - read_unlock(&tasklist_lock); - retval = wo->wo_rusage ?
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) @@ -1202,6 +1192,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1264,6 +1255,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage @@ -1294,9 +1286,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1317,7 +1315,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1344,7 +1342,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? 
*/ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* @@ -1417,6 +1415,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1430,6 +1429,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } diff --git a/kernel/extable.c b/kernel/extable.c index d8a6446adbcb..c98f926277a8 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -18,6 +18,7 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> +#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> @@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; + if (is_ftrace_trampoline(addr)) + return 1; /* * There might be init symbols in saved stacktraces. 
* Give those symbols a chance to be printed in @@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return is_module_text_address(addr); + if (is_module_text_address(addr)) + return 1; + return is_ftrace_trampoline(addr); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 6a13c46cd87d..03c1eaaa6ef5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -74,6 +74,7 @@ #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> +#include <linux/sysctl.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -88,6 +89,16 @@ #include <trace/events/task.h> /* + * Minimum number of threads to boot the kernel + */ +#define MIN_THREADS 20 + +/* + * Maximum number of threads + */ +#define MAX_THREADS FUTEX_TID_MASK + +/* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct); void __init __weak arch_task_cache_init(void) { } -void __init fork_init(unsigned long mempages) +/* + * set_max_threads + */ +static void set_max_threads(unsigned int max_threads_suggested) +{ + u64 threads; + + /* + * The number of threads shall be limited such that the thread + * structures may only consume a small part of the available memory. 
+ */ + if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + threads = MAX_THREADS; + else + threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + (u64) THREAD_SIZE * 8UL); + + if (threads > max_threads_suggested) + threads = max_threads_suggested; + + max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); +} + +void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN @@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages) /* do the arch specific task caches init */ arch_task_cache_init(); - /* - * The default maximum number of threads is set to a safe - * value: the thread structures can take up at most half - * of memory. - */ - max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); - - /* - * we need to allow at least 20 threads to boot a system - */ - if (max_threads < 20) - max_threads = 20; + set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; @@ -294,11 +317,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -315,12 +345,20 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti; tsk->stack = ti; +#ifdef CONFIG_SECCOMP + /* + * We must handle setting up seccomp filters once we're under + * the sighand lock in case orig has changed between now and + * then. Until then, filter must be NULL to avoid messing up + * the usage counts on the error path calling free_task. 
+ */ + tsk->seccomp.filter = NULL; +#endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); @@ -365,12 +403,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -419,19 +459,15 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* @@ -495,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -#define dup_mmap(mm, oldmm) (0) +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + 
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -527,19 +569,38 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm_nr_pmds_init(mm); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +610,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm; +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -570,8 +637,15 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } + if (atomic_long_read(&mm->nr_ptes)) + pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", + atomic_long_read(&mm->nr_ptes)); + if (mm_nr_pmds(mm)) + pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", + mm_nr_pmds(mm)); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON(mm->pmd_huge_pte); + 
VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -587,7 +661,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -633,34 +706,53 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/** + * set_mm_exe_file - change a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). + * + * Main users are mmput() and sys_execve(). Callers prevent concurrent + * invocations: in mmput() nobody alive left, in execve task is single + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the + * mm->exe_file, but does so without using set_mm_exe_file() in order + * to do avoid the need for any locks. + */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { + struct file *old_exe_file; + + /* + * It is safe to dereference the exe_file without RCU as + * this function is only called if nobody else can access + * this mm -- see comment above for justification. + */ + old_exe_file = rcu_dereference_raw(mm->exe_file); + if (new_exe_file) get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; + rcu_assign_pointer(mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); } +/** + * get_mm_exe_file - acquire a reference to the mm's executable file + * + * Returns %NULL if mm has no associated executable file. + * User must release file via fput(). 
+ */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of exe_file */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); + rcu_read_lock(); + exe_file = rcu_dereference(mm->exe_file); + if (exe_file && !get_file_rcu(exe_file)) + exe_file = NULL; + rcu_read_unlock(); return exe_file; } - -static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} +EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_mm - acquire a reference to the task's mm @@ -819,19 +911,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - - dup_mm_exe_file(oldmm, mm); - err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -851,15 +934,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1002,11 +1076,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); + /* + * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * without an RCU grace period, see __lock_task_sighand(). + */ kmem_cache_free(sighand_cachep, sighand); } } - /* * Initialize POSIX timer handling for a thread group. 
*/ @@ -1053,6 +1130,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1081,6 +1159,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + /* + * Must be called with sighand->lock held, which is common to + * all threads in the group. Holding cred_guard_mutex is not + * needed because this new task is not yet running and cannot + * be racing exec. + */ + assert_spin_locked(&current->sighand->siglock); + + /* Ref-count the new filter user, and assign it. */ + get_seccomp_filter(current); + p->seccomp = current->seccomp; + + /* + * Explicitly enable no_new_privs here in case it got set + * between the task_struct being duplicated and holding the + * sighand lock. The seccomp state and nnp must be in sync. + */ + if (task_no_new_privs(current)) + task_set_no_new_privs(p); + + /* + * If the parent gained a seccomp mode after copying thread + * flags and before we held the sighand lock, we have + * to manually enable the seccomp thread flag here. + */ + if (p->seccomp.mode != SECCOMP_MODE_DISABLED) + set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} + SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1095,17 +1206,9 @@ static void rt_mutex_init_task(struct task_struct *p) p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; - p->pi_top_task = NULL; #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. 
*/ @@ -1196,7 +1299,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); - get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1226,9 +1328,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (nr_threads >= max_threads) goto bad_fork_cleanup_count; - if (!try_module_get(task_thread_info(p)->exec_domain->module)) - goto bad_fork_cleanup_count; - delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); p->flags |= PF_FORKNOEXEC; @@ -1262,9 +1361,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) @@ -1307,10 +1405,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1326,8 +1420,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1357,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { - retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns_for_children); - if (!pid) + if (IS_ERR(pid)) { + retval = PTR_ERR(pid); goto bad_fork_cleanup_io; + } } p->set_child_tid = (clone_flags 
& CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1437,6 +1533,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_lock(&current->sighand->siglock); /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to @@ -1525,8 +1627,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: @@ -1534,7 +1637,6 @@ bad_fork_cleanup_threadgroup_lock: if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); delayacct_tsk_free(p); - module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); @@ -1873,6 +1975,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). 
*/ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); @@ -1943,3 +2050,26 @@ int unshare_files(struct files_struct **displaced) task_unlock(task); return 0; } + +int sysctl_max_threads(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int ret; + int threads = max_threads; + int min = MIN_THREADS; + int max = MAX_THREADS; + + t = *table; + t.data = &threads; + t.extra1 = &min; + t.extra2 = &max; + + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + set_max_threads(threads); + + return 0; +} diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); diff --git a/kernel/futex.c b/kernel/futex.c index b632b5f3f094..2579e407ff67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. 
+ * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key) case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies MB (B) */ break; + default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ + smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { @@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -792,94 +806,91 @@ void exit_pi_state_list(struct task_struct *curr) * [10] There is no transient state which leaves owner and user space * TID out of sync. */ -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. 
+ */ +static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, + struct futex_pi_state **ps) { - struct futex_pi_state *pi_state = NULL; - struct futex_q *this, *next; - struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (match_futex(&this->key, key)) { - /* - * Sanity check the waiter before increasing - * the refcount and attaching to it. - */ - pi_state = this->pi_state; - /* - * Userspace might have messed up non-PI and - * PI futexes [3] - */ - if (unlikely(!pi_state)) - return -EINVAL; + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!atomic_read(&pi_state->refcount)); + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { /* - * Handle the owner died case: + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] */ - if (uval & FUTEX_OWNER_DIED) { - /* - * exit_pi_state_list sets owner to NULL and - * wakes the topmost waiter. The task which - * acquires the pi_state->rt_mutex will fixup - * owner. - */ - if (!pi_state->owner) { - /* - * No pi state owner, but the user - * space TID is not 0. Inconsistent - * state. [5] - */ - if (pid) - return -EINVAL; - /* - * Take a ref on the state and - * return. [4] - */ - goto out_state; - } - - /* - * If TID is 0, then either the dying owner - * has not yet executed exit_pi_state_list() - * or some waiter acquired the rtmutex in the - * pi state, but did not yet fixup the TID in - * user space. - * - * Take a ref on the state and return. [6] - */ - if (!pid) - goto out_state; - } else { - /* - * If the owner died bit is not set, - * then the pi_state must have an - * owner. 
[7] - */ - if (!pi_state->owner) - return -EINVAL; - } - + if (pid) + return -EINVAL; /* - * Bail out if user space manipulated the - * futex value. If pi state exists then the - * owner TID must be the same as the user - * space TID. [9/10] + * Take a ref on the state and return success. [4] */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - - out_state: - atomic_inc(&pi_state->refcount); - *ps = pi_state; - return 0; + goto out_state; } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_state; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + return -EINVAL; } /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; +out_state: + atomic_inc(&pi_state->refcount); + *ps = pi_state; + return 0; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. 
+ */ +static int attach_to_pi_owner(u32 uval, union futex_key *key, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; + struct task_struct *p; + + /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] */ @@ -889,7 +900,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; - if (!p->mm) { + if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); return -EPERM; } @@ -920,7 +931,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, pi_state = alloc_pi_state(); /* - * Initialize the pi_mutex in locked state and make 'p' + * Initialize the pi_mutex in locked state and make @p * the owner of it: */ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); @@ -940,6 +951,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) +{ + struct futex_q *match = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. + */ + return attach_to_pi_owner(uval, key, ps); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 uninitialized_var(curval); + + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) + return -EFAULT; + + /*If user space value changed, let the caller retry */ + return curval != uval ? 
-EAGAIN : 0; +} + /** * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address @@ -963,113 +1004,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, force_take = 0; - u32 uval, newval, curval, vpid = task_pid_vnr(task); - -retry: - ret = lock_taken = 0; + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *match; + int ret; /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. + * Read the user space value first so we can validate a few + * things before proceeding further. */ - newval = vpid; - if (set_waiters) - newval |= FUTEX_WAITERS; - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) + if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* - * Surprise - we got the lock, but we do not trust user space at all. + * Lookup existing state first. If it exists, try to attach to + * its pi_state. */ - if (unlikely(!curval)) { - /* - * We verify whether there is kernel state for this - * futex. If not, we can safely assume, that the 0 -> - * TID transition is correct. If state exists, we do - * not bother to fixup the user space state as it was - * corrupted already. - */ - return futex_top_waiter(hb, key) ? -EINVAL : 1; - } - - uval = curval; + match = futex_top_waiter(hb, key); + if (match) + return attach_to_pi_state(uval, match->pi_state, ps); /* - * Set the FUTEX_WAITERS flag, so the owner will know it has someone - * to wake at the next unlock. + * No waiter and user TID is 0. 
We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. */ - newval = curval | FUTEX_WAITERS; - - /* - * Should we force take the futex? See below. - */ - if (unlikely(force_take)) { + if (!(uval & FUTEX_TID_MASK)) { /* - * Keep the OWNER_DIED and the WAITERS bit and set the - * new TID value. + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. */ - newval = (curval & ~FUTEX_TID_MASK) | vpid; - force_take = 0; - lock_taken = 1; - } + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; - if (unlikely(curval != uval)) - goto retry; + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + /* If the take over worked, return 1 */ + return ret < 0 ? ret : 1; + } /* - * We took the lock due to forced take over. + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. */ - if (unlikely(lock_taken)) - return 1; - + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. */ - ret = lookup_pi_state(uval, hb, key, ps); - - if (unlikely(ret)) { - switch (ret) { - case -ESRCH: - /* - * We failed to find an owner for this - * futex. So we have no pi_state to block - * on. This can happen in two cases: - * - * 1) The owner died - * 2) A stale FUTEX_WAITERS bit - * - * Re-read the futex value. 
- */ - if (get_futex_value_locked(&curval, uaddr)) - return -EFAULT; - - /* - * If the owner died or we have a stale - * WAITERS bit the owner TID in the user space - * futex is 0. - */ - if (!(curval & FUTEX_TID_MASK)) { - force_take = 1; - goto retry; - } - default: - break; - } - } - - return ret; + return attach_to_pi_owner(uval, key, ps); } /** @@ -1186,22 +1183,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) return 0; } -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ - u32 uninitialized_var(oldval); - - /* - * There is no waiter, so we unlock the futex. The owner died - * bit has not to be preserved here. We are the owner: - */ - if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) - return -EFAULT; - if (oldval != uval) - return -EAGAIN; - - return 0; -} - /* * Express the locking dependencies for lockdep: */ @@ -1552,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. - */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1650,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1659,7 +1633,14 @@ retry_private: goto retry; goto out; case -EAGAIN: - /* The owner was exiting, try again. */ + /* + * Two reasons for this: + * - Owner is exiting and we just wait for the + * exit to complete. + * - The user space value changed. 
+ */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1718,7 +1699,7 @@ retry_private: this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, - this->task, 1); + this->task); if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); @@ -1736,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1753,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } @@ -2237,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; @@ -2278,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -2316,8 +2296,10 @@ retry_private: goto uaddr_faulted; case -EAGAIN: /* - * Task is exiting and we just wait for the - * exit to complete. + * Two reasons for this: + * - Task is exiting and we just wait for the + * exit to complete. + * - The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); @@ -2337,9 +2319,9 @@ retry_private: /* * Block on the PI mutex: */ - if (!trylock) - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); - else { + if (!trylock) { + ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); + } else { ret = rt_mutex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 
0 : -EWOULDBLOCK; @@ -2401,10 +2383,10 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - struct futex_hash_bucket *hb; - struct futex_q *this, *next; + u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - u32 uval, vpid = task_pid_vnr(current); + struct futex_hash_bucket *hb; + struct futex_q *match; int ret; retry: @@ -2417,57 +2399,47 @@ retry: return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; + if (ret) + return ret; hb = hash_futex(&key); spin_lock(&hb->lock); /* - * To avoid races, try to do the TID -> 0 atomic transition - * again. If it succeeds then we can return without waking - * anyone else up. We only try this if neither the waiters nor - * the owner died bit are set. - */ - if (!(uval & ~FUTEX_TID_MASK) && - cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) - goto pi_faulted; - /* - * Rare case: we managed to release the lock atomically, - * no need to wake anyone else up: - */ - if (unlikely(uval == vpid)) - goto out_unlock; - - /* - * Ok, other tasks may need to be woken up - check waiters - * and do the wakeup if necessary: + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. 
*/ - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (!match_futex (&this->key, &key)) - continue; - ret = wake_futex_pi(uaddr, uval, this); + match = futex_top_waiter(hb, &key); + if (match) { + ret = wake_futex_pi(uaddr, uval, match); /* - * The atomic access to the futex value - * generated a pagefault, so retry the - * user-access and the wakeup: + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: */ if (ret == -EFAULT) goto pi_faulted; goto out_unlock; } + /* - * No waiters - kernel unlocks the futex: + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We do neither + * preserve the WAITERS bit nor the OWNER_DIED one. We are the + * owner. */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) goto pi_faulted; + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 0 : -EAGAIN; + out_unlock: spin_unlock(&hb->lock); put_futex_key(&key); - -out: return ret; pi_faulted: @@ -2628,6 +2600,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * shared futexes. 
We need to compare the keys: */ if (match_futex(&q.key, &key2)) { + queue_unlock(hb); ret = -EINVAL; goto out_put_keys; } @@ -2669,7 +2642,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); debug_rt_mutex_free_waiter(&rt_waiter); spin_lock(q.lock_ptr); @@ -2980,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - return futex_lock_pi(uaddr, flags, val, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, 0, timeout, 1); + return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8ac4399..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -32,10 +32,13 @@ config GCOV_KERNEL Note that the debugfs filesystem has to be mounted to access profiling data. +config ARCH_HAS_GCOV_PROFILE_ALL + def_bool n + config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE + depends on ARCH_HAS_GCOV_PROFILE_ALL default n ---help--- This options activates profiling for the entire kernel. 
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8de927..752d6486b67e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,33 +1,7 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -# if-lt -# Usage VAR := $(call if-lt, $(a), $(b)) -# Returns 1 if (a < b) -if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) - -ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) - cc-ver := 0304 -else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) - cc-ver := 0407 -else -# Use cc-version if available, otherwise set 0 -# -# scripts/Kbuild.include, which contains cc-version function, is not included -# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" -# Meaning cc-ver is empty causing if-lt test to fail with -# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. -# This has no affect on the clean phase, but the error message could be -# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version -# is not available. We can probably move if-lt to Kbuild.include, so it's also -# not defined during clean or to include Kbuild.include in -# scripts/Makefile.clean. But the following workaround seems least invasive. 
- cc-ver := $(if $(call cc-version),$(call cc-version),0) -endif - -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o - -ifeq ($(call if-lt, $(cc-ver), 0407),1) - obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o -else - obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o -endif +obj-y := base.o fs.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ + gcc_3_4.o, gcc_4_7.o) diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index b358a802fd18..a744098e4eb7 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched.h> #include "gcov.h" static int gcov_events_enabled; @@ -107,8 +108,10 @@ void gcov_enable_events(void) gcov_events_enabled = 1; /* Perform event callback for previously registered entries. */ - while ((info = gcov_info_next(info))) + while ((info = gcov_info_next(info))) { gcov_event(GCOV_ADD, info); + cond_resched(); + } mutex_unlock(&gcov_lock); } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cfa..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,11 +6,9 @@ #include <linux/slab.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - struct group_info *groups_alloc(int gidsetsize) { struct group_info *group_info; @@ -213,6 +211,14 @@ out: return i; } +bool may_setgroups(void) +{ + struct user_namespace 
*user_ns = current_user_ns(); + + return ns_capable(user_ns, CAP_SETGID) && + userns_may_setgroups(user_ns); +} + /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. @@ -223,7 +229,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) return; rcu_read_lock(); - do_each_thread(g, t) { + for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (!--batch_count) { @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); - } while_each_thread(g, t); + } unlock: rcu_read_unlock(); } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for hierarchical irq domains +config IRQ_DOMAIN_HIERARCHY + bool + select IRQ_DOMAIN + +# Generic MSI interrupt support +config GENERIC_MSI_IRQ + bool + +# Generic MSI hierarchical interrupt domain support +config GENERIC_MSI_IRQ_DOMAIN + bool + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + +config HANDLE_DOMAIN_IRQ + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += 
irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o +obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/irqdomain.h> #include <trace/events/irq.h> @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_state_clr_disabled(desc); desc->depth = 0; + irq_domain_activate_irq(&desc->irq_data); if (desc->irq_data.chip->irq_startup) { ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_domain_deactivate_irq(&desc->irq_data); irq_state_set_masked(desc); } @@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc) return irq_wait_for_poll(desc); } +static bool irq_may_run(struct irq_desc *desc) +{ + unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + + /* + * If the interrupt is not in progress and is not an armed + * wakeup interrupt, proceed. + */ + if (!irqd_has_set(&desc->irq_data, mask)) + return true; + + /* + * If the interrupt is an armed wakeup source, mark it pending + * and suspended, disable it and notify the pm core about the + * event. + */ + if (irq_pm_check_wakeup(desc)) + return false; + + /* + * Handle a potential concurrent poll on a different core. + */ + return irq_check_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out; + if (!irq_may_run(desc)) + goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -517,6 +542,7 @@ out: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler @@ -540,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (!irq_may_run(desc)) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out + * If its disabled or no action available then mask it and get + * out of here. 
*/ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; - } + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; } + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ @@ -601,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (!irq_may_run(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out + * If its disabled or no action available then mask it and get + * out of here. */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + goto out_eoi; } + kstat_incr_irqs_this_cpu(irq, desc); do { @@ -669,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = __this_cpu_ptr(action->percpu_dev_id); + void *dev_id = raw_cpu_ptr(action->percpu_dev_id); irqreturn_t res; kstat_incr_irqs_this_cpu(irq, desc); @@ -698,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) { handle = handle_bad_irq; } else { - if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + struct irq_data *irq_data = &desc->irq_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* + * With hierarchical domains we might run into a + * situation where the outermost chip is not yet 
set + * up, but the inner chips are there. Instead of + * bailing we install the handler, but obviously we + * cannot enable/startup the interrupt at this point. + */ + while (irq_data) { + if (irq_data->chip != &no_irq_chip) + break; + /* + * Bail out if the outer chip is not set up + * and the interrupt is supposed to be started + * right away. + */ + if (WARN_ON(is_chained)) + goto out; + /* Try the parent */ + irq_data = irq_data->parent_data; + } +#endif + if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) goto out; } @@ -817,3 +873,121 @@ void irq_cpu_offline(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_chip_ack_parent - Acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_ack(data); +} + +/** + * irq_chip_mask_parent - Mask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask(data); +} + +/** + * irq_chip_unmask_parent - Unmask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_unmask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_unmask(data); +} + +/** + * irq_chip_eoi_parent - Invoke EOI on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_eoi_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_eoi(data); +} + +/** + * irq_chip_set_affinity_parent - Set affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @dest: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Conditional, as the underlying parent chip might not implement it.
+ */ +int irq_chip_set_affinity_parent(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + data = data->parent_data; + if (data->chip->irq_set_affinity) + return data->chip->irq_set_affinity(data, dest, force); + + return -ENOSYS; +} + +/** + * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware + * @data: Pointer to interrupt specific data + * + * Iterate through the domain hierarchy of the interrupt and check + * whether a hw retrigger function exists. If yes, invoke it. + */ +int irq_chip_retrigger_hierarchy(struct irq_data *data) +{ + for (data = data->parent_data; data; data = data->parent_data) + if (data->chip && data->chip->irq_retrigger) + return data->chip->irq_retrigger(data); + + return -ENOSYS; +} + +/** + * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt + * @data: Pointer to interrupt specific data + * @on: Whether to set or reset the wake-up capability of this irq + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) +{ + data = data->parent_data; + if (data->chip->irq_set_wake) + return data->chip->irq_set_wake(data, on); + + return -ENOSYS; +} +#endif + +/** + * irq_chip_compose_msi_msg - Compose MSI message for an irq chip + * @data: Pointer to interrupt specific data + * @msg: Pointer to the MSI message + * + * For hierarchical domains we find the first chip in the hierarchy + * which implements the irq_compose_msi_msg callback. For non + * hierarchical we use the top level chip.
+ */ +int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_data *pos = NULL; + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + for (; data; data = data->parent_data) +#endif + if (data->chip && data->chip->irq_compose_msi_msg) + pos = data; + if (!pos) + return -ENOSYS; + + pos->chip->irq_compose_msi_msg(pos, msg); + + return 0; +} diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef0606797c9..d5d0f7345c54 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as - * request_irq(). IRQs requested with this function will be + * request_threaded_irq(). IRQs requested with this function will be * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2ba21d..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache |= mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache &= ~mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct 
irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.mask); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.mask); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); + irq_reg_writel(gc, mask, ct->regs.eoi); irq_gc_unlock(gc); } @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static u32 irq_readl_be(void __iomem *addr) +{ + return ioread32be(addr); +} + +static void irq_writel_be(u32 val, void __iomem *addr) +{ + iowrite32be(val, addr); +} + static void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } ct[i].mask_cache = mskptr; if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); + *mskptr = irq_reg_readl(gc, mskreg); } } @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, dgc->gc[i] = gc = tmp; irq_init_generic_chip(gc, name, num_ct, i 
* irqs_per_chip, NULL, handler); + gc->domain = d; + if (gcflags & IRQ_GC_BE_IO) { + gc->reg_readl = &irq_readl_be; + gc->reg_writel = &irq_writel_be; + } + raw_spin_lock_irqsave(&gc_lock, flags); list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); @@ -341,8 +357,8 @@ static struct lock_class_key irq_nested_lock_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain */ -static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw_irq) +int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain_chip_generic *dgc = d->gc; @@ -394,6 +410,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } +EXPORT_SYMBOL_GPL(irq_map_generic_chip); struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 099ea2e0eb88..df553b0af936 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -63,8 +63,8 @@ enum { extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); @@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } +extern void irq_lock_sparse(void); +extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); +static inline void 
irq_lock_sparse(void) { } +static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); @@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } + +#ifdef CONFIG_PM_SLEEP +bool irq_pm_check_wakeup(struct irq_desc *desc); +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); +#else +static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } +static inline void +irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } +static inline void +irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } +#endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..99793b9b6d23 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -14,6 +14,7 @@ #include <linux/kernel_stat.h> #include <linux/radix-tree.h> #include <linux/bitmap.h> +#include <linux/irqdomain.h> #include "internals.h" @@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc) static inline void free_masks(struct irq_desc *desc) { } #endif +void irq_lock_sparse(void) +{ + mutex_lock(&sparse_irq_lock); +} + +void irq_unlock_sparse(void) +{ + mutex_unlock(&sparse_irq_lock); +} + static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) { struct irq_desc *desc; @@ -167,6 +178,12 @@ static void free_desc(unsigned int irq) unregister_irq_proc(irq, desc); + /* + * sparse_irq_lock protects also show_interrupts() and + * kstat_irq_usr(). Once we deleted the descriptor from the + * sparse tree we can free it. Access in proc will fail to + * lookup the descriptor. 
+ */ mutex_lock(&sparse_irq_lock); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/** + * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @lookup: Whether to perform the domain lookup or not + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq = hwirq; + int ret = 0; + + irq_enter(); + +#ifdef CONFIG_IRQ_DOMAIN + if (lookup) + irq = irq_find_mapping(domain, hwirq); +#endif + + /* + * Some hardware gives randomly wrong interrupts. Rather + * than crashing, do something sensible. + */ + if (unlikely(!irq || irq >= nr_irqs)) { + ack_bad_irq(irq); + ret = -EINVAL; + } else { + generic_handle_irq(irq); + } + + irq_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif + /* Dynamic interrupt handling */ /** @@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq) kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } +/** + * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu + * @irq: The interrupt number + * @cpu: The cpu number + * + * Returns the sum of interrupt counts on @cpu since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. 
+ */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); @@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +/** + * kstat_irqs - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. The caller must ensure that the interrupt is not removed + * concurrently. + */ unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } + +/** + * kstat_irqs_usr - Get the statistics for an interrupt + * @irq: The interrupt number + * + * Returns the sum of interrupt counts on all cpus since boot for + * @irq. Contrary to kstat_irqs() this can be called from any + * preemptible context. It's protected against concurrent removal of + * an interrupt descriptor when sparse irqs are enabled. 
+ */ +unsigned int kstat_irqs_usr(unsigned int irq) +{ + int sum; + + irq_lock_sparse(); + sum = kstat_irqs(irq); + irq_unlock_sparse(); + return sum; +} diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e32e05..7fac311057b8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); +static void irq_domain_check_hierarchy(struct irq_domain *domain); + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates and initialize and irq_domain structure. @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; + irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. 
- * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also @@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, domain = __irq_domain_add(of_node, first_hwirq + size, first_hwirq + size, 0, ops, host_data); - if (!domain) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); + if (domain) + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } @@ -231,7 +234,7 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; @@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int hint; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - hint = hwirq % nr_irqs; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); - if (virq <= 0) - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, + of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; + int virq; domain = irq_data->np ? 
irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { @@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) return 0; } - /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); - if (!virq) - return virq; + if (irq_domain_is_hierarchy(domain)) { + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) + return virq; + + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + if (virq <= 0) + return 0; + } else { + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; + } /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && @@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; if (hwirq < domain->revmap_direct_max_irq) { - data = irq_get_irq_data(hwirq); - if (data && (data->domain == domain) && (data->hwirq == hwirq)) + data = irq_domain_get_irq_data(domain, hwirq); + if (data && data->hwirq == hwirq) return hwirq; } @@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +static int irq_domain_alloc_descs(int virq, unsigned int cnt, + irq_hw_number_t hwirq, int node) +{ + unsigned int hint; + + if (virq >= 0) { + virq = irq_alloc_descs(virq, virq, cnt, node); + } else { + hint = hwirq % nr_irqs; + if (hint == 0) + hint++; + virq = irq_alloc_descs_from(hint, cnt, node); + if (virq <= 0 && hint > 1) + virq = irq_alloc_descs_from(1, cnt, node); + } + + return virq; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * @parent: Parent irq domain to associate with the new domain + * @flags: Irq domain flags associated to the domain + * @size: Size of the domain. 
See below + * @node: Optional device-tree node of the interrupt controller + * @ops: Pointer to the interrupt domain callbacks + * @host_data: Controller private data pointer + * + * If @size is 0 a tree domain is created, otherwise a linear domain. + * + * If successful the parent is associated to the new domain and the + * domain flags are set. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct device_node *node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + if (size) + domain = irq_domain_add_linear(node, size, ops, host_data); + else + domain = irq_domain_add_tree(node, ops, host_data); + if (domain) { + domain->parent = parent; + domain->flags |= flags; + } + + return domain; +} + +static void irq_domain_insert_irq(int virq) +{ + struct irq_data *data; + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, data); + mutex_unlock(&revmap_trees_mutex); + } + + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && data->chip) + domain->name = data->chip->name; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); +} + +static void irq_domain_remove_irq(int virq) +{ + struct irq_data *data; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + irq_set_chip_and_handler(virq, NULL, NULL); + synchronize_irq(virq); + smp_mb(); + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); 
+ radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + } + } +} + +static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, + struct irq_data *child) +{ + struct irq_data *irq_data; + + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + if (irq_data) { + child->parent_data = irq_data; + irq_data->irq = child->irq; + irq_data->node = child->node; + irq_data->domain = domain; + } + + return irq_data; +} + +static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data, *tmp; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + tmp = irq_data->parent_data; + irq_data->parent_data = NULL; + irq_data->domain = NULL; + + while (tmp) { + irq_data = tmp; + tmp = tmp->parent_data; + kfree(irq_data); + } + } +} + +static int irq_domain_alloc_irq_data(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct irq_domain *parent; + int i; + + /* The outermost irq_data is embedded in struct irq_desc */ + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + irq_data->domain = domain; + + for (parent = domain->parent; parent; parent = parent->parent) { + irq_data = irq_domain_insert_irq_data(parent, irq_data); + if (!irq_data) { + irq_domain_free_irq_data(virq, i + 1); + return -ENOMEM; + } + } + } + + return 0; +} + +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data; + + for (irq_data = irq_get_irq_data(virq); irq_data; + irq_data = irq_data->parent_data) + if (irq_data->domain == domain) + return irq_data; + + return NULL; +} + +/** + * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain + * @domain: Interrupt 
domain to match + * @virq: IRQ number + * @hwirq: The hwirq number + * @chip: The associated interrupt chip + * @chip_data: The associated chip data + */ +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + if (!irq_data) + return -ENOENT; + + irq_data->hwirq = hwirq; + irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip_data = chip_data; + + return 0; +} + +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); + __irq_set_handler(virq, handler, 0, handler_name); + irq_set_handler_data(virq, handler_data); +} + +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} + +/** + * irq_domain_free_irqs_common - Clear irq_data and free the parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) 
{ + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data) + irq_domain_reset_irq_data(irq_data); + } + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +/** + * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_handler_data(virq + i, NULL); + irq_set_handler(virq + i, NULL); + } + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static bool irq_domain_is_auto_recursive(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; +} + +static void irq_domain_free_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs) +{ + domain->ops->free(domain, irq_base, nr_irqs); + if (irq_domain_is_auto_recursive(domain)) { + BUG_ON(!domain->parent); + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); + } +} + +static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) +{ + int ret = 0; + struct irq_domain *parent = domain->parent; + bool recursive = irq_domain_is_auto_recursive(domain); + + BUG_ON(recursive && !parent); + if (recursive) + ret = irq_domain_alloc_irqs_recursive(parent, irq_base, + nr_irqs, arg); + if (ret >= 0) + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0 && recursive) + irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); + + return ret; +} + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ number if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc:
IRQ descriptors have already been allocated if true + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program hardwares with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc) +{ + int i, ret, virq; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + + if (realloc && irq_base >= 0) { + virq = irq_base; + } else { + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + if (virq < 0) { + pr_debug("cannot allocate IRQ(base %d, count %d)\n", + irq_base, nr_irqs); + return virq; + } + } + + if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { + pr_debug("cannot allocate memory for IRQ%d\n", virq); + ret = -ENOMEM; + goto out_free_desc; + } + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + if (ret < 0) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + for (i = 0; i < nr_irqs; i++) + irq_domain_insert_irq(virq + i); + mutex_unlock(&irq_domain_mutex); + + return virq; + +out_free_irq_data: + irq_domain_free_irq_data(virq, nr_irqs); +out_free_desc: + irq_free_descs(virq, nr_irqs); + return ret; +} + +/** + * irq_domain_free_irqs - Free IRQ number and associated data structures + * @virq: base IRQ number + * @nr_irqs: number of 
IRQs to free
+ */
+void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *data = irq_get_irq_data(virq);
+	int i;
+
+	if (WARN(!data || !data->domain || !data->domain->ops->free,
+		 "NULL pointer, cannot free irq\n"))
+		return;
+
+	mutex_lock(&irq_domain_mutex);
+	for (i = 0; i < nr_irqs; i++)
+		irq_domain_remove_irq(virq + i);
+	irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+	mutex_unlock(&irq_domain_mutex);
+
+	irq_domain_free_irq_data(virq, nr_irqs);
+	irq_free_descs(virq, nr_irqs);
+}
+
+/**
+ * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @irq_base:	Base IRQ number
+ * @nr_irqs:	Number of IRQs to allocate
+ * @arg:	Allocation data (arch/domain specific)
+ *
+ * Check whether the domain has been set up as auto-recursive. If not,
+ * allocate through the parent domain.
+ */
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+				 unsigned int irq_base, unsigned int nr_irqs,
+				 void *arg)
+{
+	/* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
+	if (irq_domain_is_auto_recursive(domain))
+		return 0;
+
+	domain = domain->parent;
+	if (domain)
+		return irq_domain_alloc_irqs_recursive(domain, irq_base,
+						       nr_irqs, arg);
+	return -ENOSYS;	/* no parent domain to allocate from */
+}
+
+/**
+ * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @irq_base:	Base IRQ number
+ * @nr_irqs:	Number of IRQs to free
+ *
+ * Check whether the domain has been set up as auto-recursive. If not,
+ * free through the parent domain.
+ */
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+				 unsigned int irq_base, unsigned int nr_irqs)
+{
+	/* Auto-recursive: irq_domain_free_irqs_recursive() calls parent's free */
+	if (!irq_domain_is_auto_recursive(domain) && domain->parent)
+		irq_domain_free_irqs_recursive(domain->parent, irq_base,
+					       nr_irqs);
+}
+
+/**
+ * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
+ *			     interrupt
+ * @irq_data:	outermost irq_data associated with interrupt
+ *
+ * This is the second step of IRQ setup: call domain_ops->activate, parent
+ * data first, to program the controllers so the interrupt can be delivered.
+ */
+void irq_domain_activate_irq(struct irq_data *irq_data)
+{
+	if (irq_data && irq_data->domain) {
+		struct irq_domain *domain = irq_data->domain;
+
+		if (irq_data->parent_data)	/* parents before this level */
+			irq_domain_activate_irq(irq_data->parent_data);
+		if (domain->ops->activate)	/* callback is optional */
+			domain->ops->activate(domain, irq_data);
+	}
+}
+
+/**
+ * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
+ *			       deactivate interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * It calls domain_ops->deactivate to program interrupt controllers to disable
+ * interrupt delivery.
+ */ +void irq_domain_deactivate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); + if (irq_data->parent_data) + irq_domain_deactivate_irq(irq_data->parent_data); + } +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ + /* Hierarchy irq_domains must implement callback alloc() */ + if (domain->ops->alloc) + domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; +} +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + return (irq_data && irq_data->domain == domain) ? irq_data : NULL; +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ +} +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61bf06a..e68932bb308e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Do not use this for shutdown scenarios where you must be sure * that all parts (hardirq and threaded handler) have completed. * + * Returns: false if a threaded handler is active. + * * This function may be called - with care - from IRQ context. 
*/ -void synchronize_hardirq(unsigned int irq) +bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) + if (desc) { __synchronize_hardirq(desc); + return !atomic_read(&desc->threads_active); + } + + return true; } EXPORT_SYMBOL(synchronize_hardirq); @@ -183,6 +189,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); @@ -242,6 +249,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); + /* set the initial affinity to prevent every interrupt being on CPU0 */ + if (m) + __irq_set_affinity(irq, m, false); return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); @@ -382,14 +392,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +void __disable_irq(struct irq_desc *desc, unsigned int irq) { - if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) - return; - desc->istate |= IRQS_SUSPENDED; - } - if (!desc->depth++) irq_disable(desc); } @@ -401,7 +405,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq, false); + __disable_irq(desc, irq); irq_put_desc_busunlock(desc, flags); return 0; } @@ -442,20 +446,34 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); -void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) +/** + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. 
+ * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this function while + * holding a resource the hard IRQ handler may need you will deadlock. + * + * When used to optimistically disable an interrupt from atomic context + * the return value must be checked. + * + * Returns: false if a threaded handler is active. + * + * This function may be called - with care - from IRQ context. + */ +bool disable_hardirq(unsigned int irq) { - if (resume) { - if (!(desc->istate & IRQS_SUSPENDED)) { - if (!desc->action) - return; - if (!(desc->action->flags & IRQF_FORCE_RESUME)) - return; - /* Pretend that it got disabled ! */ - desc->depth++; - } - desc->istate &= ~IRQS_SUSPENDED; - } + if (!__disable_irq_nosync(irq)) + return synchronize_hardirq(irq); + + return false; +} +EXPORT_SYMBOL_GPL(disable_hardirq); +void __enable_irq(struct irq_desc *desc, unsigned int irq) +{ switch (desc->depth) { case 0: err_out: @@ -497,7 +515,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); out: irq_put_desc_busunlock(desc, flags); } @@ -618,6 +636,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); @@ -1218,6 +1237,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; *old_ptr = new; + irq_pm_install_action(desc, new); + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -1228,7 +1249,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); } 
raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -1336,6 +1357,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; + irq_pm_remove_action(desc, action); + /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_shutdown(desc); @@ -1483,8 +1506,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). + * + * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and + * it cannot be set along with IRQF_NO_SUSPEND. */ - if ((irqflags & IRQF_SHARED) && !dev_id) + if (((irqflags & IRQF_SHARED) && !dev_id) || + (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || + ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); @@ -1770,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } + +/** + * irq_get_irqchip_state - returns the irqchip state of a interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: One of IRQCHIP_STATE_* the caller wants to know about + * @state: a pointer to a boolean where the state is to be storeed + * + * This call snapshots the internal irqchip state of an + * interrupt, returning into @state the bit corresponding to + * stage @which + * + * This function should be called with preemption disabled if the + * interrupt controller has per-cpu registers. 
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+			  bool *state)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int err = -EINVAL;	/* result if no chip provides the callback */
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return err;
+
+	data = irq_desc_get_irq_data(desc);
+
+	do {
+		chip = irq_data_get_irq_chip(data);
+		if (chip->irq_get_irqchip_state)
+			break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		data = data->parent_data;	/* try the parent level next */
+#else
+		data = NULL;	/* no hierarchy: nothing else to try */
+#endif
+	} while (data);
+
+	if (data)	/* non-NULL only when the loop left via break */
+		err = chip->irq_get_irqchip_state(data, which, state);
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
+
+/**
+ *	irq_set_irqchip_state - set the state of a forwarded interrupt.
+ *	@irq: Interrupt line that is forwarded to a VM
+ *	@which: State to be restored (one of IRQCHIP_STATE_*)
+ *	@val: Value corresponding to @which
+ *
+ *	This call sets the internal irqchip state of an interrupt,
+ *	depending on the value of @which.
+ *
+ *	This function should be called with preemption disabled if the
+ *	interrupt controller has per-cpu registers.
+ */ +int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, + bool val) +{ + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + unsigned long flags; + int err = -EINVAL; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return err; + + data = irq_desc_get_irq_data(desc); + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_set_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_set_irqchip_state(data, which, val); + + irq_put_desc_busunlock(desc, flags); + return err; +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..474de5cb394d --- /dev/null +++ b/kernel/irq/msi.c @@ -0,0 +1,337 @@ +/* + * linux/kernel/irq/msi.c + * + * Copyright (C) 2014 Intel Corp. + * Author: Jiang Liu <jiang.liu@linux.intel.com> + * + * This file is licensed under GPLv2. + * + * This file contains common code to support Message Signalled Interrupt for + * PCI compatible and non PCI compatible devices. 
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+/* Temporary solution for building, will be removed later */
+#include <linux/pci.h>
+
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+	*msg = entry->msg;	/* struct copy of the cached message */
+}
+
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct msi_desc *entry = irq_get_msi_desc(irq);
+
+	__get_cached_msi_msg(entry, msg);	/* entry is not NULL-checked */
+}
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static inline void irq_chip_write_msi_msg(struct irq_data *data,
+					  struct msi_msg *msg)
+{
+	data->chip->irq_write_msi_msg(data, msg);
+}
+
+/**
+ * msi_domain_set_affinity - Generic affinity setter function for MSI domains
+ * @irq_data:	The irq data associated to the interrupt
+ * @mask:	The affinity mask to set
+ * @force:	Flag to enforce setting (disable online checks)
+ *
+ * Intended to be used by MSI interrupt controllers which are
+ * implemented with hierarchical domains.
+ */
+int msi_domain_set_affinity(struct irq_data *irq_data,
+			    const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	struct msi_msg msg;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+		BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+		irq_chip_write_msi_msg(irq_data, &msg);
+	}
+
+	return ret;
+}
+
+static void msi_domain_activate(struct irq_domain *domain,
+				struct irq_data *irq_data)
+{
+	struct msi_msg msg;
+
+	BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+	irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static void msi_domain_deactivate(struct irq_domain *domain,
+				  struct irq_data *irq_data)
+{
+	struct msi_msg msg;
+
+	memset(&msg, 0, sizeof(msg));	/* write an all-zero message */
+	irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			    unsigned int nr_irqs, void *arg)
+{
+	struct msi_domain_info *info = domain->host_data;
+	struct msi_domain_ops *ops = info->ops;
+	irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
+	int i, ret;
+
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
+		if (ret < 0) {
+			if (ops->msi_free) {
+				for (i--; i >= 0; i--)	/* undo every msi_init, index 0 too */
+					ops->msi_free(domain, info, virq + i);
+			}
+			irq_domain_free_irqs_top(domain, virq, nr_irqs);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+			    unsigned int nr_irqs)
+{
+	struct msi_domain_info *info = domain->host_data;
+	int i;
+
+	if (info->ops->msi_free) {
+		for (i = 0; i < nr_irqs; i++)
+			info->ops->msi_free(domain, info, virq + i);
+	}
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+	.alloc		= msi_domain_alloc,
+	.free		= msi_domain_free,
+	.activate	= msi_domain_activate,
+	.deactivate	= msi_domain_deactivate,
+};
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
+						msi_alloc_info_t *arg)
+{
+	return arg->hwirq;
+}
+
+static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
+				  int nvec, msi_alloc_info_t *arg)
+{
+	memset(arg, 0, sizeof(*arg));
+	return 0;
+}
+
+static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
+				    struct msi_desc *desc)
+{
+	arg->desc = desc;
+}
+#else
+#define msi_domain_ops_get_hwirq	NULL
+#define msi_domain_ops_prepare		NULL
+#define msi_domain_ops_set_desc		NULL
+#endif /* !GENERIC_MSI_DOMAIN_OPS */
+
+static int msi_domain_ops_init(struct irq_domain *domain,
+			       struct msi_domain_info *info,
+			       unsigned int virq, irq_hw_number_t hwirq,
+			       msi_alloc_info_t *arg)
+{
+	irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
+				      info->chip_data);
+	if (info->handler && info->handler_name) {
+		__irq_set_handler(virq, info->handler, 0, info->handler_name);
+		if (info->handler_data)
+			irq_set_handler_data(virq, info->handler_data);
+	}
+	return 0;
+}
+
+static int msi_domain_ops_check(struct irq_domain *domain,
+				struct msi_domain_info *info,
+				struct device *dev)
+{
+	return 0;	/* default check accepts every device */
+}
+
+static struct msi_domain_ops msi_domain_ops_default = {
+	.get_hwirq	= msi_domain_ops_get_hwirq,
+	.msi_init	= msi_domain_ops_init,
+	.msi_check	= msi_domain_ops_check,
+	.msi_prepare	= msi_domain_ops_prepare,
+	.set_desc	= msi_domain_ops_set_desc,
+};
+
+static void msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+	struct msi_domain_ops *ops = info->ops;
+
+	if (ops == NULL) {
+		info->ops = &msi_domain_ops_default;
+		return;
+	}
+
+	if (ops->get_hwirq == NULL)	/* fill only the callbacks left unset */
+		ops->get_hwirq = msi_domain_ops_default.get_hwirq;
+	if (ops->msi_init == NULL)
+		ops->msi_init = msi_domain_ops_default.msi_init;
+	if (ops->msi_check == NULL)
+		ops->msi_check = msi_domain_ops_default.msi_check;
+	if (ops->msi_prepare == NULL)
+		ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+	if (ops->set_desc == NULL)
+		ops->set_desc = msi_domain_ops_default.set_desc;
+}
+
+static void msi_domain_update_chip_ops(struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
+
+	BUG_ON(!chip);
+	if (!chip->irq_mask)
+		chip->irq_mask = pci_msi_mask_irq;
+	if (!chip->irq_unmask)
+		chip->irq_unmask = pci_msi_unmask_irq;
+	if (!chip->irq_set_affinity)
+		chip->irq_set_affinity = msi_domain_set_affinity;
+}
+
+/**
+ * msi_create_irq_domain - Create a MSI interrupt domain
+ * @node:	Optional device-tree node of the interrupt controller
+ * @info:	MSI domain info
+ * @parent:	Parent irq domain
+ */
+struct irq_domain *msi_create_irq_domain(struct device_node *node,
+					 struct msi_domain_info *info,
+					 struct irq_domain *parent)
+{
+	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+		msi_domain_update_dom_ops(info);
+	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+		msi_domain_update_chip_ops(info);
+
+	return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
+					info);
+}
+
+/**
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain:	The domain to allocate from
+ * @dev:	Pointer to device struct of the device for which the interrupts
+ *		are allocated
+ * @nvec:	The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
+ */
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+			  int nvec)
+{
+	struct msi_domain_info *info = domain->host_data;
+	struct msi_domain_ops *ops = info->ops;
+	msi_alloc_info_t arg;
+	struct msi_desc *desc;
+	int i, ret, virq = -1;
+
+	ret = ops->msi_check(domain, info, dev);
+	if (ret == 0)
+		ret = ops->msi_prepare(domain, dev, nvec, &arg);
+	if (ret)
+		return ret;
+
+	for_each_msi_entry(desc, dev) {
+		ops->set_desc(&arg, desc);
+		if (info->flags & MSI_FLAG_IDENTITY_MAP)
+			virq = (int)ops->get_hwirq(info, &arg);
+		else
+			virq = -1;
+
+		virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+					       dev_to_node(dev), &arg, false);
+		if (virq < 0) {
+			ret = -ENOSPC;
+			if (ops->handle_error)
+				ret = ops->handle_error(domain, desc, ret);
+			if (ops->msi_finish)
+				ops->msi_finish(&arg, ret);
+			return ret;
+		}
+
+		for (i = 0; i < desc->nvec_used; i++)
+			irq_set_msi_desc_off(virq, i, desc);
+	}
+
+	if (ops->msi_finish)
+		ops->msi_finish(&arg, 0);
+
+	for_each_msi_entry(desc, dev) {	/* report each entry's own IRQ range */
+		if (desc->nvec_used == 1)
+			dev_dbg(dev, "irq %d for MSI\n", desc->irq);
+		else
+			dev_dbg(dev, "irq [%d-%d] for MSI\n",
+				desc->irq, desc->irq + desc->nvec_used - 1);
+	}
+
+	return 0;
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain:	The domain managing the interrupts
+ * @dev:	Pointer to device struct of the device for which the interrupts
+ *		are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+	struct msi_desc *desc;
+
+	for_each_msi_entry(desc, dev) {
+		/*
+		 * We might have failed to allocate an MSI early
+		 * enough that there is no IRQ associated to this
+		 * entry. If that's the case, don't do anything.
+		 */
+		if (desc->irq) {
+			irq_domain_free_irqs(desc->irq, desc->nvec_used);
+			desc->irq = 0;
+		}
+	}
+}
+
+/**
+ * msi_get_domain_info - Get the MSI interrupt domain info for @domain
+ * @domain:	The interrupt domain to retrieve data from
+ *
+ * Returns the pointer to the msi_domain_info stored in
+ * @domain->host_data.
+ */
+struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
+{
+	return (struct msi_domain_info *)domain->host_data;
+}
+
+#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..5204a6d1b985 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,110 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/suspend.h>
 #include <linux/syscore_ops.h>
 
 #include "internals.h"
 
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+	if (irqd_is_wakeup_armed(&desc->irq_data)) {
+		irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+		desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+		desc->depth++;
+		irq_disable(desc);
+		pm_system_wakeup();
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+	desc->nr_actions++;
+
+	if (action->flags & IRQF_FORCE_RESUME)
+		desc->force_resume_depth++;
+
+	WARN_ON_ONCE(desc->force_resume_depth &&
+		     desc->force_resume_depth != desc->nr_actions);
+
+	if (action->flags & IRQF_NO_SUSPEND)
+		desc->no_suspend_depth++;
+	else if (action->flags & IRQF_COND_SUSPEND)
+		desc->cond_suspend_depth++;
+
+	WARN_ON_ONCE(desc->no_suspend_depth &&
+		     (desc->no_suspend_depth +
+			desc->cond_suspend_depth) != desc->nr_actions);
+}
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */ +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions--; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth--; + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth--; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth--; +} + +static bool suspend_device_irq(struct irq_desc *desc, int irq) +{ + if (!desc->action || desc->no_suspend_depth) + return false; + + if (irqd_is_wakeup_set(&desc->irq_data)) { + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + /* + * We return true here to force the caller to issue + * synchronize_irq(). We need to make sure that the + * IRQD_WAKEUP_ARMED is visible before we return from + * suspend_device_irqs(). + */ + return true; + } + + desc->istate |= IRQS_SUSPENDED; + __disable_irq(desc, irq); + + /* + * Hardware which has no wakeup source configuration facility + * requires that the non wakeup interrupts are masked at the + * chip level. The chip implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + return true; +} + /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device drivers need to be prevented - * from receiving interrupts and this function is provided for this purpose. - * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQS_SUSPENDED flag for each of them. + * During system-wide suspend or hibernation device drivers need to be + * prevented from receiving interrupts and this function is provided + * for this purpose. + * + * So we disable all interrupts and mark them IRQS_SUSPENDED except + * for those which are unused, those which are marked as not + * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND + * set and those which are marked as active wakeup sources. 
+ * + * The active wakeup sources are handled by the flow handler entry + * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the + * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { @@ -28,18 +121,36 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; + bool sync; raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, true); + sync = suspend_device_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - } - for_each_irq_desc(irq, desc) - if (desc->istate & IRQS_SUSPENDED) + if (sync) synchronize_irq(irq); + } } EXPORT_SYMBOL_GPL(suspend_device_irqs); +static void resume_irq(struct irq_desc *desc, int irq) +{ + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + + if (desc->istate & IRQS_SUSPENDED) + goto resume; + + /* Force resume the interrupt? */ + if (!desc->force_resume_depth) + return; + + /* Pretend that it got disabled ! */ + desc->depth++; +resume: + desc->istate &= ~IRQS_SUSPENDED; + __enable_irq(desc, irq); +} + static void resume_irqs(bool want_early) { struct irq_desc *desc; @@ -54,7 +165,7 @@ static void resume_irqs(bool want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq, true); + resume_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -93,38 +204,3 @@ void resume_device_irqs(void) resume_irqs(false); } EXPORT_SYMBOL_GPL(resume_device_irqs); - -/** - * check_wakeup_irqs - check if any wake-up interrupts are pending - */ -int check_wakeup_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - /* - * Only interrupts which are marked as wakeup source - * and have not been disabled before the suspend check - * can abort suspend. 
- */ - if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->depth == 1 && desc->istate & IRQS_PENDING) - return -EBUSY; - continue; - } - /* - * Check the non wakeup interrupts whether they need - * to be masked before finally going into suspend - * state. That's for hardware which has no wakeup - * source configuration facility. The chip - * implementation indicates that with - * IRQCHIP_MASK_ON_SUSPEND. - */ - if (desc->istate & IRQS_SUSPENDED && - irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) - mask_irq(desc); - } - - return 0; -} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ac1ba2f11032..df2f4642d1e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -15,6 +15,23 @@ #include "internals.h" +/* + * Access rules: + * + * procfs protects read/write of /proc/irq/N/ files against a + * concurrent free of the interrupt descriptor. remove_proc_entry() + * immediately prevents new read/writes to happen and waits for + * already running read/write functions to complete. + * + * We remove the proc entries first and then delete the interrupt + * descriptor from the radix tree and free it. So it is guaranteed + * that irq_to_desc(N) is valid as long as the read/writes are + * permitted by procfs. + * + * The read from /proc/interrupts is a different problem because there + * is no protection. So the lookup and the access to irqdesc + * information must be protected by sparse_irq_lock. 
+ */ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP @@ -29,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v) mask = desc->pending_mask; #endif if (type) - seq_cpumask_list(m, mask); + seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); else - seq_cpumask(m, mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); return 0; } @@ -50,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) cpumask_copy(mask, desc->affinity_hint); raw_spin_unlock_irqrestore(&desc->lock, flags); - seq_cpumask(m, mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return 0; @@ -169,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, irq_default_affinity); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); return 0; } @@ -437,9 +451,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } + irq_lock_sparse(); desc = irq_to_desc(i); if (!desc) - return 0; + goto outsparse; raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) @@ -479,6 +494,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: raw_spin_unlock_irqrestore(&desc->lock, flags); +outsparse: + irq_unlock_sparse(); return 0; } #endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index a82170e2fa78..cbf9fb899d92 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -16,11 +16,12 @@ #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/smp.h> #include <asm/processor.h> -static DEFINE_PER_CPU(struct llist_head, irq_work_list); -static DEFINE_PER_CPU(int, irq_work_raised); +static DEFINE_PER_CPU(struct llist_head, raised_list); +static DEFINE_PER_CPU(struct llist_head, lazy_list); /* * Claim the entry so that no one else will poke at it. 
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void) */ } +#ifdef CONFIG_SMP /* - * Enqueue the irq_work @entry unless it's already pending + * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ +bool irq_work_queue_on(struct irq_work *work, int cpu) +{ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return false; + + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); + + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue_on); +#endif + +/* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ @@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work) /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - - /* - * If the work is not "lazy" or the tick is stopped, raise the irq - * work interrupt (if supported by the arch), otherwise, just wait - * for the next tick. 
- */ - if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { - if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } @@ -90,11 +111,14 @@ EXPORT_SYMBOL_GPL(irq_work_queue); bool irq_work_needs_cpu(void) { - struct llist_head *this_list; + struct llist_head *raised, *lazy; - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) - return false; + raised = this_cpu_ptr(&raised_list); + lazy = this_cpu_ptr(&lazy_list); + + if (llist_empty(raised) || arch_irq_work_has_interrupt()) + if (llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -102,28 +126,18 @@ bool irq_work_needs_cpu(void) return true; } -static void __irq_work_run(void) +static void irq_work_run_list(struct llist_head *list) { unsigned long flags; struct irq_work *work; - struct llist_head *this_list; struct llist_node *llnode; + BUG_ON(!irqs_disabled()); - /* - * Reset the "raised" state right before we check the list because - * an NMI may enqueue after we find the list empty from the runner. - */ - __this_cpu_write(irq_work_raised, 0); - barrier(); - - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) + if (llist_empty(list)) return; - BUG_ON(!irqs_disabled()); - - llnode = llist_del_all(this_list); + llnode = llist_del_all(list); while (llnode != NULL) { work = llist_entry(llnode, struct irq_work, llnode); @@ -149,16 +163,25 @@ static void __irq_work_run(void) } /* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. 
+ * hotplug calls this through: + * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { - BUG_ON(!in_irq()); - __irq_work_run(); + irq_work_run_list(this_cpu_ptr(&raised_list)); + irq_work_run_list(this_cpu_ptr(&lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); +void irq_work_tick(void) +{ + struct llist_head *raised = this_cpu_ptr(&raised_list); + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. @@ -171,35 +194,3 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); - -#ifdef CONFIG_HOTPLUG_CPU -static int irq_work_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_DYING: - /* Called from stop_machine */ - if (WARN_ON_ONCE(cpu != smp_processor_id())) - break; - __irq_work_run(); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_notify; - -static __init int irq_work_init_cpu_notifier(void) -{ - cpu_notify.notifier_call = irq_work_cpu_notify; - cpu_notify.priority = 0; - register_cpu_notifier(&cpu_notify); - return 0; -} -device_initcall(irq_work_init_cpu_notifier); - -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..5c5987f10819 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); @@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file) * using get_symbol_offset for every symbol. 
*/ struct kallsym_iter *iter; - int ret; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); - ret = seq_open(file, &kallsyms_op); - if (ret == 0) - ((struct seq_file *)file->private_data)->private = iter; - else - kfree(iter); - return ret; + return 0; } #ifdef CONFIG_KGDB_KDB diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0fe61c3..0aa69ea1d8fd 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { - long ret; + long t1, t2; - ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + t1 = kptr_obfuscate((long)v1, type); + t2 = kptr_obfuscate((long)v2, type); - return (ret < 0) | ((ret > 0) << 1); + return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..38c25b1f2fd5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -40,6 +42,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. 
+ */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +#ifdef CONFIG_KEXEC_FILE +static int kexec_calculate_store_digests(struct kimage *image); +#endif + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. 
The caller is @@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. 
+ */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, 
nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +#ifdef CONFIG_KEXEC_FILE +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a 
controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; } + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of 
type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. 
Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, 
unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -583,8 +856,6 @@ static int kimage_set_destination(struct kimage *image, destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; return result; } @@ -596,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page) page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; return result; } @@ 
-609,7 +878,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -663,6 +932,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -732,7 +1009,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1068,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -826,7 +1107,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -834,7 +1119,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; 
+ if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -851,10 +1139,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -877,7 +1169,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -886,7 +1183,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -984,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > 0) { unsigned long i; - /* Loading another kernel to reboot into */ - if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); - /* Loading another kernel to switch to if this one crashes */ - else if (flags & KEXEC_ON_CRASH) { - /* Free any current crash dump kernel before + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before * we corrupt it. 
*/ + kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); + } else { + /* Loading another kernel to reboot into. */ + + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); } if (result) goto out; @@ -1077,6 +1380,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. 
+ */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1376,7 +1758,6 @@ static __initdata char *suffix_tbl[] = { */ static int __init parse_crashkernel_suffix(char *cmdline, unsigned long long *crash_size, - unsigned long long *crash_base, const char *suffix) { char *cur = cmdline; @@ -1465,7 +1846,7 @@ static int __init __parse_crashkernel(char *cmdline, if (suffix) return parse_crashkernel_suffix(ck_cmdline, crash_size, - crash_base, suffix); + suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1632,6 +2013,672 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = 
min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. 
+ */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. 
+ */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto 
out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. 
Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. 
+ */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + 
+ for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. 
+ */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective architecture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr 
*sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. 
+ */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..2777f40a9c7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -47,13 +47,6 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -/* - * kmod_thread_locker is used for deadlock avoidance. There is no explicit - * locking to protect this global - it is private to the singleton khelper - * thread and should only ever be modified by that thread. - */ -static const struct task_struct *kmod_thread_locker; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...) 
EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ @@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } -static int call_helper(void *data) -{ - /* Worker thread started blocking khelper thread. 
*/ - kmod_thread_locker = current; - return ____call_usermodehelper(data); -} - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - /* CLONE_VFORK: wait until the usermode helper has execve'd - * successfully We need the data structures to stay around - * until that is done. */ - if (wait == UMH_WAIT_PROC) + if (sub_info->wait & UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else { - pid = kernel_thread(call_helper, sub_info, - CLONE_VFORK | SIGCHLD); - /* Worker thread stopped blocking khelper thread. 
*/ - kmod_thread_locker = NULL; - } - - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; + else + pid = kernel_thread(____call_usermodehelper, sub_info, + SIGCHLD); - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } /* - * Worker thread must not wait for khelper thread at below - * wait_for_completion() if the thread was created with CLONE_VFORK - * flag, for khelper thread is already waiting for the thread at - * wait_for_completion() in do_fork(). + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. */ - if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { - retval = -EBUSY; - goto out; - } - - sub_info->complete = &done; + sub_info->complete = (wait == UMH_NO_WAIT) ? 
NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..c90e417bb963 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } @@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; - unoptimize_kprobe(p, false); /* Try to unoptimize */ + /* Try to unoptimize */ + unoptimize_kprobe(p, kprobes_all_disarmed); if (!kprobe_queued(p)) { arch_disarm_kprobe(p); @@ -915,7 +916,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, - .flags = FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; static int kprobe_ftrace_enabled; @@ -1410,16 +1411,10 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +int __weak arch_check_ftrace_location(struct kprobe *p) { - int ret = 0; unsigned long ftrace_addr; - /* - * If the address is located on a ftrace nop, set the - * breakpoint to the following instruction. 
- */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { #ifdef CONFIG_KPROBES_ON_FTRACE @@ -1431,7 +1426,17 @@ static int check_kprobe_address_safe(struct kprobe *p, return -EINVAL; #endif } + return 0; +} +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) +{ + int ret; + + ret = arch_check_ftrace_location(p); + if (ret) + return ret; jump_label_lock(); preempt_disable(); @@ -1567,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p, true); + /* + * If kprobes_all_disarmed is set, orig_p + * should have already been disarmed, so + * skip unneed disarming process. + */ + if (!kprobes_all_disarmed) + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1778,7 +1789,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /*TODO: consider to only swap the RA after the last pre_handler fired */ + /* + * To avoid deadlocks, prohibit return probing in NMI contexts, + * just skip the probe and increase the (inexact) 'nmissed' + * statistical counter, so that the user is informed that + * something happened: + */ + if (unlikely(in_nmi())) { + rp->nmissed++; + return 0; + } + + /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { @@ -2305,6 +2327,12 @@ static void arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* + * optimize_kprobe() called by arm_kprobe() checks + * kprobes_all_disarmed, so set kprobes_all_disarmed before + * arm_kprobe. 
+ */ + kprobes_all_disarmed = false; /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -2313,7 +2341,6 @@ static void arm_all_kprobes(void) arm_kprobe(p); } - kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); already_enabled: diff --git a/kernel/kthread.c b/kernel/kthread.c index c2390f41307b..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; @@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker, list_add_tail(&work->node, pos); work->worker = worker; - if (likely(worker->task)) + if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..045022557936 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config HAVE_LIVEPATCH + bool + help + Arch supports kernel live patching + +config LIVEPATCH + bool "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on HAVE_LIVEPATCH + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. 
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..e8780c0901d9 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVEPATCH) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..284e2691e380 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,1003 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/ftrace.h> +#include <linux/list.h> +#include <linux/kallsyms.h> +#include <linux/livepatch.h> + +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). 
+ * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct + */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; + +/* + * The klp_mutex protects the global lists and state transitions of any + * structure reachable from them. References to any structure must be obtained + * under mutex protection (except in klp_ftrace_handler(), which uses RCU to + * ensure it gets consistent data). + */ +static DEFINE_MUTEX(klp_mutex); + +static LIST_HEAD(klp_patches); +static LIST_HEAD(klp_ops); + +static struct kobject *klp_root_kobj; + +static struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + struct module *mod; + + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We do not want to block removal of patched modules and therefore + * we do not take a reference here. The patches are removed by + * a going module handler instead. + */ + mod = find_module(obj->name); + /* + * Do not mess work of the module coming and going notifiers. + * Note that the patch might still be needed before the going handler + * is called. Module functions can be called even in the GOING state + * until mod->exit() finishes. This is especially important for + * patches that modify semantic of the functions. 
+ */ + if (mod && mod->klp_alive) + obj->mod = mod; + + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. 
+ */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). 
+ */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + if (sym) { + *addr = sym->value; + preempt_enable(); + return 0; + } + preempt_enable(); + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct pt_regs *regs) +{ + struct klp_ops *ops; + struct klp_func *func; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + if (WARN_ON_ONCE(!func)) + goto unlock; + + klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); +} + +static void klp_disable_func(struct klp_func *func) +{ + struct klp_ops *ops; + + WARN_ON(func->state != 
KLP_ENABLED); + WARN_ON(!func->old_addr); + + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return; + + if (list_is_singular(&ops->func_stack)) { + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } + + func->state = KLP_DISABLED; +} + +static int klp_enable_func(struct klp_func *func) +{ + struct klp_ops *ops; + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ops = klp_find_ops(func->old_addr); + if (!ops) { + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + goto err; + } + + + } else { + list_add_rcu(&func->stack_node, &ops->func_stack); + } + + func->state = KLP_ENABLED; + + return 0; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + return ret; +} + +static void klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name; func++) + if (func->state == KLP_ENABLED) + klp_disable_func(func); + + obj->state = KLP_DISABLED; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if 
(WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) { + klp_disable_object(obj); + return ret; + } + } + obj->state = KLP_ENABLED; + + return 0; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + + /* enforce stacking: only the last enabled patch can be disabled */ + if (!list_is_last(&patch->list, &klp_patches) && + list_next_entry(patch, list)->state == KLP_ENABLED) + return -EBUSY; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state == KLP_ENABLED) + klp_disable_object(obj); + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + /* enforce stacking: only the first disabled patch can be enabled */ + if (patch->list.prev != &klp_patches && + list_prev_entry(patch, list)->state == KLP_DISABLED) + return -EBUSY; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if 
(!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/<patch> + * /sys/kernel/livepatch/<patch>/enabled + * /sys/kernel/livepatch/<patch>/<object> + * /sys/kernel/livepatch/<patch>/<object>/<func> + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, 
PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. 
+ */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + INIT_LIST_HEAD(&func->stack_node); + func->state = KLP_DISABLED; + + return kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, "%s", func->old_name); +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + obj->mod = NULL; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? 
obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, "%s", patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add_tail(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + klp_disable_object(obj); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + /* + * Each module has to know that the notifier has been 
called. + * We never know what module will get patched by a new patch. + */ + if (action == MODULE_STATE_COMING) + mod->klp_alive = true; + else /* MODULE_STATE_GOING */ + mod->klp_alive = false; + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = klp_check_compiler_support(); + if (ret) { + pr_info("Your compiler is too old; turning off.\n"); + return -EINVAL; + } + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + unregister_module_notifier(&klp_module_nb); + return ret; +} + +module_init(klp_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..de7a416cca2a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,11 +1,11 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o @@ -14,6 +14,7 @@ ifeq 
($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d24e4339b46d..a0831e1b99f4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +#ifdef CONFIG_LOCK_STAT printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +#endif } static int save_trace(struct stack_trace *trace) @@ -549,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock) static void print_lock(struct held_lock *hlock) { - print_lock_name(hlock_class(hlock)); + /* + * We can be called locklessly through debug_show_all_locks() so be + * extra careful, the hlock might have been released and cleared. 
+ */ + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ + barrier(); + + if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + printk("<RELEASED>\n"); + return; + } + + print_lock_name(lock_classes + class_idx - 1); printk(", at: "); print_ip_sym(hlock->acquire_ip); } @@ -631,7 +647,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -698,10 +714,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) hash_head = classhashentry(key); /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: + * We do an RCU walk of the hash, see lockdep_free_key_range(). */ - list_for_each_entry(class, hash_head, hash_entry) { + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return NULL; + + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -726,7 +744,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; - unsigned long flags; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -748,28 +767,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry(class, hash_head, hash_entry) + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; + } + /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); return NULL; } - raw_local_irq_restore(flags); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); @@ -796,7 +813,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) @@ -804,15 +820,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\n"); dump_stack(); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); - raw_local_irq_restore(flags); out_set_class_cache: if (!subclass || force) @@ -868,11 +881,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, entry->distance = distance; entry->trace = *trace; /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, 
it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * Both allocation and removal are done under the graph lock; but + * iteration is under RCU-sched; see look_up_lock_class() and + * lockdep_free_key_range(). */ list_add_tail_rcu(&entry->entry, head); @@ -1023,7 +1034,9 @@ static int __bfs(struct lock_list *source_entry, else head = &lock->class->locks_before; - list_for_each_entry(entry, head, entry) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; mark_lock_accessed(entry, lock); @@ -2020,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry(chain, hash_head, entry) { + list_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2994,8 +3007,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (unlikely(!debug_locks)) return; - if (subclass) + if (subclass) { + unsigned long flags; + + if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; register_lock_class(lock, subclass, 1); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + } } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -3885,9 +3908,17 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one sync_sched() before getting here, so we're guaranteed + * nobody will look up these exact classes -- they're properly dead but still + * allocated. 
+ */ void lockdep_free_key_range(void *start, unsigned long size) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i; @@ -3903,7 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3914,11 +3945,25 @@ void lockdep_free_key_range(void *start, unsigned long size) if (locked) graph_unlock(); raw_local_irq_restore(flags); + + /* + * Wait for any possible iterators from look_up_lock_class() to pass + * before continuing to free the memory they refer to. + * + * sync_sched() is sufficient because the read-side is IRQ disable. + */ + synchronize_sched(); + + /* + * XXX at this point we could return the resources to the pool; + * instead we leak them. We would need to change to bitmap allocators + * instead of the linear allocators we have now. + */ } void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i, j; @@ -3946,7 +3991,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0955b885d0dc..ec8cce259779 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -20,30 +20,20 @@ * Author: Paul E. McKenney <paulmck@us.ibm.com> * Based on kernel/rcu/torture.c. 
*/ -#include <linux/types.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> -#include <linux/err.h> #include <linux/spinlock.h> +#include <linux/rwlock.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> #include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/freezer.h> -#include <linux/cpu.h> #include <linux/delay.h> -#include <linux/stat.h> #include <linux/slab.h> -#include <linux/trace_clock.h> -#include <asm/byteorder.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, + "Number of read-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); @@ -66,30 +58,28 @@ torture_param(bool, verbose, true, static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, - "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); - -static atomic_t n_lock_torture_errors; + "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); static struct task_struct *stats_task; static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; -static int nrealwriters_stress; static bool lock_is_write_held; +static bool lock_is_read_held; -struct lock_writer_stress_stats { - long n_write_lock_fail; - long n_write_lock_acquired; +struct lock_stress_stats { + long n_lock_fail; + long n_lock_acquired; }; -static struct lock_writer_stress_stats *lwsa; #if defined(MODULE) #define 
LOCKTORTURE_RUNNABLE_INIT 1 #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif -int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; -module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); +int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); /* Forward reference. */ static void lock_torture_cleanup(void); @@ -102,12 +92,25 @@ struct lock_torture_ops { int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); void (*writeunlock)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *trsp); + void (*readunlock)(void); unsigned long flags; const char *name; }; -static struct lock_torture_ops *cur_ops; - +struct lock_torture_cxt { + int nrealwriters_stress; + int nrealreaders_stress; + bool debug_lock; + atomic_t n_lock_torture_errors; + struct lock_torture_ops *cur_ops; + struct lock_stress_stats *lwsa; /* writer statistics */ + struct lock_stress_stats *lrsa; /* reader statistics */ +}; +static struct lock_torture_cxt cxt = { 0, 0, false, + ATOMIC_INIT(0), + NULL, NULL}; /* * Definitions for lock torture testing. */ @@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. 
*/ #endif } @@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, .writeunlock = torture_lock_busted_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "lock_busted" }; @@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) * we want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); if (!(torture_random(trsp) % - (nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_spin_lock_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "spin_lock" }; static int torture_spin_lock_write_lock_irq(void) -__acquires(torture_spinlock_irq) +__acquires(torture_spinlock) { unsigned long flags; spin_lock_irqsave(&torture_spinlock, flags); - cur_ops->flags = flags; + cxt.cur_ops->flags = flags; return 0; } static void torture_lock_spin_write_unlock_irq(void) __releases(torture_spinlock) { - spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); + spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); } static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_lock_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + 
.readunlock = NULL, .name = "spin_lock_irq" }; +static DEFINE_RWLOCK(torture_rwlock); + +static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +{ + write_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +{ + write_unlock(&torture_rwlock); +} + +static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +{ + read_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_read_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 10; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. 
+ */ + if (!(torture_random(trsp) % + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +{ + read_unlock(&torture_rwlock); +} + +static struct lock_torture_ops rw_lock_ops = { + .writelock = torture_rwlock_write_lock, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock, + .readlock = torture_rwlock_read_lock, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock, + .name = "rw_lock" +}; + +static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + write_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_write_unlock_irq(void) +__releases(torture_rwlock) +{ + write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); +} + +static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + read_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_read_unlock_irq(void) +__releases(torture_rwlock) +{ + read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); /* pairs with read_lock_irqsave() */ +} + +static struct lock_torture_ops rw_lock_irq_ops = { + .writelock = torture_rwlock_write_lock_irq, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock_irq, + .readlock = torture_rwlock_read_lock_irq, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock_irq, + .name = "rw_lock_irq" +}; + +static DEFINE_MUTEX(torture_mutex); + +static int torture_mutex_lock(void) __acquires(torture_mutex) +{ + mutex_lock(&torture_mutex); + return 0; +} + +static void torture_mutex_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. 
*/ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 5); + else + mdelay(longdelay_ms / 5); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_mutex_unlock(void) __releases(torture_mutex) +{ + mutex_unlock(&torture_mutex); +} + +static struct lock_torture_ops mutex_lock_ops = { + .writelock = torture_mutex_lock, + .write_delay = torture_mutex_delay, + .writeunlock = torture_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "mutex_lock" +}; + +static DECLARE_RWSEM(torture_rwsem); +static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +{ + down_write(&torture_rwsem); + return 0; +} + +static void torture_rwsem_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 10); + else + mdelay(longdelay_ms / 10); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rwsem_up_write(void) __releases(torture_rwsem) +{ + up_write(&torture_rwsem); +} + +static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +{ + down_read(&torture_rwsem); + return 0; +} + +static void torture_rwsem_read_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. 
*/ + if (!(torture_random(trsp) % + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 2); + else + mdelay(longdelay_ms / 2); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rwsem_up_read(void) __releases(torture_rwsem) +{ + up_read(&torture_rwsem); +} + +static struct lock_torture_ops rwsem_lock_ops = { + .writelock = torture_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .writeunlock = torture_rwsem_up_write, + .readlock = torture_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_rwsem_up_read, + .name = "rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. */ static int lock_torture_writer(void *arg) { - struct lock_writer_stress_stats *lwsp = arg; + struct lock_stress_stats *lwsp = arg; static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); @@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cur_ops->writelock(); + + cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_write_lock_fail++; + lwsp->n_lock_fail++; lock_is_write_held = 1; - lwsp->n_write_lock_acquired++; - cur_ops->write_delay(&rand); + if (WARN_ON_ONCE(lock_is_read_held)) + lwsp->n_lock_fail++; /* rare, but... */ + + lwsp->n_lock_acquired++; + cxt.cur_ops->write_delay(&rand); lock_is_write_held = 0; - cur_ops->writeunlock(); + cxt.cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_writer"); @@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg) } /* + * Lock torture reader kthread. Repeatedly acquires and releases + * the reader lock. 
+ */ +static int lock_torture_reader(void *arg) +{ + struct lock_stress_stats *lrsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_reader task started"); + set_user_nice(current, MAX_NICE); + + do { + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); + + cxt.cur_ops->readlock(); + lock_is_read_held = 1; + if (WARN_ON_ONCE(lock_is_write_held)) + lrsp->n_lock_fail++; /* rare, but... */ + + lrsp->n_lock_acquired++; + cxt.cur_ops->read_delay(&rand); + lock_is_read_held = 0; + cxt.cur_ops->readunlock(); + + stutter_wait("lock_torture_reader"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_reader"); + return 0; +} + +/* * Create an lock-torture-statistics message in the specified buffer. */ -static void lock_torture_printk(char *page) +static void __torture_print_stats(char *page, + struct lock_stress_stats *statp, bool write) { bool fail = 0; - int i; + int i, n_stress; long max = 0; - long min = lwsa[0].n_write_lock_acquired; + long min = statp[0].n_lock_acquired; long long sum = 0; - for (i = 0; i < nrealwriters_stress; i++) { - if (lwsa[i].n_write_lock_fail) + n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; + for (i = 0; i < n_stress; i++) { + if (statp[i].n_lock_fail) fail = true; - sum += lwsa[i].n_write_lock_acquired; - if (max < lwsa[i].n_write_lock_fail) - max = lwsa[i].n_write_lock_fail; - if (min > lwsa[i].n_write_lock_fail) - min = lwsa[i].n_write_lock_fail; + sum += statp[i].n_lock_acquired; + if (max < statp[i].n_lock_fail) + max = statp[i].n_lock_fail; + if (min > statp[i].n_lock_fail) + min = statp[i].n_lock_fail; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); page += sprintf(page, - "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + write ? "Writes" : "Reads ", sum, max, min, max / 2 > min ? "???" : "", fail, fail ? "!!!" 
: ""); if (fail) - atomic_inc(&n_lock_torture_errors); + atomic_inc(&cxt.n_lock_torture_errors); } /* @@ -274,18 +533,35 @@ static void lock_torture_printk(char *page) */ static void lock_torture_stats_print(void) { - int size = nrealwriters_stress * 200 + 8192; + int size = cxt.nrealwriters_stress * 200 + 8192; char *buf; + if (cxt.cur_ops->readlock) + size += cxt.nrealreaders_stress * 200 + 8192; + buf = kmalloc(size, GFP_KERNEL); if (!buf) { pr_err("lock_torture_stats_print: Out of memory, need: %d", size); return; } - lock_torture_printk(buf); + + __torture_print_stats(buf, cxt.lwsa, true); pr_alert("%s", buf); kfree(buf); + + if (cxt.cur_ops->readlock) { + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + + __torture_print_stats(buf, cxt.lrsa, false); + pr_alert("%s", buf); + kfree(buf); + } } /* @@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealwriters_stress, stat_interval, verbose, - shuffle_interval, stutter, shutdown_secs, + "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, cxt.debug_lock ? 
" [debug]": "", + cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, + verbose, shuffle_interval, stutter, shutdown_secs, onoff_interval, onoff_holdoff); } @@ -322,46 +599,59 @@ static void lock_torture_cleanup(void) { int i; - if (torture_cleanup()) + if (torture_cleanup_begin()) return; if (writer_tasks) { - for (i = 0; i < nrealwriters_stress; i++) + for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } + if (reader_tasks) { + for (i = 0; i < cxt.nrealreaders_stress; i++) + torture_stop_kthread(lock_torture_reader, + reader_tasks[i]); + kfree(reader_tasks); + reader_tasks = NULL; + } + torture_stop_kthread(lock_torture_stats, stats_task); lock_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (atomic_read(&n_lock_torture_errors)) - lock_torture_print_module_parms(cur_ops, + if (atomic_read(&cxt.n_lock_torture_errors)) + lock_torture_print_module_parms(cxt.cur_ops, "End of test: FAILURE"); else if (torture_onoff_failures()) - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: LOCK_HOTPLUG"); else - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } static int __init lock_torture_init(void) { - int i; + int i, j; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &lock_busted_ops, + &spin_lock_ops, &spin_lock_irq_ops, + &rw_lock_ops, &rw_lock_irq_ops, + &mutex_lock_ops, + &rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. 
*/ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) + cxt.cur_ops = torture_ops[i]; + if (strcmp(torture_type, cxt.cur_ops->name) == 0) break; } if (i == ARRAY_SIZE(torture_ops)) { @@ -374,31 +664,69 @@ static int __init lock_torture_init(void) torture_init_end(); return -EINVAL; } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + if (cxt.cur_ops->init) + cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nwriters_stress >= 0) - nrealwriters_stress = nwriters_stress; + cxt.nrealwriters_stress = nwriters_stress; else - nrealwriters_stress = 2 * num_online_cpus(); - lock_torture_print_module_parms(cur_ops, "Start of test"); + cxt.nrealwriters_stress = 2 * num_online_cpus(); + +#ifdef CONFIG_DEBUG_MUTEXES + if (strncmp(torture_type, "mutex", 5) == 0) + cxt.debug_lock = true; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + if ((strncmp(torture_type, "spin", 4) == 0) || + (strncmp(torture_type, "rw_lock", 7) == 0)) + cxt.debug_lock = true; +#endif /* Initialize the statistics so that each run gets its own numbers. */ lock_is_write_held = 0; - lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); - if (lwsa == NULL) { - VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { - lwsa[i].n_write_lock_fail = 0; - lwsa[i].n_write_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; } - /* Start up the kthreads. */ + if (cxt.cur_ops->readlock) { + if (nreaders_stress >= 0) + cxt.nrealreaders_stress = nreaders_stress; + else { + /* + * By default distribute evenly the number of + * readers and writers. 
We still run the same number + * of threads as the writer-only locks default. + */ + if (nwriters_stress < 0) /* user doesn't care */ + cxt.nrealwriters_stress = num_online_cpus(); + cxt.nrealreaders_stress = cxt.nrealwriters_stress; + } + + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } + } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); + /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); @@ -422,18 +750,51 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { - firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + + if (cxt.cur_ops->readlock) { + reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } + + /* + * Create the kthreads and start torturing (oh, those poor little locks). + * + * TODO: Note that we interleave writers with readers, giving writers a + * slight advantage, by creating its kthread first. This can be modified + * for very specific needs, or even let the user choose the policy, if + * ever wanted. 
+ */ + for (i = 0, j = 0; i < cxt.nrealwriters_stress || + j < cxt.nrealreaders_stress; i++, j++) { + if (i >= cxt.nrealwriters_stress) + goto create_reader; + + /* Create writer. */ + firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], writer_tasks[i]); if (firsterr) goto unwind; + + create_reader: + if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) + continue; + /* Create reader. */ + firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], + reader_tasks[j]); + if (firsterr) + goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 74356dc0ce29..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -27,7 +27,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - arch_mutex_cpu_relax(); \ + cpu_relax_lowlatency(); \ } while (0) #endif @@ -56,9 +56,6 @@ do { \ * If the lock has already been acquired, then this will proceed to spin * on this node->locked until the previous lock holder sets the node->locked * in mcs_spin_unlock(). - * - * We don't inline mcs_spin_lock() so that perf can correctly account for the - * time spent in this lock function. */ static inline void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) @@ -81,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) */ return; } - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* Wait until the lock holder passes the lock down. 
*/ arch_mcs_spin_lock_contended(&node->locked); @@ -94,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) static inline void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) { - struct mcs_spinlock *next = ACCESS_ONCE(node->next); + struct mcs_spinlock *next = READ_ONCE(node->next); if (likely(!next)) { /* @@ -103,28 +100,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) if (likely(cmpxchg(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); + while (!(next = READ_ONCE(node->next))) + cpu_relax_lowlatency(); } /* Pass lock to next waiter. */ arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. - */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); } /* * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug * mutexes so that we can do it here after we've verified state. 
*/ + mutex_clear_owner(lock); atomic_set(&lock->count, 1); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index acca2c1a3c5e..4cccea6b8934 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -15,7 +15,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #include <linux/mutex.h> #include <linux/ww_mutex.h> @@ -25,7 +25,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include "mcs_spinlock.h" +#include <linux/osq_lock.h> /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -46,12 +46,6 @@ # include <asm/mutex.h> #endif -/* - * A negative mutex count indicates that waiters are sleeping waiting for the - * mutex. - */ -#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) - void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -87,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); * The mutex must later on be released by the same task that * acquired it. Recursive locking is not allowed. The task * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with + * memory where the mutex resides must not be freed with * the mutex still locked. The mutex must first be initialized * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. @@ -112,56 +106,146 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. 
+ * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + /* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. + * After acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. 
+ */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. + */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} /* - * Mutex spinning code migrated from kernel/sched/core.c + * After acquiring lock in the slowpath set ctx and wake up any + * waiters so they can recheck. + * + * Callers must hold the mutex wait_lock. */ - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +static __always_inline void +ww_mutex_set_context_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) { - if (lock->owner != owner) - return false; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + lock->ctx = ctx; /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. */ - barrier(); - - return owner->on_cpu; + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } } +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. 
*/ static noinline -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) { + bool ret = true; + rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; break; + } - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } rcu_read_unlock(); - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; + return ret; } /* @@ -176,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = ACCESS_ONCE(lock->owner); + owner = READ_ONCE(lock->owner); if (owner) retval = owner->on_cpu; rcu_read_unlock(); @@ -186,6 +270,140 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ return retval; } + +/* + * Atomically try to take the lock when it is available + */ +static inline bool mutex_try_to_acquire(struct mutex *lock) +{ + return !mutex_is_locked(lock) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1); +} + +/* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that the lock owner + * is currently running on a (different) CPU and while we don't + * need to reschedule. The rationale is that if the lock owner is + * running, it is likely to release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. 
+ * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. + * + * Returns true when the lock was taken, otherwise false, indicating + * that we need to jump to the slowpath and sleep. + */ +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + struct task_struct *task = current; + + if (!mutex_can_spin_on_owner(lock)) + goto done; + + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto done; + + while (true) { + struct task_struct *owner; + + if (use_ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (READ_ONCE(ww->ctx)) + break; + } + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* Try to acquire the mutex if it is unlocked. 
*/ + if (mutex_try_to_acquire(lock)) { + lock_acquired(&lock->dep_map, ip); + + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + + mutex_set_owner(lock); + osq_unlock(&lock->osq); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + + osq_unlock(&lock->osq); +done: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) { + /* + * We _should_ have TASK_RUNNING here, but just in case + * we do not, make it so, otherwise we might get stuck. 
+ */ + __set_current_state(TASK_RUNNING); + schedule_preempt_disabled(); + } + + return false; +} +#else +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + return false; +} #endif __visible __used noinline @@ -260,10 +478,10 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); if (!hold_ctx) return 0; @@ -283,91 +501,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) return 0; } -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! 
- */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -/* - * after acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - unsigned long flags; - struct mutex_waiter *cur; - - ww_mutex_lock_acquired(lock, ctx); - - lock->ctx = ctx; - - /* - * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* ^^^ */ - - /* - * Check if lock is contended, if not there is nobody to wake up - */ - if (likely(atomic_read(&lock->base.count) == 0)) - return; - - /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. - */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); -} - /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -384,109 +517,19 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - /* - * Optimistic spinning. - * - * We try to spin for acquisition when we find that there are no - * pending waiters and the lock owner is currently running on a - * (different) CPU. 
- * - * The rationale is that if the lock owner is running, it is likely to - * release the lock soon. - * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * - * The mutex spinners are queued up using MCS lock so that only one - * spinner can compete for the mutex. However, if mutex spinning isn't - * going to happen, there is no point in going through the lock/unlock - * overhead. - */ - if (!mutex_can_spin_on_owner(lock)) - goto slowpath; - - if (!osq_lock(&lock->osq)) - goto slowpath; - - for (;;) { - struct task_struct *owner; - - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (ACCESS_ONCE(ww->ctx)) - break; - } - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - if ((atomic_read(&lock->count) == 1) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { - lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); - osq_unlock(&lock->osq); - preempt_enable(); - return 0; - } + if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + /* got the lock, yay! */ + preempt_enable(); + return 0; + } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. 
If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; + spin_lock_mutex(&lock->wait_lock, flags); - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - arch_mutex_cpu_relax(); - } - osq_unlock(&lock->osq); -slowpath: /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock the mutex. This avoids getting - * scheduled out right after we obtained the mutex. + * Once more, try to acquire the lock. Only try-lock the mutex if + * it is unlocked to reduce unnecessary xchg() operations. */ - if (need_resched()) - schedule_preempt_disabled(); -#endif - spin_lock_mutex(&lock->wait_lock, flags); - - /* once more, can we acquire the lock? */ - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -506,9 +549,10 @@ slowpath: * it's unlocked. Later on, if we sleep, this is the * operation that gives us the lock. We xchg it to -1, so * that when we release the lock, we properly wake up the - * other waiters: + * other waiters. 
We only attempt the xchg if the count is + * non-negative in order to avoid unnecessary xchg operations: */ - if (MUTEX_SHOW_NO_WAITER(lock) && + if (atomic_read(&lock->count) >= 0 && (atomic_xchg(&lock->count, -1) == 1)) break; @@ -522,7 +566,7 @@ slowpath: } if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; } @@ -534,6 +578,8 @@ slowpath: schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + __set_task_state(task, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -547,23 +593,7 @@ skip_wait: if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + ww_mutex_set_context_slowpath(ww, ww_ctx); } spin_unlock_mutex(&lock->wait_lock, flags); @@ -682,15 +712,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +__mutex_unlock_common_slowpath(struct mutex *lock, int nested) { - struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; /* - * some architectures leave the lock unlocked in the fastpath failure + * As a performance measurement, release the lock before doing other + * wakeup related duties to follow. 
This allows other tasks to acquire + * the lock sooner, while still handling cleanups in past unlock calls. + * This can be done as we do not enforce strict equivalence between the + * mutex counter and wait_list. + * + * + * Some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to - * unlock it here + * unlock it here - as the lock counter is currently 0 or negative. */ if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); @@ -719,7 +755,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) __visible void __mutex_unlock_slowpath(atomic_t *lock_count) { - __mutex_unlock_common_slowpath(lock_count, 1); + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_unlock_common_slowpath(lock, 1); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -823,6 +861,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) unsigned long flags; int prev; + /* No need to trylock if the mutex is locked. 
*/ + if (mutex_is_locked(lock)) + return 0; + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,7 +16,7 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline void mutex_set_owner(struct mutex *lock) { lock->owner = current; diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c index be9ee1559fca..dc85ee23a26f 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/osq_lock.c @@ -1,10 +1,6 @@ - #include <linux/percpu.h> -#include <linux/mutex.h> #include <linux/sched.h> -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP +#include <linux/osq_lock.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -79,7 +75,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return next; @@ -102,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* * Normally @prev is untouchable after the above store; because at that @@ -113,14 +109,14 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!smp_load_acquire(&node->locked)) { + while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ if (need_resched()) goto unqueue; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } return true; @@ -146,13 +142,13 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); /* * Or we race against a concurrent unqueue()'s step-B, in which * case its step-C will write us a new @node->prev pointer. 
*/ - prev = ACCESS_ONCE(node->prev); + prev = READ_ONCE(node->prev); } /* @@ -174,8 +170,8 @@ unqueue: * it will wait in Step-A. */ - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; + WRITE_ONCE(next->prev, prev); + WRITE_ONCE(prev->next, next); return false; } @@ -197,14 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) node = this_cpu_ptr(&osq_node); next = xchg(&node->next, NULL); if (next) { - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); return; } next = osq_wait_next(lock, node, NULL); if (next) - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); } - -#endif - diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fb5b8ac411a5..f956ede7f90d 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,7 +20,6 @@ #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/mutex.h> #include <asm/qrwlock.h> /** @@ -35,7 +34,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = smp_load_acquire((u32 *)&lock->cnts); } } @@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock) * to make sure that the write lock isn't taken. 
*/ while (atomic_read(&lock->cnts) & _QW_WMASK) - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); @@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) cnts | _QW_WAITING) == cnts)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } /* When no more readers, set the locked flag */ @@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } unlock: arch_spin_unlock(&lock->lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 49b2ed3dced8..62b6cee8ea7f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task) * the deadlock. We print when we return. act_waiter can be NULL in * case of a remove waiter operation. */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, +void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *act_waiter, struct rt_mutex *lock) { struct task_struct *task; - if (!debug_locks || detect || !act_waiter) + if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index ab29b6a22669..d0519c3432b6 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, +extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter, struct 
rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); # define debug_rt_mutex_reset_waiter(w) \ do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk walk) { return (waiter != NULL); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fc605941b9b8..b73279367087 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,7 +8,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> #include <linux/export.h> @@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task) } /* + * Deadlock detection is conditional: + * + * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted + * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. + * + * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always + * conducted independent of the detect argument. + * + * If the waiter argument is NULL this indicates the deboost path and + * deadlock detection is disabled independent of the detect argument + * and the config settings. + */ +static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) +{ + /* + * This is just a wrapper function for the following call, + * because debug_rt_mutex_detect_deadlock() smells like a magic + * debug feature and I wanted to keep the cond function in the + * main source file along with the comments instead of having + * two of the same in the headers. 
+ */ + return debug_rt_mutex_detect_deadlock(waiter, chwalk); +} + +/* * Max number of times we'll walk the boosting chain: */ int max_lock_depth = 1024; @@ -323,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * * @task: the task owning the mutex (owner) for which a chain walk is * probably needed - * @deadlock_detect: do we have to carry out deadlock detection? + * @chwalk: do we have to carry out deadlock detection? * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck * things for a task that has just got its priority adjusted, and * is waiting on a mutex) @@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * @top_task: the current top waiter * * Returns 0 or -EDEADLK. + * + * Chain walk basics and protection scope + * + * [R] refcount on task + * [P] task->pi_lock held + * [L] rtmutex->wait_lock held + * + * Step Description Protected by + * function arguments: + * @task [R] + * @orig_lock if != NULL @top_task is blocked on it + * @next_lock Unprotected. Cannot be + * dereferenced. Only used for + * comparison. 
+ * @orig_waiter if != NULL @top_task is blocked on it + * @top_task current, or in case of proxy + * locking protected by calling + * code + * again: + * loop_sanity_check(); + * retry: + * [1] lock(task->pi_lock); [R] acquire [P] + * [2] waiter = task->pi_blocked_on; [P] + * [3] check_exit_conditions_1(); [P] + * [4] lock = waiter->lock; [P] + * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] + * unlock(task->pi_lock); release [P] + * goto retry; + * } + * [6] check_exit_conditions_2(); [P] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P] + [L] + * [8] unlock(task->pi_lock); release [P] + * put_task_struct(task); release [R] + * [9] check_exit_conditions_3(); [L] + * [10] task = owner(lock); [L] + * get_task_struct(task); [L] acquire [R] + * lock(task->pi_lock); [L] acquire [P] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] + * [12] check_exit_conditions_4(); [P] + [L] + * [13] unlock(task->pi_lock); release [P] + * unlock(lock->wait_lock); release [L] + * goto again; */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, + enum rtmutex_chainwalk chwalk, struct rt_mutex *orig_lock, struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { - struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; + struct rt_mutex_waiter *prerequeue_top_waiter; + int ret = 0, depth = 0; + struct rt_mutex *lock; + bool detect_deadlock; unsigned long flags; + bool requeue = true; - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); + detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); /* * The (de)boosting is a step by step approach with a lot of @@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * carefully whether things change under us. */ again: + /* + * We limit the lock chain length for each invocation. 
+ */ if (++depth > max_lock_depth) { static int prev_max; @@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return -EDEADLK; } + + /* + * We are fully preemptible here and only hold the refcount on + * @task. So everything can have changed under us since the + * caller or our own code below (goto retry/again) dropped all + * locks. + */ retry: /* - * Task can not go away as we did a get_task() before ! + * [1] Task cannot go away as we did a get_task() before ! */ raw_spin_lock_irqsave(&task->pi_lock, flags); + /* + * [2] Get the waiter on which @task is blocked on. + */ waiter = task->pi_blocked_on; + + /* + * [3] check_exit_conditions_1() protected by task->pi_lock. + */ + /* * Check whether the end of the boosting chain has been * reached or the state of the chain has changed while we @@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* * If deadlock detection is off, we stop here if we - * are not the top pi waiter of the task. + * are not the top pi waiter of the task. If deadlock + * detection is enabled we continue, but stop the + * requeueing in the chain walk. */ - if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) - goto out_unlock_pi; + if (top_waiter != task_top_pi_waiter(task)) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } } /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. + * If the waiter priority is the same as the task priority + * then there is no further priority adjustment necessary. If + * deadlock detection is off, we stop the chain walk. If its + * enabled we continue, but stop the requeueing in the chain + * walk. 
*/ - if (!detect_deadlock && waiter->prio == task->prio) - goto out_unlock_pi; + if (waiter->prio == task->prio) { + if (!detect_deadlock) + goto out_unlock_pi; + else + requeue = false; + } + /* + * [4] Get the next lock + */ lock = waiter->lock; + /* + * [5] We need to trylock here as we are holding task->pi_lock, + * which is the reverse lock order versus the other rtmutex + * operations. + */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); @@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* + * [6] check_exit_conditions_2() protected by task->pi_lock and + * lock->wait_lock. + * * Deadlock detection. If the lock is the same as the original * lock which caused us to walk the lock chain or if the * current lock is owned by the task which initiated the chain * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; goto out_unlock_pi; } - top_waiter = rt_mutex_top_waiter(lock); + /* + * If we just follow the lock chain for deadlock detection, no + * need to do all the requeue operations. To avoid a truckload + * of conditionals around the various places below, just do the + * minimum chain walk checks. + */ + if (!requeue) { + /* + * No requeue[7] here. Just release @task [8] + */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * If there is no owner of the lock, end of chain. + */ + if (!rt_mutex_owner(lock)) { + raw_spin_unlock(&lock->wait_lock); + return 0; + } + + /* [10] Grab the next task, i.e. owner of @lock */ + task = rt_mutex_owner(lock); + get_task_struct(task); + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* + * No requeue [11] here. 
We just do deadlock detection. + * + * [12] Store whether owner is blocked + * itself. Decision is made after dropping the locks + */ + next_lock = task_blocked_on_lock(task); + /* + * Get the top waiter for the next iteration + */ + top_waiter = rt_mutex_top_waiter(lock); + + /* [13] Drop locks */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&lock->wait_lock); + + /* If owner is not blocked, end of chain. */ + if (!next_lock) + goto out_put_task; + goto again; + } - /* Requeue the waiter */ + /* + * Store the current top waiter before doing the requeue + * operation on @lock. We need it for the boost/deboost + * decision below. + */ + prerequeue_top_waiter = rt_mutex_top_waiter(lock); + + /* [7] Requeue the waiter in the lock waiter list. */ rt_mutex_dequeue(lock, waiter); waiter->prio = task->prio; rt_mutex_enqueue(lock, waiter); - /* Release the task */ + /* [8] Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* + * [9] check_exit_conditions_3 protected by lock->wait_lock. + * + * We must abort the chain walk if there is no lock owner even + * in the dead lock detection case, as we have nothing to + * follow here. This is the end of the chain we are walking. + */ if (!rt_mutex_owner(lock)) { /* - * If the requeue above changed the top waiter, then we need - * to wake the new top waiter up to try to get the lock. + * If the requeue [7] above changed the top waiter, + * then we need to wake the new top waiter up to try + * to get the lock. */ - - if (top_waiter != rt_mutex_top_waiter(lock)) + if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); raw_spin_unlock(&lock->wait_lock); - goto out_put_task; + return 0; } - put_task_struct(task); - /* Grab the next task */ + /* [10] Grab the next task, i.e. 
the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); raw_spin_lock_irqsave(&task->pi_lock, flags); + /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - rt_mutex_dequeue_pi(task, top_waiter); + /* + * The waiter became the new top (highest priority) + * waiter on the lock. Replace the previous top waiter + * in the owner tasks pi waiters list with this waiter + * and adjust the priority of the owner. + */ + rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); - } else if (top_waiter == waiter) { - /* Deboost the owner */ + } else if (prerequeue_top_waiter == waiter) { + /* + * The waiter was the top waiter on the lock, but is + * no longer the top prority waiter. Replace waiter in + * the owner tasks pi waiters list with the new top + * (highest priority) waiter and adjust the priority + * of the owner. + * The new top waiter is stored in @waiter so that + * @waiter == @top_waiter evaluates to true below and + * we continue to deboost the rest of the chain. + */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); + } else { + /* + * Nothing changed. No need to do any priority + * adjustment. + */ } /* + * [12] check_exit_conditions_4() protected by task->pi_lock + * and lock->wait_lock. The actual decisions are made after we + * dropped the locks. + * * Check whether the task which owns the current lock is pi * blocked itself. If yes we store a pointer to the lock for * the lock chain change detection above. After we dropped * task->pi_lock next_lock cannot be dereferenced anymore. */ next_lock = task_blocked_on_lock(task); + /* + * Store the top waiter of @lock for the end of chain walk + * decision below. 
+ */ + top_waiter = rt_mutex_top_waiter(lock); + /* [13] Drop the locks */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); /* + * Make the actual exit decisions [12], based on the stored + * values. + * * We reached the end of the lock chain. Stop right here. No * point to go back just to figure that out. */ if (!next_lock) goto out_put_task; + /* + * If the current waiter is not the top waiter on the lock, + * then we can stop the chain walk here if we are not in full + * deadlock detection mode. + */ if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * * Must be called with lock->wait_lock held. * - * @lock: the lock to be acquired. - * @task: the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + * @lock: The lock to be acquired. + * @task: The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait list if the + * callsite called task_blocked_on_lock(), otherwise NULL */ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter) { + unsigned long flags; + /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * Before testing whether we can acquire @lock, we set the + * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all + * other tasks which try to modify @lock into the slow path + * and they serialize on @lock->wait_lock. 
+ * + * The RT_MUTEX_HAS_WAITERS bit can have a transitional state + * as explained at the top of this file if and only if: * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. + * - There is a lock owner. The caller must fixup the + * transient state if it does a trylock or leaves the lock + * function due to a signal or timeout. * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. + * - @task acquires the lock and there are no other + * waiters. This is undone in rt_mutex_set_owner(@task) at + * the end of this function. */ mark_rt_mutex_waiters(lock); + /* + * If @lock has an owner, give up. + */ if (rt_mutex_owner(lock)) return 0; /* - * It will get the lock because of one of these conditions: - * 1) there is no waiter - * 2) higher priority than waiters - * 3) it is top waiter + * If @waiter != NULL, @task has already enqueued the waiter + * into @lock waiter list. If @waiter == NULL then this is a + * trylock attempt. */ - if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } - } - - if (waiter || rt_mutex_has_waiters(lock)) { - unsigned long flags; - struct rt_mutex_waiter *top; - - raw_spin_lock_irqsave(&task->pi_lock, flags); + if (waiter) { + /* + * If waiter is not the highest priority waiter of + * @lock, give up. + */ + if (waiter != rt_mutex_top_waiter(lock)) + return 0; - /* remove the queued waiter. */ - if (waiter) { - rt_mutex_dequeue(lock, waiter); - task->pi_blocked_on = NULL; - } + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters list. 
+ */ + rt_mutex_dequeue(lock, waiter); + } else { /* - * We have to enqueue the top waiter(if it exists) into - * task->pi_waiters list. + * If the lock has waiters already we check whether @task is + * eligible to take over the lock. + * + * If there are no other waiters, @task can acquire + * the lock. @task->pi_blocked_on is NULL, so it does + * not need to be dequeued. */ if (rt_mutex_has_waiters(lock)) { - top = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(task, top); + /* + * If @task->prio is greater than or equal to + * the top waiter priority (kernel view), + * @task lost. + */ + if (task->prio >= rt_mutex_top_waiter(lock)->prio) + return 0; + + /* + * The current top waiter stays enqueued. We + * don't have to change anything in the lock + * waiters order. + */ + } else { + /* + * No waiters. Take the lock without the + * pi_lock dance.@task->pi_blocked_on is NULL + * and we have no waiters to enqueue in @task + * pi waiters list. + */ + goto takeit; } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); } + /* + * Clear @task->pi_blocked_on. Requires protection by + * @task->pi_lock. Redundant operation for the @waiter == NULL + * case, but conditionals are more expensive than a redundant + * store. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + task->pi_blocked_on = NULL; + /* + * Finish the lock acquisition. @task is the new owner. If + * other waiters exist we have to insert the highest priority + * waiter into @task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +takeit: /* We got the lock. */ debug_rt_mutex_lock(lock); + /* + * This either preserves the RT_MUTEX_HAS_WAITERS bit if there + * are still waiters or clears it. 
+ */ rt_mutex_set_owner(lock, task); rt_mutex_deadlock_account_lock(lock, task); @@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; } @@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock) static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - int first = (waiter == rt_mutex_top_waiter(lock)); + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock = NULL; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); @@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock, current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (!owner) + /* + * Only update priority if the waiter was the highest priority + * waiter of the lock and there is an owner to update. 
+ */ + if (!owner || !is_top_waiter) return; - if (first) { - - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); - rt_mutex_dequeue_pi(owner, waiter); + rt_mutex_dequeue_pi(owner, waiter); - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; + if (rt_mutex_has_waiters(lock)) + rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - next = rt_mutex_top_waiter(lock); - rt_mutex_enqueue_pi(owner, next); - } - __rt_mutex_adjust_prio(owner); + __rt_mutex_adjust_prio(owner); - /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Don't walk the chain, if the owner task is not blocked + * itself. + */ if (!next_lock) return; @@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, + next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); } /** @@ -873,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); } + __set_current_state(TASK_RUNNING); return ret; } @@ -902,7 +1160,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock) + enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; int ret = 0; @@ -928,16 +1186,17 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) { - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + __set_current_state(TASK_RUNNING); + if (rt_mutex_has_waiters(lock)) + remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, chwalk, &waiter); } /* @@ -960,22 +1219,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* * Slow path try-lock function: */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { - int ret = 0; + int ret; + + /* + * If the lock already has an owner we fail to get the lock. + * This can be done without taking the @lock->wait_lock as + * it is only being read, and this is a trylock anyway. + */ + if (rt_mutex_owner(lock)) + return 0; + /* + * The mutex has currently no owner. Lock the wait lock and + * try to acquire the lock. 
+ */ raw_spin_lock(&lock->wait_lock); - if (likely(rt_mutex_owner(lock) != current)) { + ret = try_to_take_rt_mutex(lock, current, NULL); - ret = try_to_take_rt_mutex(lock, current, NULL); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - } + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock(&lock->wait_lock); @@ -1053,30 +1321,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock) */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, NULL, detect_deadlock); + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - int detect_deadlock)) + enum rtmutex_chainwalk chwalk)) { - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (chwalk == RT_MUTEX_MIN_CHAINWALK && + likely(rt_mutex_cmpxchg(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else - return slowfn(lock, state, timeout, detect_deadlock); + return slowfn(lock, state, timeout, chwalk); } static inline int @@ -1109,54 +1378,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); + rt_mutex_fastlock(lock, 
TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off + * @lock: the rt_mutex to be locked * * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) + * 0 on success + * -EINTR when interrupted by a signal */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/* + * Futex variant with full deadlock detection. + */ +int rt_mutex_timed_futex_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + RT_MUTEX_FULL_CHAINWALK, + rt_mutex_slowlock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided * by the caller * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off * * Returns: - * 0 on success - * -EINTR when interrupted by a signal + * 0 on success + * -EINTR when interrupted by a signal * -ETIMEDOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) */ int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { might_sleep(); return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); + 
RT_MUTEX_MIN_CHAINWALK, + rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1262,7 +1538,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, * @lock: the rt_mutex to take * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Returns: * 0 - task blocked on lock @@ -1273,7 +1548,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, int detect_deadlock) + struct task_struct *task) { int ret; @@ -1285,7 +1560,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, } /* We enforce deadlock detection for futexes */ - ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); if (ret && !rt_mutex_owner(lock)) { /* @@ -1331,22 +1607,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) * rt_mutex_finish_proxy_lock() - Complete lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have - * been started. + * been started. * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: perform deadlock detection (1) or not (0) * * Complete the lock acquisition started our behalf by another thread. 
* * Returns: * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * <0 - error, one of -EINTR, -ETIMEDOUT * * Special API call for PI-futex requeue support */ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret; @@ -1354,10 +1628,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) remove_waiter(lock, waiter); diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index f6a1f3c133b1..c4060584c407 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -22,10 +22,15 @@ #define debug_rt_mutex_init(m, n) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) { WARN(1, "rtmutex deadlock detected\n"); } + +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, + enum rtmutex_chainwalk walk) +{ + return walk == RT_MUTEX_FULL_CHAINWALK; +} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7431a9c86f35..855212501407 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) } /* + * Constants for rt mutex functions which have a selectable deadlock + * detection. + * + * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are + * no further PI adjustments to be made. + * + * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full + * walk of the lock chain. 
+ */ +enum rtmutex_chainwalk { + RT_MUTEX_MIN_CHAINWALK, + RT_MUTEX_FULL_CHAINWALK, +}; + +/* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock); + struct task_struct *task); extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock); + struct rt_mutex_waiter *waiter); +extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) list_del(&waiter->list); tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). 
+ */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); @@ -154,7 +161,7 @@ void __sched __down_read(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); } - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); out: ; } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a2391ac135c8..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -14,8 +14,9 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/sched/rt.h> +#include <linux/osq_lock.h> -#include "mcs_spinlock.h" +#include "rwsem.h" /* * Guide to the rw_semaphore's count field for common values. @@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). + */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); @@ -242,23 +250,26 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) schedule(); } - tsk->state = TASK_RUNNING; - + __set_task_state(tsk, TASK_RUNNING); return sem; } +EXPORT_SYMBOL(rwsem_down_read_failed); static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* try acquiring the write lock */ - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); - return true; - } + /* + * Try acquiring the write lock. Check count first in order + * to reduce unnecessary expensive cmpxchg() operations. 
+ */ + if (count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + rwsem_set_owner(sem); + return true; } + return false; } @@ -268,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = ACCESS_ONCE(sem->count); + long old, count = READ_ONCE(sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) + if (old == count) { + rwsem_set_owner(sem); return true; + } count = old; } @@ -285,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; - bool on_cpu = false; + bool ret = true; if (need_resched()) return false; rcu_read_lock(); - owner = ACCESS_ONCE(sem->owner); - if (owner) - on_cpu = owner->on_cpu; - rcu_read_unlock(); - - /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath, then there is a possibility reader(s) may have the lock. - * To be safe, avoid spinning in these situations. - */ - return on_cpu; -} - -static inline bool owner_running(struct rw_semaphore *sem, - struct task_struct *owner) -{ - if (sem->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * sem->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. 
- */ - barrier(); + owner = READ_ONCE(sem->owner); + if (!owner) { + long count = READ_ONCE(sem->count); + /* + * If sem->owner is not set, yet we have just recently entered the + * slowpath with the lock being active, then there is a possibility + * reader(s) may have the lock. To be safe, bail spinning in these + * situations. + */ + if (count & RWSEM_ACTIVE_MASK) + ret = false; + goto done; + } - return owner->on_cpu; + ret = owner->on_cpu; +done: + rcu_read_unlock(); + return ret; } static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { + long count; + rcu_read_lock(); - while (owner_running(sem, owner)) { - if (need_resched()) - break; + while (sem->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); - arch_mutex_cpu_relax(); + /* abort spinning when need_resched or owner is not running */ + if (!owner->on_cpu || need_resched()) { + rcu_read_unlock(); + return false; + } + + cpu_relax_lowlatency(); } rcu_read_unlock(); + if (READ_ONCE(sem->owner)) + return true; /* new owner, continue spinning */ + /* - * We break out the loop above on need_resched() or when the - * owner changed, which is a sign for heavy contention. Return - * success only when sem->owner is NULL. + * When the owner is not set, the lock could be free or + * held by readers. Check the counter to verify the + * state. 
*/ - return sem->owner == NULL; + count = READ_ONCE(sem->count); + return (count == 0 || count == RWSEM_WAITING_BIAS); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -356,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) goto done; while (true) { - owner = ACCESS_ONCE(sem->owner); + owner = READ_ONCE(sem->owner); if (owner && !rwsem_spin_on_owner(sem, owner)) break; @@ -381,7 +401,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - arch_mutex_cpu_relax(); + cpu_relax_lowlatency(); } osq_unlock(&sem->osq); done: @@ -430,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = ACCESS_ONCE(sem->count); + count = READ_ONCE(sem->count); /* * If there were already threads queued before us and there are @@ -465,6 +485,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_down_write_failed); /* * handle waking up a waiter on the semaphore @@ -485,6 +506,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_wake); /* * downgrade a write lock into a read lock @@ -506,8 +528,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) return sem; } - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -9,29 +9,9 @@ #include <linux/sched.h> #include <linux/export.h> #include <linux/rwsem.h> - #include <linux/atomic.h> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - sem->owner 
= current; -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - sem->owner = NULL; -} - -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} -#endif +#include "rwsem.h" /* * lock for reading diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h @@ -0,0 +1,20 @@ +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..b8120abe594b 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -36,7 +36,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem); /** @@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock); /** * down_timeout - acquire the semaphore within a specified time * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing + * @timeout: how long to wait before failing * * Attempts to acquire the semaphore. If no more tasks are allowed to * acquire the semaphore, calling this function will put the task to sleep. * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. 
*/ -int down_timeout(struct semaphore *sem, long jiffies) +int down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies) if (likely(sem->count > 0)) sem->count--; else - result = __down_timeout(sem, jiffies); + result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem) return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) { - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); + return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } static noinline void __sched __up(struct semaphore *sem) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { diff --git a/kernel/module.c b/kernel/module.c index 81e727cf6df9..42a1d2afb217 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,7 +42,6 @@ #include <linux/vermagic.h> #include <linux/notifier.h> #include <linux/sched.h> -#include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> @@ -60,7 +59,6 @@ #include 
<linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> -#include <linux/fips.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -99,7 +97,7 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ + * (delete and add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -136,7 +134,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; @@ -159,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; -int register_module_notifier(struct notifier_block * nb) +int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); -int unregister_module_notifier(struct notifier_block * nb) +int unregister_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&module_notify_list, nb); } @@ -389,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms, pr_warn("Symbol %s is marked as UNUSED, however this module is " "using it.\n", fsa->name); pr_warn("This symbol will go away in the future.\n"); - pr_warn("Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " + pr_warn("Please evaluate if this is the right api to use and " + "if it really is, submit a report to the linux kernel " + "mailing list together with submitting your code for " "inclusion.\n"); } #endif @@ -629,18 +627,23 @@ static 
char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); +/* MODULE_REF_BASE is the base reference count by kmodule loader. */ +#define MODULE_REF_BASE 1 + /* Init the unload section of the module. */ static int module_unload_init(struct module *mod) { - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) - return -ENOMEM; + /* + * Initialize reference counter to MODULE_REF_BASE. + * refcnt == 0 means module is going. + */ + atomic_set(&mod->refcnt, MODULE_REF_BASE); INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - raw_cpu_write(mod->refptr->incs, 1); + atomic_inc(&mod->refcnt); return 0; } @@ -722,8 +725,6 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); - - free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -741,60 +742,48 @@ static inline int try_force_unload(unsigned int flags) } #endif /* CONFIG_MODULE_FORCE_UNLOAD */ -struct stopref +/* Try to release refcount of module, 0 means success. */ +static int try_release_module_ref(struct module *mod) { - struct module *mod; - int flags; - int *forced; -}; + int ret; -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; + /* Try to decrement refcnt which we set at loading */ + ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); + BUG_ON(ret < 0); + if (ret) + /* Someone can put this right now, recover with checking */ + ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); + + return ret; +} +static int try_stop_module(struct module *mod, int flags, int *forced) +{ /* If it's not unused, quit unless we're forcing. 
*/ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) + if (try_release_module_ref(mod) != 0) { + *forced = try_force_unload(flags); + if (!(*forced)) return -EWOULDBLOCK; } /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - struct stopref sref = { mod, flags, forced }; + mod->state = MODULE_STATE_GOING; - return stop_machine(__try_stop_module, &sref, NULL); + return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - unsigned long incs = 0, decs = 0; - int cpu; - - for_each_possible_cpu(cpu) - decs += per_cpu_ptr(mod->refptr, cpu)->decs; - /* - * ensure the incs are added up after the decs. - * module_put ensures incs are visible before decs with smp_wmb. - * - * This 2-count scheme avoids the situation where the refcount - * for CPU0 is read, then CPU0 increments the module refcount, - * then CPU1 drops that refcount, then the refcount for CPU1 is - * read. We would record a decrement but not its corresponding - * increment so we would see a low count (disaster). - * - * Rare situation? But module_refcount can be preempted, and we - * might be tallying up 4096+ CPUs. So it is not impossible. 
- */ - smp_rmb(); - for_each_possible_cpu(cpu) - incs += per_cpu_ptr(mod->refptr, cpu)->incs; - return incs - decs; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -876,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. */ + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; seq_printf(m, "%s,", use->source->name); @@ -887,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) if (mod->init != NULL && mod->exit == NULL) { printed_something = 1; - seq_printf(m, "[permanent],"); + seq_puts(m, "[permanent],"); } if (!printed_something) - seq_printf(m, "-"); + seq_puts(m, "-"); } void __symbol_put(const char *symbol) @@ -926,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = @@ -936,7 +927,7 @@ void __module_get(struct module *module) { if (module) { preempt_disable(); - __this_cpu_inc(module->refptr->incs); + atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); preempt_enable(); } @@ -949,11 +940,11 @@ bool try_module_get(struct module *module) if (module) { preempt_disable(); - - if (likely(module_is_live(module))) { - __this_cpu_inc(module->refptr->incs); + /* Note: here, we can fail to get a reference */ + if (likely(module_is_live(module) && + 
atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); - } else + else ret = false; preempt_enable(); @@ -964,11 +955,12 @@ EXPORT_SYMBOL(try_module_get); void module_put(struct module *module) { + int ret; + if (module) { preempt_disable(); - smp_wmb(); /* see comment in module_refcount */ - __this_cpu_inc(module->refptr->decs); - + ret = atomic_dec_if_positive(&module->refcnt); + WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); preempt_enable(); } @@ -979,7 +971,7 @@ EXPORT_SYMBOL(module_put); static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. */ - seq_printf(m, " - -"); + seq_puts(m, " - -"); } static inline void module_unload_free(struct module *mod) @@ -1132,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc, static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1166,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs, return 0; bad_version: - printk("%s: disagrees about version of symbol %s\n", + pr_warn("%s: disagrees about version of symbol %s\n", mod->name, symname); return 0; } @@ -1201,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1233,6 +1225,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, const unsigned long *crc; int err; + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. 
+ */ + sched_annotate_sleep(); mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); @@ -1289,15 +1287,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; } -struct module_sect_attr -{ +struct module_sect_attr { struct module_attribute mattr; char *name; unsigned long address; }; -struct module_sect_attrs -{ +struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; struct module_sect_attr attrs[0]; @@ -1551,7 +1547,8 @@ static int module_add_modinfo_attrs(struct module *mod) (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); ++temp_attr; } } @@ -1567,7 +1564,7 @@ static void module_remove_modinfo_attrs(struct module *mod) /* pick a field to test for end of list */ if (!attr->attr.name) break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); if (attr->free) attr->free(mod); } @@ -1698,18 +1695,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - module_bug_cleanup(mod); - return 0; -} - #ifdef CONFIG_DEBUG_SET_MODULE_RONX /* * LKM RO/NX protection: protect module's text/ro-data @@ -1825,7 +1810,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1834,6 +1819,10 @@ void __weak 
module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1843,7 +1832,9 @@ static void free_module(struct module *mod) /* We leave it in list to prevent duplicate loads, but make sure * that noone uses it while it's being deconstructed. */ + mutex_lock(&module_mutex); mod->state = MODULE_STATE_UNFORMED; + mutex_unlock(&module_mutex); /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1859,21 +1850,27 @@ static void free_module(struct module *mod) /* Now we can delete it from the lists */ mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + /* Remove this module from bug list, this uses list_del_rcu */ + module_bug_cleanup(mod); + /* Wait for RCU synchronizing before releasing mod->list and buglist. */ + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes: */ + /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -1954,7 +1951,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) /* We compiled with -fno-common. These are not supposed to happen. 
*/ pr_debug("Common symbol: %s\n", name); - printk("%s: please compile with -fno-common\n", + pr_warn("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; break; @@ -2258,7 +2255,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum) { const Elf_Shdr *sec; @@ -2314,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_size += strtab_size; + mod->core_size = debug_align(mod->core_size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; + mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info) } /* Not having a signature is only an error if we're strict. */ - if (err < 0 && fips_enabled) - panic("Module verification failed with error %d in FIPS mode\n", - err); if (err == -ENOKEY && !sig_enforce) err = 0; @@ -2483,6 +2479,23 @@ static int elf_header_check(struct load_info *info) return 0; } +#define COPY_CHUNK_SIZE (16*PAGE_SIZE) + +static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) +{ + do { + unsigned long n = min(len, COPY_CHUNK_SIZE); + + if (copy_from_user(dst, usrc, n) != 0) + return -EFAULT; + cond_resched(); + dst += n; + usrc += n; + len -= n; + } while (len); + return 0; +} + /* Sets info->hdr and info->len. 
*/ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2498,11 +2511,12 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = vmalloc(info->len); + info->hdr = __vmalloc(info->len, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; - if (copy_from_user(info->hdr, umod, info->len) != 0) { + if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { vfree(info->hdr); return -EFAULT; } @@ -2737,7 +2751,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) * This shouldn't happen with same compiler and binutils * building all parts of the module. */ - printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + pr_warn("%s: has both .ctors and .init_array.\n", mod->name); return -EINVAL; } @@ -2757,6 +2771,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); + mod->trace_enums = section_objs(info, "_ftrace_enum_map", + sizeof(*mod->trace_enums), + &mod->num_trace_enums); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", @@ -2811,7 +2828,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2956,8 +2973,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak 
module_finalize(const Elf_Ehdr *hdr, @@ -2989,6 +3007,12 @@ static bool finished_loading(const char *name) struct module *mod; bool ret; + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. + */ + sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); ret = !mod || mod->state == MODULE_STATE_LIVE @@ -3009,10 +3033,36 @@ static void do_mod_ctors(struct module *mod) #endif } -/* This is where the real work happens */ -static int do_init_module(struct module *mod) +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct rcu_head rcu; + void *module_init; +}; + +static void do_free_init(struct rcu_head *head) +{ + struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); + module_memfree(m->module_init); + kfree(m); +} + +/* + * This is where the real work happens. + * + * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb + * helper command 'lx-symbols'. + */ +static noinline int do_init_module(struct module *mod) { int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->module_init; /* * We want to find out whether @mod uses async during init. Clear @@ -3025,16 +3075,7 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. 
*/ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; + goto fail_free_freeinit; } if (ret > 0) { pr_warn("%s: '%s'->init suspiciously returned %d, it should " @@ -3079,15 +3120,35 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; mod->init_text_size = 0; + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, + * we call synchronize_rcu/synchronize_sched, but we don't want + * to slow down the success path, so use actual RCU here. + */ + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. 
*/ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; } static int may_init_module(void) @@ -3178,7 +3239,7 @@ out: static int unknown_module_param_cb(char *param, char *val, const char *modname) { - /* Check for magic 'dyndbg' arg */ + /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); @@ -3218,7 +3279,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->sig_ok = info->sig_ok; if (!mod->sig_ok) { pr_notice_once("%s: module verification failed: signature " - "and/or required key missing - tainting " + "and/or required key missing - tainting " "kernel\n", mod->name); add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } @@ -3308,6 +3369,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); @@ -3323,8 +3389,13 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); wake_up_all(&module_wq); + /* Wait for RCU synchronizing before releasing mod->list. 
*/ + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->module_core, mod->core_size); + module_deallocate(mod, info); free_copy: free_copy(info); @@ -3385,7 +3456,9 @@ static inline int within(unsigned long addr, void *start, unsigned long size) */ static inline int is_arm_mapping_symbol(const char *str) { - return str[0] == '$' && strchr("atd", str[1]) + if (str[0] == '.' && str[1] == 'L') + return true; + return str[0] == '$' && strchr("axtd", str[1]) && (str[2] == '\0' || str[2] == '.'); } @@ -3448,8 +3521,7 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -3473,8 +3545,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -3499,8 +3570,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -3657,8 +3727,8 @@ static int m_show(struct seq_file *m, void *p) /* Informative for users. */ seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? 
"Loading" : "Live"); /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%pK", mod->module_core); @@ -3667,7 +3737,7 @@ static int m_show(struct seq_file *m, void *p) if (mod->taints) seq_printf(m, " %s", module_flags(mod, buf)); - seq_printf(m, "\n"); + seq_puts(m, "\n"); return 0; } @@ -3764,8 +3834,7 @@ struct module *__module_address(unsigned long addr) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) + if (within_module(addr, mod)) return mod; } return NULL; diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..49746c81ad8d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. 
- * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) @@ -227,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_ns *ei; struct file *file; + struct ns_common *ns; int err; file = proc_ns_fget(fd); @@ -239,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = get_proc_ns(file_inode(file)); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) + ns = get_proc_ns(file_inode(file)); + if (nstype && (ns->ops->type != nstype)) goto out; new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); @@ -250,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } - err = ops->install(new_nsproxy, ei->ns); + err = ns->ops->install(new_nsproxy, ns); if (err) { free_nsproxy(new_nsproxy); goto out; diff --git a/kernel/padata.c b/kernel/padata.c index 161402f0b517..b38bea9c466a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst, else cpumask = pinst->cpumask.pcpu; - len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), - nr_cpu_ids); - if (PAGE_SIZE - len < 2) - len = -EINVAL; - else - len += sprintf(buf + len, "\n"); - + len = snprintf(buf, PAGE_SIZE, "%*pb\n", + nr_cpu_ids, cpumask_bits(cpumask)); mutex_unlock(&pinst->lock); - return len; + return len < PAGE_SIZE ? 
len : -EINVAL; } static ssize_t store_cpumask(struct padata_instance *pinst, diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); static bool crash_kexec_post_notifiers; +int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -224,6 +225,8 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, + { TAINT_LIVEPATCH, 'K', ' ' }, }; /** @@ -243,6 +246,8 @@ static const struct tnt tnts[] = { * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. + * 'L' - A soft lockup has previously occurred. + * 'K' - Kernel has been live patched. * * The string is overwritten by the next call to print_tainted(). */ @@ -426,6 +431,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, if (args) vprintk(args->fmt, args->args); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). 
+ */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } + print_modules(); dump_stack(); print_oops_end_marker(); @@ -483,6 +499,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +core_param(panic_on_warn, panic_on_warn, int, 0644); static int __init setup_crash_kexec_post_notifiers(char *s) { diff --git a/kernel/params.c b/kernel/params.c index 1e52ca233fd9..a22d6a759b1a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -19,6 +19,7 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> @@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } +static void param_check_unsafe(const struct kernel_param *kp) +{ + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { + pr_warn("Setting dangerous option %s - tainting kernel\n", + kp->name); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } +} + static int parse_one(char *param, char *val, const char *doing, @@ -104,11 +114,12 @@ static int parse_one(char *param, return 0; /* No one handled NULL, so do it here. 
*/ if (!val && - !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) + !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); mutex_lock(&param_lock); + param_check_unsafe(&params[i]); err = params[i].ops->set(val, &params[i]); mutex_unlock(&param_lock); return err; @@ -162,9 +173,9 @@ static char *next_arg(char *args, char **param, char **val) if (args[i-1] == '"') args[i-1] = '\0'; } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; @@ -256,6 +267,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { @@ -317,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -368,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; @@ -502,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string); #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) -extern struct kernel_param __start___param[], __stop___param[]; - struct param_attribute { struct module_attribute mattr; @@ -551,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; mutex_lock(&param_lock); + param_check_unsafe(attribute->param); 
err = attribute->param->ops->set(buf, attribute->param); mutex_unlock(&param_lock); if (!err) @@ -592,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { - struct module_param_attrs *new; - struct attribute **attrs; - int err, num; + struct module_param_attrs *new_mp; + struct attribute **new_attrs; + unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { - num = 0; - attrs = NULL; - } else { - num = mk->mp->num; - attrs = mk->mp->grp.attrs; + /* First allocation. */ + mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); + if (!mk->mp) + return -ENOMEM; + mk->mp->grp.name = "parameters"; + /* NULL-terminated attribute array. */ + mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), + GFP_KERNEL); + /* Caller will cleanup via free_module_param_attrs */ + if (!mk->mp->grp.attrs) + return -ENOMEM; } - /* Enlarge. */ - new = krealloc(mk->mp, - sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), - GFP_KERNEL); - if (!new) { - kfree(attrs); - err = -ENOMEM; - goto fail; - } - /* Despite looking like the typical realloc() bug, this is safe. - * We *want* the old 'attrs' to be freed either way, and we'll store - * the new one in the success case. */ - attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); - if (!attrs) { - err = -ENOMEM; - goto fail_free_new; - } + /* Enlarge allocations. */ + new_mp = krealloc(mk->mp, + sizeof(*mk->mp) + + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + GFP_KERNEL); + if (!new_mp) + return -ENOMEM; + mk->mp = new_mp; - /* Sysfs wants everything zeroed. 
*/ - memset(new, 0, sizeof(*new)); - memset(&new->attrs[num], 0, sizeof(new->attrs[num])); - memset(&attrs[num], 0, sizeof(attrs[num])); - new->grp.name = "parameters"; - new->grp.attrs = attrs; + /* Extra pointer for NULL terminator */ + new_attrs = krealloc(mk->mp->grp.attrs, + sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), + GFP_KERNEL); + if (!new_attrs) + return -ENOMEM; + mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - sysfs_attr_init(&new->attrs[num].mattr.attr); - new->attrs[num].param = kp; - new->attrs[num].mattr.show = param_attr_show; - new->attrs[num].mattr.store = param_attr_store; - new->attrs[num].mattr.attr.name = (char *)name; - new->attrs[num].mattr.attr.mode = kp->perm; - new->num = num+1; + memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); + sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); + mk->mp->attrs[mk->mp->num].param = kp; + mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + /* Do not allow runtime DAC changes to make param writable. 
*/ + if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + else + mk->mp->attrs[mk->mp->num].mattr.store = NULL; + mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; + mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ - for (num = 0; num < new->num; num++) - new->grp.attrs[num] = &new->attrs[num].mattr.attr; - new->grp.attrs[num] = NULL; - - mk->mp = new; + for (i = 0; i < mk->mp->num; i++) + mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; + mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; - -fail_free_new: - kfree(new); -fail: - mk->mp = NULL; - return err; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { - kfree(mk->mp->grp.attrs); + if (mk->mp) + kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } @@ -684,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod, if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); - if (err) + if (err) { + free_module_param_attrs(&mod->mkobj); return err; + } params = true; } @@ -762,7 +771,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name) } static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; @@ -797,7 +806,7 @@ static void __init kernel_add_sysfs_param(const char *name, */ static void __init param_sysfs_builtin(void) { - struct kernel_param *kp; + const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a26698144..4fd07d5b7baf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .proc_inum = PROC_PID_INIT_INO, + .ns.inum = 
PROC_PID_INIT_INO, +#ifdef CONFIG_PID_NS + .ns.ops = &pidns_operations, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -179,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) spin_unlock_irq(&pidmap_lock); kfree(page); if (unlikely(!map->page)) - break; + return -ENOMEM; } if (likely(atomic_read(&map->nr_free))) { for ( ; ; ) { @@ -207,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) } pid = mk_pid(pid_ns, map, offset); } - return -1; + return -EAGAIN; } int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) @@ -298,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns) int i, nr; struct pid_namespace *tmp; struct upid *upid; + int retval = -ENOMEM; pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) - goto out; + return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (nr < 0) + if (IS_ERR_VALUE(nr)) { + retval = nr; goto out_free; + } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; @@ -336,18 +342,18 @@ struct pid *alloc_pid(struct pid_namespace *ns) } spin_unlock_irq(&pidmap_lock); -out: return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + put_pid_ns(ns); + out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; - goto out; + return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761b..a65ba137fd15 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) goto out_free_map; + ns->ns.ops = &pidns_operations; kref_init(&ns->kref); ns->level = level; @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; - 
proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); - /* Ignore SIGCHLD causing any terminated children to autoreap */ + /* + * Ignore SIGCHLD causing any terminated children to autoreap. + * This speeds up the namespace shutdown, plus see the comment + * below. + */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); @@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); - /* Firstly reap the EXIT_ZOMBIE children we may have. */ + /* + * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. + * sys_wait4() will also block until our children traced from the + * parent namespace are detached and become EXIT_DEAD. + */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see free_pid(). + * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * really care, we could reparent them to the global init. We could + * exit and reap ->child_reaper even if it is not the last thread in + * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * pid_ns can not go away until proc_kill_sb() drops the reference. + * + * But this ns can also have other tasks injected by setns()+fork(). + * Again, ignoring the user visible semantics we do not really need + * to wait until they are all reaped, but they can be reparented to + * us and thus we need to ensure that pid->child_reaper stays valid + * until they all go away. See free_pid()->wake_up_process(). 
+ * + * We rely on ignored SIGCHLD, an injected zombie must be autoreaped + * if reparented. */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static void *pidns_get(struct task_struct *task) +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + +static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task) get_pid_ns(ns); rcu_read_unlock(); - return ns; + return ns ? &ns->ns : NULL; } -static void pidns_put(void *ns) +static void pidns_put(struct ns_common *ns) { - put_pid_ns(ns); + put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, void *ns) +static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = ns; + struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int pidns_inum(void *ns) -{ - struct pid_namespace *pid_ns = ns; - return pid_ns->proc_inum; -} - const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, - .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d780facd..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS + select PM config PM_SLEEP_SMP def_bool y @@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y 
-config PM_RUNTIME - bool "Run-time PM core functionality" - depends on !IA64_HP_SIM +config PM + bool "Device power management core functionality" ---help--- Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated + (low power) states, for example after a specified period of inactivity + (autosuspended), and woken up in response to a hardware-generated wake-up event or a driver's request. Hardware support is generally required for this functionality to work and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and + responsible for the actual handling of device suspend requests and wake-up events. -config PM - def_bool y - depends on PM_SLEEP || PM_RUNTIME - config PM_DEBUG bool "Power Management Debug Support" depends on PM @@ -253,11 +249,9 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config ARCH_HAS_OPP - bool - config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. 
This @@ -301,10 +295,9 @@ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS -config PM_GENERIC_DOMAINS_RUNTIME +config PM_GENERIC_DOMAINS_OF def_bool y - depends on PM_RUNTIME && PM_GENERIC_DOMAINS + depends on PM_GENERIC_DOMAINS && OF config CPU_PM bool - depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fcc2611d3f14..2329daae5255 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/ktime.h> #include <trace/events/power.h> #include "power.h" @@ -232,20 +233,17 @@ static void platform_recover(int platform_mode) * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. */ -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) +void swsusp_show_speed(ktime_t start, ktime_t stop, + unsigned nr_pages, char *msg) { + ktime_t diff; u64 elapsed_centisecs64; unsigned int centisecs; unsigned int k; unsigned int kps; - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - /* - * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, - * it is obvious enough for what went wrong. 
- */ - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + diff = ktime_sub(stop, start); + elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ @@ -371,7 +369,6 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -397,7 +394,6 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - ftrace_start(); resume_console(); dpm_complete(msg); @@ -500,15 +496,19 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. 
+ */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); - ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -535,7 +535,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -579,7 +578,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - ftrace_start(); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f330f139..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,7 +11,7 @@ #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/resume-trace.h> +#include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (pm_states[i].state) - s += sprintf(s,"%s ", pm_states[i].label); + if (pm_states[i]) + s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) @@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_MIN; - struct pm_sleep_state *s; + suspend_state_t state; #endif char *p; int len; @@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (s->state && len == strlen(s->label) - && !strncmp(buf, s->label, len)) - return s->state; + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && 
len == strlen(label) && !strncmp(buf, label, len)) + return state; + } #endif return PM_SUSPEND_ON; @@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state].state ? - pm_states[state].label : "error"); + return sprintf(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); @@ -615,7 +616,6 @@ static struct attribute_group attr_group = { .attrs = g, }; -#ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); @@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void) return pm_wq ? 0 : -ENOMEM; } -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif static int __init pm_init(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b5270a..ce9b8328a689 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -174,17 +174,12 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain); struct timeval; /* kernel/power/swsusp.c */ -extern void swsusp_show_speed(struct timeval *, struct timeval *, - unsigned int, char *); +extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND -struct pm_sleep_state { - const char *label; - suspend_state_t state; -}; - /* kernel/power/suspend.c */ -extern struct pm_sleep_state pm_states[]; +extern const char *pm_labels[]; +extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) 
todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = elapsed_msecs64; if (todo) { - printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " + pr_cont("\n"); + pr_err("Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, @@ -93,15 +93,15 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); } @@ -129,17 +129,25 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - printk("Freezing user space processes ... "); + pm_wakeup_clear(); + pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); + pr_cont("done."); } - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; @@ -157,13 +165,14 @@ int freeze_kernel_threads(void) { int error; - printk("Freezing remaining freezable tasks ... "); + pr_info("Freezing remaining freezable tasks ... 
"); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) - printk("done."); + pr_cont("done."); - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -184,17 +193,17 @@ void thaw_processes(void) oom_killer_enable(); - printk("Restarting tasks ... "); + pr_info("Restarting tasks ... "); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -203,7 +212,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -212,17 +221,17 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - printk("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ... 
"); thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); - printk("done.\n"); + pr_cont("done.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -41,6 +41,8 @@ #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -105,11 +107,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +148,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +161,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case 
PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); @@ -157,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } +static inline int pm_qos_get_value(struct pm_qos_constraints *c); +static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) +{ + struct pm_qos_object *qos = (struct pm_qos_object *)s->private; + struct pm_qos_constraints *c; + struct pm_qos_request *req; + char *type; + unsigned long flags; + int tot_reqs = 0; + int active_reqs = 0; + + if (IS_ERR_OR_NULL(qos)) { + pr_err("%s: bad qos param!\n", __func__); + return -EINVAL; + } + c = qos->constraints; + if (IS_ERR_OR_NULL(c)) { + pr_err("%s: Bad constraints on qos?\n", __func__); + return -EINVAL; + } + + /* Lock to ensure we have a snapshot */ + spin_lock_irqsave(&pm_qos_lock, flags); + if (plist_head_empty(&c->list)) { + seq_puts(s, "Empty!\n"); + goto out; + } + + switch (c->type) { + case PM_QOS_MIN: + type = "Minimum"; + break; + case PM_QOS_MAX: + type = "Maximum"; + break; + case PM_QOS_SUM: + type = "Sum"; + break; + default: + type = "Unknown"; + } + + plist_for_each_entry(req, &c->list, node) { + char *state = "Default"; + + if ((req->node).prio != c->default_value) { + active_reqs++; + state = "Active"; + } + tot_reqs++; + seq_printf(s, "%d: %d: %s\n", tot_reqs, + (req->node).prio, state); + } + + seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", + type, pm_qos_get_value(c), active_reqs, tot_reqs); + +out: + spin_unlock_irqrestore(&pm_qos_lock, flags); + return 0; +} + +static int pm_qos_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, pm_qos_dbg_show_requests, + inode->i_private); +} + +static const struct file_operations pm_qos_debug_fops = { + .open = pm_qos_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * 
pm_qos_update_target - manages the constraints list and calls the notifiers * if needed @@ -484,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); /* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos) +static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) { qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + if (d) { + (void)debugfs_create_file(qos->name, S_IRUGO, d, + (void *)qos, &pm_qos_debug_fops); + } + return misc_register(&qos->pm_qos_power_miscdev); } @@ -583,11 +690,16 @@ static int __init pm_qos_power_init(void) { int ret = 0; int i; + struct dentry *d; BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + d = debugfs_create_dir("pm_qos", NULL); + if (IS_ERR_OR_NULL(d)) + d = NULL; + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i]); + ret = register_pm_qos_misc(pm_qos_array[i], d); if (ret < 0) { printk(KERN_ERR "pm_qos_param: %s setup failed\n", pm_qos_array[i]->name); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328aafdc9..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/compiler.h> +#include <linux/ktime.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -248,33 +249,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * information is stored (in the form of a block of bitmap) * It also contains the pfns that correspond to the start and end of * the represented memory area. + * + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. 
There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; }; -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. 
+ */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct bm_block *block; - int bit; + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + int node_bit; }; struct memory_bitmap { - struct list_head blocks; /* list of bitmap blocks */ + struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -284,38 +313,178 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static void memory_bm_position_reset(struct memory_bitmap *bm) +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/* + * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. 
+ */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) { - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; -} + struct rtree_node *node; -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; -/** - * create_bm_block_list - create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} + +/* + * add_rtree_block - Add a new leave node to the radix tree + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) { - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; + + block_nr = zone->blocks; + levels_needed = 0; - while (nr_blocks-- > 0) { - struct bm_block *bb; + /* How many levels do we need for this block nr? 
*/ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) return -ENOMEM; - list_add(&bb->hook, list); + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; + } + + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; } + zone->blocks += 1; + *dst = block; + return 0; } +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/* + * create_zone_bm_rtree - create a radix tree for one zone + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. 
+ */ +static struct mem_zone_bm_rtree * +create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + unsigned long start, unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/* + * free_zone_bm_rtree - Free the memory of the radix tree + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + struct mem_extent { struct list_head hook; unsigned long start; @@ -407,40 +576,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) int error; chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); if (error) return error; 
list_for_each_entry(ext, &mem_extents, hook) { - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; - - bb = list_entry(bm->blocks.prev, struct bm_block, hook); + struct mem_zone_bm_rtree *zone; - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) + zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) { + error = -ENOMEM; goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; } + list_add_tail(&zone->list, &bm->zones); } bm->p_list = ca.chain; @@ -460,51 +611,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct bm_block *bb; + struct mem_zone_bm_rtree *zone; - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); free_list_of_pages(bm->p_list, clear_nosave_free); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); } /** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. + * memory_bm_find_bit - Find the bit for pfn in the memory + * bitmap + * + * Find the bit in the bitmap @bm that corresponds to given pfn. + * The cur.zone, cur.block and cur.node_pfn member of @bm are + * updated. + * It walks the radix tree to find the page which contains the bit for + * pfn and returns the bit position in **addr and *bit_nr. 
*/ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) + void **addr, unsigned int *bit_nr) { - struct bm_block *bb; + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + + if (!zone) + return -EFAULT; + +zone_found: /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. + * We have a zone. Now walk the radix tree to find the leave + * node for our pfn. */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; - if (&bb->hook == &bm->blocks) - return -EFAULT; + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; return 0; } @@ -528,6 +711,7 @@ static 
int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); + return error; } @@ -542,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -561,38 +753,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) return !memory_bm_find_bit(bm, pfn, &addr, &bit); } -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. +/* + * rtree_next_node - Jumps to the next leave node * - * It is required to run memory_bm_position_reset() before the first call to - * this function. + * Sets the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. + * + * Returns true if there is a next node, false otherwise. 
*/ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + touch_softlockup_watchdog(); + return true; + } + /* No more nodes, goto next zone */ + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + + /* No more zones */ + return false; +} + +/** + * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * + * Starting from the last returned position this function searches + * for the next set bit in the memory bitmap and returns its + * number. If no more bit is set BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the + * first call to this function. 
+ */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct bm_block *bb; + unsigned long bits, pfn, pages; int bit; - bb = bm->cur.block; do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + return pfn; + } + } while (rtree_next_node(bm)); - memory_bm_position_reset(bm); return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; } /** @@ -816,12 +1040,17 @@ void free_basic_memory_bitmaps(void) unsigned int snapshot_additional_pages(struct zone *zone) { - unsigned int res; + unsigned int rtree, nodes; + + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); - return 2 * res; + return 2 * rtree; } #ifdef CONFIG_HIGHMEM @@ -1094,23 +1323,39 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - 
swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + if (!forbidden_pages_map || !free_pages_map) + goto out; + + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + +out: nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; @@ -1208,9 +1453,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, /** * free_unnecessary_pages - Release preallocated pages not needed for the image */ -static void free_unnecessary_pages(void) +static unsigned long free_unnecessary_pages(void) { - unsigned long save, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem, free; save = count_data_pages(); if (alloc_normal >= save) { @@ -1231,6 +1476,7 @@ static void free_unnecessary_pages(void) else to_free_normal = 0; } + free = to_free_normal + to_free_highmem; memory_bm_position_reset(©_bm); @@ -1254,6 +1500,8 @@ static void free_unnecessary_pages(void) swsusp_unset_page_free(page); __free_page(page); } + + return free; } /** @@ -1313,11 +1561,11 @@ int hibernate_preallocate_memory(void) struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; unsigned long alloc, save_highmem, pages_highmem, avail_normal; - struct timeval start, stop; + ktime_t start, stop; int 
error; printk(KERN_INFO "PM: Preallocating image memory... "); - do_gettimeofday(&start); + start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); if (error) @@ -1443,12 +1691,12 @@ int hibernate_preallocate_memory(void) * pages in memory, but we have allocated more. Release the excessive * ones now. */ - free_unnecessary_pages(); + pages -= free_unnecessary_pages(); out: - do_gettimeofday(&stop); + stop = ktime_get(); printk(KERN_CONT "done (allocated %lu pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Allocated"); + swsusp_show_speed(start, stop, pages, "Allocated"); return 0; @@ -2046,8 +2294,6 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static inline int get_safe_write_buffer(void) { return 0; } - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed35a4790afe..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -28,25 +28,19 @@ #include <linux/ftrace.h> #include <trace/events/power.h> #include <linux/compiler.h> +#include <linux/moduleparam.h> #include "power.h" -struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { .label = "standby", }, - [PM_SUSPEND_MEM] = { .label = "mem", }, -}; +const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; - -static bool need_suspend_ops(suspend_state_t state) -{ - return state > PM_SUSPEND_FREEZE; -} - static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -static bool suspend_freeze_wake; + +enum freeze_state __read_mostly suspend_freeze_state; +static DEFINE_SPINLOCK(suspend_freeze_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ 
-57,22 +51,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) static void freeze_begin(void) { - suspend_freeze_wake = false; + suspend_freeze_state = FREEZE_STATE_NONE; } static void freeze_enter(void) { - cpuidle_use_deepest_state(true); + spin_lock_irq(&suspend_freeze_lock); + if (pm_wakeup_pending()) + goto out; + + suspend_freeze_state = FREEZE_STATE_ENTER; + spin_unlock_irq(&suspend_freeze_lock); + + get_online_cpus(); cpuidle_resume(); - wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + pr_debug("PM: suspend-to-idle\n"); + /* Make the current CPU wait so it can enter the idle loop too. */ + wait_event(suspend_freeze_wait_head, + suspend_freeze_state == FREEZE_STATE_WAKE); + pr_debug("PM: resume from suspend-to-idle\n"); + cpuidle_pause(); - cpuidle_use_deepest_state(false); + put_online_cpus(); + + spin_lock_irq(&suspend_freeze_lock); + + out: + suspend_freeze_state = FREEZE_STATE_NONE; + spin_unlock_irq(&suspend_freeze_lock); } void freeze_wake(void) { - suspend_freeze_wake = true; - wake_up(&suspend_freeze_wait_head); + unsigned long flags; + + spin_lock_irqsave(&suspend_freeze_lock, flags); + if (suspend_freeze_state > FREEZE_STATE_NONE) { + suspend_freeze_state = FREEZE_STATE_WAKE; + wake_up(&suspend_freeze_wait_head); + } + spin_unlock_irqrestore(&suspend_freeze_lock, flags); } EXPORT_SYMBOL_GPL(freeze_wake); @@ -97,10 +118,7 @@ static bool relative_states; static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - if (relative_states) { - pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; - pm_states[PM_SUSPEND_FREEZE].state = 0; - } + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 
0 : 2]; return 1; } @@ -113,20 +131,20 @@ __setup("relative_sleep_states=", sleep_states_setup); void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; - int j = PM_SUSPEND_MAX - 1; + int j = 0; lock_system_sleep(); suspend_ops = ops; for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) - pm_states[j--].state = i; - else if (!relative_states) - pm_states[j--].state = 0; + if (valid_state(i)) { + pm_states[i] = pm_labels[j++]; + } else if (!relative_states) { + pm_states[i] = NULL; + j++; + } - pm_states[j--].state = PM_SUSPEND_FREEZE; - while (j >= PM_SUSPEND_MIN) - pm_states[j--].state = 0; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; unlock_system_sleep(); } @@ -145,12 +163,91 @@ int suspend_valid_only_mem(suspend_state_t state) } EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? + freeze_ops->prepare() : 0; +} + +static int platform_suspend_prepare_noirq(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 
+ suspend_ops->prepare_late() : 0; +} + +static void platform_resume_noirq(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + suspend_ops->wake(); +} + +static void platform_resume_early(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) + freeze_ops->restore(); +} + +static void platform_resume_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + return freeze_ops->begin(); + else if (suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_resume_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + else if (suspend_ops->end) + suspend_ops->end(); +} + +static void platform_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? 
+ suspend_ops->suspend_again() : false; +} + +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); + printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -168,7 +265,7 @@ static int suspend_prepare(suspend_state_t state) { int error; - if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); @@ -214,23 +311,27 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (need_suspend_ops(state) && suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; - error = dpm_suspend_end(PMSG_SUSPEND); + error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); + printk(KERN_ERR "PM: late suspend of devices failed\n"); goto Platform_finish; } + error = platform_suspend_prepare_late(state); + if (error) + goto Devices_early_resume; - if (need_suspend_ops(state) && suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + goto Platform_early_resume; } + error = platform_suspend_prepare_noirq(state); + if (error) + goto Platform_wake; if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -248,7 +349,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - ftrace_stop(); 
error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -275,18 +375,19 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Enable_cpus: enable_nonboot_cpus(); - ftrace_start(); Platform_wake: - if (need_suspend_ops(state) && suspend_ops->wake) - suspend_ops->wake(); + platform_resume_noirq(state); + dpm_resume_noirq(PMSG_RESUME); - dpm_resume_start(PMSG_RESUME); + Platform_early_resume: + platform_resume_early(state); - Platform_finish: - if (need_suspend_ops(state) && suspend_ops->finish) - suspend_ops->finish(); + Devices_early_resume: + dpm_resume_early(PMSG_RESUME); + Platform_finish: + platform_resume_finish(state); return error; } @@ -299,18 +400,13 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (need_suspend_ops(state) && !suspend_ops) + if (!sleep_state_supported(state)) return -ENOSYS; - if (need_suspend_ops(state) && suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { - error = freeze_ops->begin(); - if (error) - goto Close; - } + error = platform_suspend_begin(state); + if (error) + goto Close; + suspend_console(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); @@ -324,25 +420,22 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup && need_suspend_ops(state) - && suspend_ops->suspend_again && suspend_ops->suspend_again()); + } while (!error && !wakeup && platform_suspend_again(state)); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + trace_suspend_resume(TPS("resume_console"), state, true); resume_console(); - Close: - if (need_suspend_ops(state) && suspend_ops->end) - suspend_ops->end(); - else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + 
trace_suspend_resume(TPS("resume_console"), state, false); + Close: + platform_resume_end(state); return error; Recover_platform: - if (need_suspend_ops(state) && suspend_ops->recover) - suspend_ops->recover(); + platform_recover(state); goto Resume_devices; } @@ -395,7 +488,7 @@ static int enter_state(suspend_state_t state) printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); error = suspend_prepare(state); if (error) goto Unlock; @@ -404,7 +497,7 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Entering %s sleep\n", pm_states[state].label); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097e78ea..084452e34a12 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -22,6 +22,8 @@ #define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; +static u32 test_repeat_count_max = 1; +static u32 test_repeat_count_current; void suspend_test_start(void) { @@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) int status; /* this may fail if the RTC hasn't been initialized */ +repeat: status = rtc_read_time(rtc, &alm.time); if (status < 0) { printk(err_readtime, dev_name(&rtc->dev), status); @@ -92,18 +95,29 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state].label); + printk(info_test, pm_states[state]); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state].label); + 
printk(info_test, pm_states[state]); status = pm_suspend(state); + if (status < 0) + state = PM_SUSPEND_FREEZE; } + if (state == PM_SUSPEND_FREEZE) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) printk(err_suspend, status); + test_repeat_count_current++; + if (test_repeat_count_current < test_repeat_count_max) + goto repeat; + /* Some platforms can't detect that the alarm triggered the * wakeup, or (accordingly) disable it after it afterwards. * It's supposed to give oneshot behavior; cope. @@ -129,24 +143,36 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - suspend_state_t i; + int i; + char *repeat; + char *suspend_type; - /* "=mem" ==> "mem" */ + /* example : "=mem[,N]" ==> "mem[,N]" */ value++; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i].label, value)) { - test_state = pm_states[i].state; + suspend_type = strsep(&value, ","); + if (!suspend_type) + return 0; + + repeat = strsep(&value, ","); + if (repeat) { + if (kstrtou32(repeat, 0, &test_repeat_count_max)) + return 0; + } + + for (i = 0; pm_labels[i]; i++) + if (!strcmp(pm_labels[i], suspend_type)) { + test_state_label = pm_labels[i]; return 0; } - printk(warn_bad_state, value); + printk(warn_bad_state, suspend_type); return 0; } __setup("test_suspend", setup_test_suspend); @@ -158,13 +184,21 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? 
*/ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!pm_states[test_state].state) { - printk(warn_bad_state, pm_states[test_state].label); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? */ @@ -173,13 +207,12 @@ static int __init test_suspend(void) rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aaa3261dea5d..570aff817543 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -30,6 +30,7 @@ #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> +#include <linux/ktime.h> #include "power.h" @@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); @@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) @@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } @@ 
-580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; size_t off; unsigned thr, run_threads, nr_threads; unsigned char *page = NULL; @@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { @@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle, out_finish: err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: if (crc) { if (crc->thr) @@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle, { unsigned int m; int ret = 0; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; struct bio *bio; int err2; unsigned nr_pages; @@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) { @@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle, if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } @@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle, int ret = 0; int eof = 0; struct bio *bio; - struct 
timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; unsigned nr_pages; size_t off; unsigned i, thr, run_threads, nr_threads; @@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -1343,7 +1344,7 @@ out_finish: wait_event(crc->done, atomic_read(&crc->stop)); atomic_set(&crc->stop, 0); } - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) { printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); @@ -1359,7 +1360,7 @@ out_finish: } } } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); @@ -1374,7 +1375,7 @@ out_clean: kthread_stop(data[thr].thr); vfree(data); } - if (page) vfree(page); + vfree(page); return ret; } diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index cbd69d842341..2ca4a8b5fe57 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -3,7 +3,7 @@ struct console_cmdline { - char name[8]; /* Name of the driver */ + char name[16]; /* Name of the driver */ int index; /* Minor dev. 
to use */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..c099b082cd02 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -45,6 +44,8 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -56,14 +57,11 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; -/* Deferred messaged from sched code are marked by this special level */ -#define SCHED_MESSAGE_LOGLEVEL -2 - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -113,9 +111,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +144,8 @@ static int console_may_schedule; * the overall length of the record. 
* * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +255,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -270,6 +268,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* Return log buffer address */ +char *log_buf_addr_get(void) +{ + return log_buf; +} + +/* Return log buffer size */ +u32 log_buf_len_get(void) +{ + return log_buf_len; +} + /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -344,7 +354,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +463,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -471,7 +477,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -static int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, bool from_file) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -509,14 
+515,13 @@ struct devkmsg_user { char buf[8192]; }; -static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iov_length(iv, count); + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) @@ -525,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, if (buf == NULL) return -ENOMEM; - line = buf; - for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { - ret = -EFAULT; - goto out; - } - line += iv[i].iov_len; + buf[len] = '\0'; + if (copy_from_iter(buf, len, from) != len) { + kfree(buf); + return -EFAULT; } /* @@ -557,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = endp; } } - line[len] = '\0'; printk_emit(facility, level, NULL, 0, "%s", line); -out: kfree(buf); return ret; } @@ -792,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file) const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, - .aio_write = devkmsg_writev, + .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, @@ -828,34 +828,80 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init 
log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +#ifdef CONFIG_SMP +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) + +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} +#else /* !CONFIG_SMP */ +static inline void log_buf_add_cpu(void) {} +#endif /* CONFIG_SMP */ + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +918,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -881,7 +927,7 @@ static bool __read_mostly 
ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -889,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str) early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); +MODULE_PARM_DESC(ignore_loglevel, + "ignore loglevel setting (prints all kernel messages to the console)"); #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -947,11 +993,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1214,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; - static int saved_console_loglevel = -1; + static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, from_file); @@ -1271,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) + if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { + if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ @@ -1291,7 +1333,7 @@ int 
do_syslog(int type, char __user *buf, int len, bool from_file) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; error = 0; break; /* Number of chars in the log buffer */ @@ -1310,7 +1352,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1377,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len) } /* - * Zap console related locks when oopsing. Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. + * Zap console related locks when oopsing. + * To leave time for slow consoles to print a full oops, + * only zap at most once every 30 seconds. */ static void zap_locks(void) { static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -1416,10 +1458,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. 
*/ static inline int can_use_console(unsigned int cpu) { @@ -1432,8 +1473,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1476,7 +1519,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1581,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, int printed_len = 0; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ - static volatile unsigned int logbuf_cpu = UINT_MAX; + static unsigned int logbuf_cpu = UINT_MAX; - if (level == SCHED_MESSAGE_LOGLEVEL) { - level = -1; + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; in_sched = true; } @@ -1608,7 +1651,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1617,27 +1661,22 @@ asmlinkage int vprintk_emit(int facility, int level, raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; - if (recursion_bug) { + if (unlikely(recursion_bug)) { static const char recursion_msg[] = "BUG: recent printk recursion!"; recursion_bug = 0; - text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, text_len); + NULL, 0, recursion_msg, + strlen(recursion_msg)); } /* * The printf needs to come first; we need the syslog * prefix which 
might be passed-in as a parameter. */ - if (in_sched) - text_len = scnprintf(text, sizeof(textbuf), - KERN_WARNING "[sched_delayed] "); - - text_len += vscnprintf(text + text_len, - sizeof(textbuf) - text_len, fmt, args); + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1653,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *end_of_header = printk_skip_level(text); switch (kern_level) { case '0' ... '7': - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; + /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } @@ -1668,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, } } - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dict) @@ -1716,28 +1756,37 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. 
*/ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, -1, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1756,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); +int vprintk_default(const char *fmt, va_list args) +{ + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); + return r; + } +#endif + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + + return r; +} +EXPORT_SYMBOL_GPL(vprintk_default); + +/* + * This allows printk to be diverted to another function per cpu. + * This is useful for calling printk functions from within NMI + * without worrying about race conditions that can lock up the + * box. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; + /** * printk - print a kernel message * @fmt: format string @@ -1779,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit); */ asmlinkage __visible int printk(const char *fmt, ...) { + printk_func_t vprintk_func; va_list args; int r; -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif va_start(args, fmt); - r = vprintk_emit(0, -1, NULL, 0, fmt, args); + + /* + * If a caller overrides the per_cpu printk_func, then it needs + * to disable preemption when calling printk(). Otherwise + * the printk_func should be set to the default. No need to + * disable preemption here. 
+ */ + vprintk_func = this_cpu_read(printk_func); + r = vprintk_func(fmt, args); + va_end(args); return r; @@ -1802,7 +1877,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1825,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +/* Still needs to be defined for users */ +DEFINE_PER_CPU(printk_func_t, printk_func); + #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; + char buf[512]; + int n; + + if (!early_console) + return; va_start(ap, fmt); - early_vprintk(fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); + + early_console->write(early_console, buf, n); } #endif @@ -1881,11 +1956,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. 
*/ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1978,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1988,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1940,31 +2017,12 @@ int add_preferred_console(char *name, int idx, char *options) return __add_preferred_console(name, idx, options, NULL); } -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) - if (strcmp(c->name, name) == 0 && c->index == idx) { - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. 
+ * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2360,9 +2418,6 @@ void register_console(struct console *newcon) if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (newcon->early_setup) - newcon->early_setup(); - /* * See if we want to use this console driver. If we * didn't select a console we take the first one @@ -2388,22 +2443,27 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; - if (_braille_register_console(newcon, c)) - return; + if (_braille_register_console(newcon, c)) + return; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + break; + } - if (newcon->setup && - newcon->setup(newcon, console_cmdline[i].options) != 0) - break; newcon->flags |= CON_ENABLED; - newcon->index = c->index; if (i == selected_console) { newcon->flags |= CON_CONSDEV; preferred_console = selected_console; @@ -2570,7 +2630,7 @@ void wake_up_klogd(void) preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } @@ -2582,11 +2642,11 @@ int printk_deferred(const char *fmt, ...) 
preempt_disable(); va_start(args, fmt); - r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); return r; @@ -2618,14 +2678,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c index 54bf5ba26420..a7bcd28d6e9f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -422,8 +422,7 @@ void profile_tick(int type) static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - seq_cpumask(m, prof_cpu_mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index adf98622cb32..c8e0e050a36a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,12 +28,6 @@ #include <linux/compat.h> -static int ptrace_trapping_sleep_fn(void *flags) -{ - schedule(); - return 0; -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. 
@@ -371,7 +365,7 @@ unlock_creds: out: if (!retval) { wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); proc_ptrace_connector(task, PTRACE_ATTACH); } @@ -462,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) static int ptrace_detach(struct task_struct *child, unsigned int data) { - bool dead = false; - if (!valid_signal(data)) return -EIO; @@ -473,54 +465,38 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) write_lock_irq(&tasklist_lock); /* - * This child can be already killed. Make sure de_thread() or - * our sub-thread doing do_wait() didn't do release_task() yet. + * We rely on ptrace_freeze_traced(). It can't be killed and + * untraced by another thread, it can't be a zombie. */ - if (child->ptrace) { - child->exit_code = data; - dead = __ptrace_detach(current, child); - } + WARN_ON(!child->ptrace || child->exit_state); + /* + * tasklist_lock avoids the race with wait_task_stopped(), see + * the comment in ptrace_resume(). + */ + child->exit_code = data; + __ptrace_detach(current, child); write_unlock_irq(&tasklist_lock); proc_ptrace_connector(child, PTRACE_DETACH); - if (unlikely(dead)) - release_task(child); return 0; } /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. 
*/ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); + list_add(&p->ptrace_entry, dead); } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) @@ -720,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { + bool need_siglock; + if (!valid_signal(data)) return -EIO; @@ -747,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request, user_disable_single_step(child); } + /* + * Change ->exit_code and ->state under siglock to avoid the race + * with wait_task_stopped() in between; a non-zero ->exit_code will + * wrongly look like another report from tracee. + * + * Note that we need siglock even if ->exit_code == data and/or this + * status was not reported yet, the new status must not be cleared by + * wait_task_stopped() after resume. + * + * If data == 0 we do not care if wait_task_stopped() reports the old + * status and clears the code too; this can't race with the tracee, it + * takes siglock after resume. 
+ */ + need_siglock = data && !thread_group_empty(current); + if (need_siglock) + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; wake_up_state(child, __TASK_TRACED); + if (need_siglock) + spin_unlock_irq(&child->sighand->siglock); return 0; } @@ -1100,7 +1096,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, } #if defined CONFIG_COMPAT -#include <linux/compat.h> int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data) diff --git a/kernel/range.c b/kernel/range.c index 322ea8e93e4b..82cfc285b046 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; const struct range *r2 = x2; - s64 start1, start2; - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; + if (r1->start < r2->start) + return -1; + if (r1->start > r2->start) + return 1; + return 0; } int clean_sort_range(struct range *range, int az) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,6 +1,7 @@ -obj-y += update.o srcu.o +obj-y += update.o +obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda2726ca45..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) void kfree(const void *); +/* + * Reclaim the specified callback, either by invoking it (non-lazy case) + * or freeing it directly (lazy case). Return true if lazy, false otherwise. 
+ */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; @@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); - return 1; + return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); rcu_lock_release(&rcu_callback_map); - return 0; + return false; } } @@ -131,4 +135,12 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +void rcu_early_boot_tests(void); + +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -49,11 +49,19 @@ #include <linux/trace_clock.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include <linux/vmalloc.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +torture_param(int, cbflood_inter_holdoff, HZ, + "Holdoff between floods (jiffies)"); +torture_param(int, cbflood_intra_holdoff, 1, + "Holdoff between bursts (jiffies)"); +torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); +torture_param(int, cbflood_n_per_burst, 20000, + "# callbacks per burst in flood"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); static int nrealreaders; +static int ncbflooders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; @@ -138,6 +148,7 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; +static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; static int rcu_torture_writer_state; @@ -157,9 +168,9 @@ static int rcu_torture_writer_state; #else #define RCUTORTURE_RUNNABLE_INIT 0 #endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); +static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 @@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void) #endif /* #else #ifdef CONFIG_RCU_TRACE */ 
static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ +static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ static bool barrier_phase; /* Test phase. */ @@ -233,7 +244,8 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*started)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -242,7 +254,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - void (*stats)(char *page); + void (*stats)(void); int irq_capable; int can_boost; const char *name; @@ -285,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. 
*/ @@ -345,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -366,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .started = rcu_batches_started, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -396,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -412,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -455,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -499,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -525,21 +530,21 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static void srcu_torture_stats(char *page) +static void srcu_torture_stats(void) { int cpu; int idx = srcu_ctl.completed & 0x1; - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); + pr_alert("%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; - page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } - sprintf(page, "\n"); + pr_cont("\n"); } static void srcu_torture_synchronize_expedited(void) @@ -553,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -589,7 +595,8 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -601,6 +608,53 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks torture testing. + */ + +static int tasks_torture_read_lock(void) +{ + return 0; +} + +static void tasks_torture_read_unlock(int idx) +{ +} + +static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_ops = { + .ttype = RCU_TASKS_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .deferred_free = rcu_tasks_torture_deferred_free, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .call = call_rcu_tasks, + .cb_barrier = rcu_barrier_tasks, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks" +}; + +#define RCUTORTURE_TASKS_OPS &tasks_ops, + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUTORTURE_TASKS_OPS + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -667,7 +721,7 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -707,6 +761,59 @@ checkwait: stutter_wait("rcu_torture_boost"); return 0; } +static void rcu_torture_cbflood_cb(struct rcu_head *rhp) +{ +} + +/* + * RCU torture callback-flood kthread. 
Repeatedly induces bursts of calls + * to call_rcu() or analogous, increasing the probability of occurrence + * of callback-overflow corner cases. + */ +static int +rcu_torture_cbflood(void *arg) +{ + int err = 1; + int i; + int j; + struct rcu_head *rhp; + + if (cbflood_n_per_burst > 0 && + cbflood_inter_holdoff > 0 && + cbflood_intra_holdoff > 0 && + cur_ops->call && + cur_ops->cb_barrier) { + rhp = vmalloc(sizeof(*rhp) * + cbflood_n_burst * cbflood_n_per_burst); + err = !rhp; + } + if (err) { + VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); + while (!torture_must_stop()) + schedule_timeout_interruptible(HZ); + return 0; + } + VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); + do { + schedule_timeout_interruptible(cbflood_inter_holdoff); + atomic_long_inc(&n_cbfloods); + WARN_ON(signal_pending(current)); + for (i = 0; i < cbflood_n_burst; i++) { + for (j = 0; j < cbflood_n_per_burst; j++) { + cur_ops->call(&rhp[i * cbflood_n_per_burst + j], + rcu_torture_cbflood_cb); + } + schedule_timeout_interruptible(cbflood_intra_holdoff); + WARN_ON(signal_pending(current)); + } + cur_ops->cb_barrier(); + stutter_wait("rcu_torture_cbflood"); + } while (!torture_must_stop()); + vfree(rhp); + torture_kthread_stopping("rcu_torture_cbflood"); + return 0; +} + /* * RCU torture force-quiescent-state kthread. 
Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -746,6 +853,8 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool can_expedite = !rcu_gp_is_expedited(); + int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_sync1 = gp_sync; @@ -758,9 +867,15 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Testing of dynamic grace-period expediting diabled.\n", + torture_type); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; @@ -842,9 +957,26 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); + /* Cycle through nesting levels of rcu_expedite_gp() calls. */ + if (can_expedite && + !(torture_random(&rand) & 0xff & (!!expediting - 1))) { + WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); + if (expediting >= 0) + rcu_expedite_gp(); + else + rcu_unexpedite_gp(); + if (++expediting > 3) + expediting = -expediting; + } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); + /* Reset expediting back to unexpedited. 
*/ + if (expediting > 0) + expediting = -expediting; + while (can_expedite && expediting++ < 0) + rcu_unexpedite_gp(); + WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -905,8 +1037,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long started; + unsigned long completed; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -914,7 +1046,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -937,14 +1072,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; @@ -963,8 +1100,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long started; + unsigned long completed; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -983,7 +1120,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1004,14 +1144,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1019,7 +1161,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1031,10 +1173,15 @@ rcu_torture_reader(void *arg) } /* - * Create an RCU-torture statistics message in the specified buffer. + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! 
This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). */ static void -rcu_torture_printk(char *page) +rcu_torture_stats_print(void) { int cpu; int i; @@ -1052,55 +1199,61 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page = torture_onoff_stats(page); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + pr_cont("rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + torture_onoff_stats(); + pr_cont("barrier: %ld/%ld:%ld ", + n_barrier_successes, + 
n_barrier_attempts, + n_rcu_torture_barrier_error); + pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - page += sprintf(page, "!!! "); + pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - page += sprintf(page, "Reader Pipe: "); + pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); + pr_cont(" %ld", pipesummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); + pr_cont(" %ld", batchsummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); + pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); } - page += sprintf(page, "\n"); + pr_cont("\n"); + if (cur_ops->stats) - cur_ops->stats(page); + cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags; @@ -1109,10 +1262,9 @@ rcu_torture_printk(char *page) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - page += sprintf(page, - "??? Writer stall state %d g%lu c%lu f%#x\n", - rcu_torture_writer_state, - gpnum, completed, flags); + pr_alert("??? 
Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); show_rcu_gp_kthreads(); rcutorture_trace_dump(); } @@ -1120,30 +1272,6 @@ rcu_torture_printk(char *page) } /* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - -/* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. */ @@ -1295,7 +1423,8 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - cur_ops->cb_barrier(); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; @@ -1323,6 +1452,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; @@ -1418,7 +1550,7 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - if (torture_cleanup()) { + if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; @@ -1447,6 +1579,8 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); + for (i = 0; i < ncbflooders; i++) + torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); @@ -1468,6 +1602,7 @@ rcu_torture_cleanup(void) "End of test: RCU_HOTPLUG"); else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -1534,9 +1669,10 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, + RCUTORTURE_TASKS_OPS }; - if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. 
*/ @@ -1693,6 +1829,24 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); + if (cbflood_n_burst > 0) { + /* Create the cbflood threads */ + ncbflooders = (num_online_cpus() + 3) / 4; + cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), + GFP_KERNEL); + if (!cbflood_task) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < ncbflooders; i++) { + firsterr = torture_create_kthread(rcu_torture_cbflood, + NULL, + cbflood_task[i]); + if (firsterr) + goto unwind; + } + } rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556f3fa0..cad76e76b4e7 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; + __this_cpu_inc(sp->per_cpu_ref->seq[idx]); preempt_enable(); return idx; } @@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, } EXPORT_SYMBOL_GPL(call_srcu); -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_srcu() instance now that a - * grace period has elapsed. 
- */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - static void srcu_advance_batches(struct srcu_struct *sp, int trycount); static void srcu_reschedule(struct srcu_struct *sp); @@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_expedited + __synchronize_srcu(sp, rcu_gp_is_expedited() ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } @@ -546,7 +529,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. 
*/ -static void rcu_idle_enter_common(long long newval) -{ - if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), - rcu_dynticks_nesting, newval)); - rcu_dynticks_nesting = newval; - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); - if (!is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), - rcu_dynticks_nesting, newval)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ - barrier(); - rcu_dynticks_nesting = newval; -} - /* * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). + * entered that mode. */ void rcu_idle_enter(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) - newval = 0; - else - newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_irq_exit(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - newval = rcu_dynticks_nesting - 1; - WARN_ON_ONCE(newval < 0); - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. 
*/ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit idle, so that we are no longer in an extended quiescent state. */ void rcu_idle_exit(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) - rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_irq_enter(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_enter); @@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); */ bool notrace __rcu_is_watching(void) { - return rcu_dynticks_nesting; + return true; } EXPORT_SYMBOL(__rcu_is_watching); #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ /* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. 
- */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 1; -} - -/* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). @@ -203,8 +103,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { RCU_TRACE(reset_cpu_stall_ticks(rcp)); - if (rcp->rcucblist != NULL && - rcp->donetail != rcp->curtail) { + if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; } @@ -217,7 +116,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { unsigned long flags; @@ -231,7 +130,7 @@ void rcu_sched_qs(int cpu) /* * Record an rcu_bh quiescent state. */ -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { unsigned long flags; @@ -247,13 +146,15 @@ void rcu_bh_qs(int cpu) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); - if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); + if (user) + rcu_sched_qs(); else if (!in_softirq()) - rcu_bh_qs(cpu); + rcu_bh_qs(); + if (user) + rcu_note_voluntary_context_switch(current); } /* @@ -267,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) unsigned long flags; RCU_TRACE(int cb_count = 0); - /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - !!ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - false)); - return; - } - /* Move the ready-to-invoke callbacks to a local list. 
*/ local_irq_save(flags); RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); @@ -355,6 +245,11 @@ static void __call_rcu(struct rcu_head *head, rcp->curtail = &head->next; RCU_TRACE(rcp->qlen++); local_irq_restore(flags); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_sched_qs() */ + resched_cpu(0); + } } /* @@ -378,7 +273,11 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_init(void) +void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); + + rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..f94e209a10d6 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 625d0b0cd75a..233165da782f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,11 +79,22 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; * the tracing userspace tools to be able to decipher the string * 
address to the matching string. */ -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +#ifdef CONFIG_TRACING +# define DEFINE_RCU_TPS(sname) \ static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; +# define RCU_STATE_NAME(sname) sname##_varname +#else +# define DEFINE_RCU_TPS(sname) +# define RCU_STATE_NAME(sname) __stringify(sname) +#endif + +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +DEFINE_RCU_TPS(sname) \ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ + .rda = &sname##_data, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ @@ -92,11 +103,9 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = sname##_varname, \ + .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ -}; \ -DEFINE_PER_CPU(struct rcu_data, sname##_data) +} RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -143,23 +152,22 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. 
- */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + +/* Delay in jiffies for grace-period initialization delays. */ +static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) + ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY + : 0; +module_param(gp_init_delay, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -173,6 +181,17 @@ unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; /* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return ACCESS_ONCE(rnp->qsmaskinitnext); +} + +/* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -188,22 +207,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * one since the start of the grace period, this just sets a flag. * The caller must have disabled preemption. 
*/ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + } } -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_bh"), + __this_cpu_read(rcu_bh_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + } } static DEFINE_PER_CPU(int, rcu_sched_qs_mask); @@ -217,6 +238,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -275,17 +299,33 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled preemption. */ -void rcu_note_context_switch(int cpu) +void rcu_note_context_switch(void) { trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); + rcu_sched_qs(); + rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiescent state for all RCU flavors. 
If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desperate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -314,21 +354,57 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); -static int rcu_pending(int cpu); +static int rcu_pending(void); + +/* + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU BH batches started thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. 
+ */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } @@ -353,6 +429,15 @@ void rcu_bh_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); /* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + +/* * Show the state of the grace-period kthreads. */ void show_rcu_gp_kthreads(void) @@ -426,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum) EXPORT_SYMBOL_GPL(rcutorture_record_progress); /* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - -/* * Does the CPU have callbacks ready to be invoked? */ static int @@ -499,11 +575,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. 
*/ -static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, - bool user) +static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { @@ -520,12 +596,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } - rcu_prepare_for_idle(smp_processor_id()); + rcu_prepare_for_idle(); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_task_enter(); /* * It is illegal to enter an extended quiescent state while @@ -553,7 +630,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(rdtp, oldval, user); + rcu_eqs_enter_common(oldval, user); } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } @@ -577,7 +654,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -627,8 +704,8 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, true); - rcu_sysidle_enter(rdtp, 1); + rcu_eqs_enter_common(oldval, true); + rcu_sysidle_enter(1); local_irq_restore(flags); } @@ -639,15 +716,17 @@ void rcu_irq_exit(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. 
*/ -static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, - int user) +static void rcu_eqs_exit_common(long long oldval, int user) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -678,7 +757,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(oldval, user); } } @@ -699,7 +778,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -750,47 +829,79 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, true); - rcu_sysidle_exit(rdtp, 1); + rcu_eqs_exit_common(oldval, true); + rcu_sysidle_exit(1); local_irq_restore(flags); } /** * rcu_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. 
This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) */ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + smp_mb__before_atomic(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* atomic_inc() before later RCU read-side crit sects */ + smp_mb__after_atomic(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + incby = 1; + } + rdtp->dynticks_nmi_nesting += incby; + barrier(); } /** * rcu_nmi_exit - inform RCU of exit from NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. 
*/ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + rdtp->dynticks_nmi_nesting -= 2; return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + rdtp->dynticks_nmi_nesting = 0; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -819,7 +930,7 @@ bool notrace __rcu_is_watching(void) */ bool notrace rcu_is_watching(void) { - int ret; + bool ret; preempt_disable(); ret = __rcu_is_watching(); @@ -862,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; - ret = (rdp->grpmask & rnp->qsmaskinit) || + ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -897,17 +1008,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } /* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. 
- */ -extern void resched_cpu(int cpu); - -/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1010,13 +1118,26 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); } /* - * Dump stacks of all tasks running on stalled CPUs. This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); +} + +/* + * Dump stacks of all tasks running on stalled CPUs. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1035,11 +1156,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1077,30 +1200,35 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. 
- */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs, + rcu_get_root(rsp)->qsmask); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1125,8 +1253,10 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (!trigger_all_cpu_backtrace()) - dump_stack(); + + rcu_check_gp_kthread_starvation(rsp); + + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) @@ -1196,7 +1326,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. 
*/ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1218,20 +1348,30 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's callback list to empty. + * Initialize the specified rcu_data structure's default callback list + * to empty. The default callback list is the one that is not used by + * no-callbacks CPUs. */ -static void init_callback_list(struct rcu_data *rdp) +static void init_default_callback_list(struct rcu_data *rdp) { int i; - if (init_nocb_callback_list(rdp)) - return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; } /* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + if (init_nocb_callback_list(rdp)) + return; + init_default_callback_list(rdp); +} + +/* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that * a CPU can invoke callbacks in a timely fashion even if that CPU has @@ -1305,10 +1445,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * believe that a grace period is in progress, then we must wait * for the one following, which is in "c". Because our request * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. + * need to explicitly start one. We only do the lockless check + * of rnp_root's fields if the current rcu_node structure thinks + * there is no grace period in flight, and because we hold rnp->lock, + * the only possible change is when rnp_root's two fields are + * equal, in which case rnp_root->gpnum might be concurrently + * incremented. But that is OK, as it will just result in our + * doing some extra useless work. 
*/ if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; @@ -1527,7 +1673,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. */ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1542,7 +1689,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1551,8 +1698,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1566,7 +1715,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. 
*/ local_irq_restore(flags); return; @@ -1583,10 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_gp_init(struct rcu_state *rsp) { + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - rcu_bind_gp_kthread(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (!ACCESS_ONCE(rsp->gp_flags)) { @@ -1612,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); - /* Exclude any concurrent CPU-hotplug operations. */ - mutex_lock(&rsp->onoff_mutex); - smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ + /* + * Apply per-leaf buffered online and offline operations to the + * rcu_node tree. Note that this new grace period need not wait + * for subsequent online CPUs, and that quiescent-state forcing + * will handle subsequent offline CPUs. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); + if (rnp->qsmaskinit == rnp->qsmaskinitnext && + !rnp->wait_blkd_tasks) { + /* Nothing to do on this leaf rcu_node structure. */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Record old state, apply changes to ->qsmaskinit field. */ + oldmask = rnp->qsmaskinit; + rnp->qsmaskinit = rnp->qsmaskinitnext; + + /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ + if (!oldmask != !rnp->qsmaskinit) { + if (!oldmask) /* First online CPU for this rcu_node. */ + rcu_init_new_rnp(rnp); + else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ + rnp->wait_blkd_tasks = true; + else /* Last offline CPU and can propagate. */ + rcu_cleanup_dead_rnp(rnp); + } + + /* + * If all waited-on tasks from prior grace period are + * done, and if all this rcu_node structure's CPUs are + * still offline, propagate up the rcu_node tree and + * clear ->wait_blkd_tasks. 
Otherwise, if one of this + * rcu_node structure's CPUs has since come back online, + * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() + * checks for this, so just call it unconditionally). + */ + if (rnp->wait_blkd_tasks && + (!rcu_preempt_has_tasks(rnp) || + rnp->qsmaskinit)) { + rnp->wait_blkd_tasks = false; + rcu_cleanup_dead_rnp(rnp); + } + + raw_spin_unlock_irq(&rnp->lock); + } /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1636,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); - ACCESS_ONCE(rnp->completed) = rsp->completed; + if (WARN_ON_ONCE(rnp->completed != rsp->completed)) + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1645,15 +1841,14 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && - system_state == SYSTEM_RUNNING) - udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ - cond_resched(); + cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && + gp_init_delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * 10))) + schedule_timeout_uninterruptible(gp_init_delay); } - mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -1667,11 +1862,12 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. 
*/ if (is_sysidle_rcu_state(rsp)) { - isidle = 1; + isidle = true; maxj = jiffies - ULONG_MAX / 4; } force_qs_rnp(rsp, dyntick_save_progress_counter, @@ -1680,14 +1876,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = 0; + isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1704,6 +1901,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1732,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + WARN_ON_ONCE(rnp->qsmask); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) @@ -1739,7 +1939,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1774,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. 
*/ @@ -1788,8 +1990,9 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched(); - flush_signals(current); + cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwaitsig")); @@ -1831,11 +2034,13 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched(); + cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. */ - cond_resched(); - flush_signals(current); + cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswaitsig")); @@ -1931,32 +2136,39 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* * Similar to rcu_report_qs_rdp(), for which it is a helper function. * Allows quiescent states for a group of CPUs to be reported at one go * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. + * must be represented by the same rcu_node structure (which need not be a + * leaf rcu_node structure, though it often will be). The gps parameter + * is the grace-period snapshot, which means that the quiescent states + * are valid only if rnp->gpnum is equal to gps. That structure's lock + * must be held upon entry, and it is released before return. 
*/ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) + struct rcu_node *rnp, unsigned long gps, unsigned long flags) __releases(rnp->lock) { + unsigned long oldmask = 0; struct rcu_node *rnp_c; /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask)) { + if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { - /* Our bit has already been cleared, so done. */ + /* + * Our bit has already been cleared, or the + * relevant grace period is already over, so done. + */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -1980,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp_c->qsmask); + oldmask = rnp_c->qsmask; } /* @@ -1992,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, } /* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long gps; + unsigned long mask; + struct rcu_node *rnp_p; + + if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || + rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! 
*/ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Only one rcu_node structure in the tree, so don't + * try to report up to its nonexistent parent! + */ + rcu_report_qs_rsp(rsp, flags); + return; + } + + /* Report up the rest of the hierarchy, tracking current ->gpnum. */ + gps = rnp->gpnum; + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); +} + +/* * Record a quiescent state for the specified CPU to that CPU's rcu_data * structure. This must be either called from the specified CPU, or * called when the specified CPU is known to be offline (and when it is @@ -2011,8 +2263,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2021,6 +2275,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. 
*/ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2036,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2065,7 +2321,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -2128,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } - /* Finally, initialize the rcu_data structure's list to empty. */ + /* + * Finally, initialize the rcu_data structure's list to empty and + * disallow further callbacks on this CPU. + */ init_callback_list(rdp); + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } /* @@ -2196,6 +2457,67 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) } /* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. 
Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* GP memory ordering. */ + rnp->qsmaskinit &= ~mask; + rnp->qsmask &= ~mask; + if (rnp->qsmaskinit) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. 
Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2205,64 +2527,21 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; - int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ - - /* Exclude any attempts to start a new grace period. */ - mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. 
*/ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - init_callback_list(rdp); - /* Disallow further callbacks on this CPU. */ - rdp->nxttail[RCU_NEXT_TAIL] = NULL; - mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2271,6 +2550,14 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } +static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +} + +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -2347,7 +2634,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) -= count; + ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -2378,7 +2665,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); @@ -2396,8 +2683,8 @@ void rcu_check_callbacks(int cpu, int user) * at least not while the corresponding CPU is online. */ - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); + rcu_sched_qs(); + rcu_bh_qs(); } else if (!in_softirq()) { @@ -2408,11 +2695,13 @@ void rcu_check_callbacks(int cpu, int user) * critical section, so note it. 
*/ - rcu_bh_qs(cpu); + rcu_bh_qs(); } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) + rcu_preempt_check_callbacks(); + if (rcu_pending()) invoke_rcu_core(); + if (user) + rcu_note_voluntary_context_switch(current); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -2435,7 +2724,7 @@ static void force_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); + cond_resched_rcu_qs(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -2444,32 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp, return; } if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; + if (rcu_state_p == &rcu_sched_state || + rsp != rcu_state_p || + rcu_preempt_blocked_readers_cgp(rnp)) { + /* + * No point in scanning bits because they + * are all zero. But we might need to + * priority-boost blocked readers. + */ + rcu_initiate_boost(rnp, flags); + /* rcu_initiate_boost() releases rnp->lock */ + continue; + } + if (rnp->parent && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period + * initialization and task exiting RCU + * read-side critical section: Report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + /* rcu_report_unblock_qs_rnp() rlses ->lock */ + continue; + } } cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) != 0) - *isidle = 0; + if ((rnp->qsmaskinit & bit) == 0) + *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } } if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; + /* Idle/offline CPUs, report (releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + } else { + /* Nothing to do here, so just drop the lock. 
*/ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2485,14 +2789,14 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rnp = __this_cpu_read(rsp->rda->mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; return; } rnp_old = rnp; @@ -2504,13 +2808,14 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - ACCESS_ONCE(rsp->n_force_qs_lh)++; + rsp->n_force_qs_lh++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2569,7 +2874,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) * Schedule RCU callback invocation. If the specified type of RCU * does not support RCU priority boosting, just do a direct call, * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the + * are running on the current CPU with softirqs disabled, the * rcu_cpu_kthread_task cannot disappear out from under us. 
*/ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -2601,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) + if (!rcu_is_watching()) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ @@ -2662,7 +2967,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ ACCESS_ONCE(head->func) = rcu_leak_callback; @@ -2687,13 +2992,24 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* _call_rcu() is illegal on offline CPU; leak the callback. */ - local_irq_restore(flags); - return; + if (likely(rdp->mynode)) { + /* Post-boot, so this should be for a no-CBs CPU. */ + offline = !__call_rcu_nocb(rdp, head, lazy, flags); + WARN_ON_ONCE(offline); + /* Offline CPU, _call_rcu() illegal, leak callback. */ + local_irq_restore(flags); + return; + } + /* + * Very early boot, before rcu_init(). Initialize if needed + * and then drop through to queue the callback. 
+ */ + BUG_ON(cpu != -1); + WARN_ON_ONCE(!rcu_is_watching()); + if (!likely(rdp->nxtlist)) + init_default_callback_list(rdp); } - ACCESS_ONCE(rdp->qlen)++; + ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) rdp->qlen_lazy++; else @@ -2814,7 +3130,7 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_sched_expedited(); else wait_rcu_gp(call_rcu_sched); @@ -2841,7 +3157,7 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_bh_expedited(); else wait_rcu_gp(call_rcu_bh); @@ -2928,11 +3244,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. - * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and * sync_sched_expedited_done taking on the roles of the halves @@ -2956,6 +3267,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { + cpumask_var_t cm; + bool cma = false; + int cpu; long firstsnap, s, snap; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; @@ -2982,14 +3296,34 @@ void synchronize_sched_expedited(void) */ snap = atomic_long_inc_return(&rsp->expedited_start); firstsnap = snap; - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, fall back to normal GP. 
*/ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + return; + } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ + cma = zalloc_cpumask_var(&cm, GFP_KERNEL); + if (cma) { + cpumask_copy(cm, cpu_online_mask); + cpumask_clear_cpu(raw_smp_processor_id(), cm); + for_each_cpu(cpu, cm) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + cpumask_clear_cpu(cpu, cm); + } + if (cpumask_weight(cm) == 0) + goto all_cpus_idle; + } + /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cpu_online_mask, + while (try_stop_cpus(cma ? cm : cpu_online_mask, synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); @@ -3001,6 +3335,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); + free_cpumask_var(cm); return; } @@ -3010,6 +3345,7 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } @@ -3019,6 +3355,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); + free_cpumask_var(cm); return; } @@ -3029,12 +3366,21 @@ void synchronize_sched_expedited(void) * and they started after our first try, so their grace * period works for us. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, use normal GP. */ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); + return; + } snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). 
*/ } atomic_long_inc(&rsp->expedited_stoppedcpus); +all_cpus_idle: + free_cpumask_var(cm); + /* * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still @@ -3079,9 +3425,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3105,7 +3454,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -3126,12 +3476,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -static int rcu_pending(int cpu) +static int rcu_pending(void) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) return 1; return 0; } @@ -3141,7 +3491,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) 
*/ -static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3149,7 +3499,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!rdp->nxtlist) continue; hc = true; @@ -3257,7 +3607,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * ACCESS_ONCE() to prevent the compiler from speculating * the increment to precede the early-exit check. */ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ @@ -3282,11 +3632,17 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + smp_mb__before_atomic(); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); @@ -3307,7 +3663,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Increment ->n_barrier_done to prevent duplicate work. */ smp_mb(); /* Keep increment after above mechanism. 
*/ - ACCESS_ONCE(rsp->n_barrier_done)++; + ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ @@ -3338,6 +3694,28 @@ void rcu_barrier_sched(void) EXPORT_SYMBOL_GPL(rcu_barrier_sched); /* + * Propagate ->qsmaskinit bits up the rcu_node tree to account for the + * first CPU in a given leaf rcu_node structure coming online. The caller + * must hold the corresponding leaf rcu_node ->lock with interrupts + * disabled. + */ +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (rnp == NULL) + return; + raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + rnp->qsmaskinit |= mask; + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + } +} + +/* * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init @@ -3350,9 +3728,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -3376,48 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); - /* Exclude new grace periods. */ - mutex_lock(&rsp->onoff_mutex); - /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. 
*/ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (!rdp->nxtlist) + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* Add CPU to rcu_node bitmasks. */ + /* + * Add CPU to leaf rcu_node pending-online bitmask. Any needed + * propagation up the rcu_node tree will happen at the beginning + * of the next grace period. + */ rnp = rdp->mynode; mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - local_irq_restore(flags); - - mutex_unlock(&rsp->onoff_mutex); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rnp->qsmaskinitnext |= mask; + rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ + rdp->completed = rnp->completed; + rdp->passed_quiesce = false; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->qs_pending = false; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } static void rcu_prepare_cpu(int cpu) @@ -3431,20 +3795,20 @@ static void rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -static int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: rcu_prepare_cpu(cpu); rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); break; case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -3458,17 +3822,23 @@ static int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; + case CPU_DYING_IDLE: + for_each_rcu_flavor(rsp) { + rcu_cleanup_dying_idle_cpu(cpu, rsp); + } + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) + for_each_rcu_flavor(rsp) { rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } break; default: break; } - trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } @@ -3479,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self, case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedited = 1; + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - rcu_expedited = 0; + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. 
*/ + rcu_unexpedite_gp(); break; default: break; @@ -3492,24 +3863,44 @@ static int rcu_pm_notify(struct notifier_block *self, } /* - * Spawn the kthread that handles this RCU flavor's grace periods. + * Spawn the kthreads that handle each RCU flavor's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_spawn_nocb_kthreads(rsp); } + rcu_spawn_nocb_kthreads(); + rcu_spawn_boost_kthreads(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -3533,30 +3924,26 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 
*/ -#ifdef CONFIG_RCU_FANOUT_EXACT static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } } } -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ /* * Helper function for rcu_init() that initializes one rcu_state structure. 
@@ -3564,14 +3951,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = { + "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static const char * const fqs[] = { + "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ static u8 fl_mask = 0x1; int cpustride = 1; int i; @@ -3630,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { @@ -3723,6 +4111,8 @@ void __init rcu_init(void) { int cpu; + rcu_early_boot_tests(); + rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(&rcu_bh_state, &rcu_bh_data); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79c5b7d..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,7 +27,6 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> -#include <linux/irq_work.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -139,15 +138,23 @@ struct rcu_node { unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ - /* complete (only for TREE_PREEMPT_RCU). */ + /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ + /* Initialized from ->qsmaskinitnext at the */ + /* beginning of each grace period. */ + unsigned long qsmaskinitnext; + /* Online CPUs for next grace period. 
*/ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ + bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ + /* exit RCU read-side critical sections */ + /* before propagating offline up the */ + /* rcu_node tree? */ struct rcu_node *parent; struct list_head blkd_tasks; /* Tasks blocked in RCU read-side critical */ @@ -172,6 +179,9 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ + struct rt_mutex boost_mtx; + /* Used only for the priority-boosting */ + /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ struct task_struct *boost_kthread_task; @@ -249,9 +259,12 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -332,13 +345,25 @@ struct rcu_data { #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). 
*/ + atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ + atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ + struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ + struct rcu_head **nocb_follower_tail; wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; - bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + + /* The following fields are used by the leader, hence own cacheline. */ + struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; + /* CBs waiting for GP. */ + struct rcu_head **nocb_gp_tail; + bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ + struct rcu_data *nocb_next_follower; + /* Next follower in wakeup chain. */ + + /* The following fields are used by the follower, hence new cacheline. */ + struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; + /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -357,6 +382,11 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +/* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOGP_WAKE_NOT 0 +#define RCU_NOGP_WAKE 1 +#define RCU_NOGP_WAKE_FORCE 2 + #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ /* jiffies_till_next_fqs. */ @@ -426,8 +456,6 @@ struct rcu_state { long qlen; /* Total number of callbacks. */ /* End of fields guarded by orphan_lock. */ - struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ @@ -457,10 +485,14 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. 
*/ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -483,13 +515,6 @@ extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - /* * RCU implementation internal declarations: */ @@ -499,10 +524,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -515,27 +540,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); +static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct 
rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -546,15 +560,18 @@ static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ +static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); +static void rcu_cleanup_after_idle(void); +static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); 
@@ -563,14 +580,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags); -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_spawn_all_nocb_kthreads(int cpu); +static void __init rcu_spawn_nocb_kthreads(void); +#ifdef CONFIG_RCU_NOCB_CPU +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_enter(int irq); +static void rcu_sysidle_exit(int irq); static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); @@ -579,22 +600,21 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static void rcu_dynticks_task_enter(void); +static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ +/* Read out queue lengths for tracing. 
*/ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; -} +#ifdef CONFIG_RCU_NOCB_CPU + *ql = atomic_long_read(&rdp->nocb_q_count); + *qll = atomic_long_read(&rdp->nocb_q_count_lazy); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ *ql = 0; *qll = 0; -} #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +} #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb186b8..8c0ec0f5a027 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -30,19 +30,25 @@ #include <linux/smpboot.h> #include "../time/tick-internal.h" -#define RCU_KTHREAD_PRIO 1 - #ifdef CONFIG_RCU_BOOST -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else -#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO -#endif + +#include "../locking/rtmutex_common.h" + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ -static char __initdata nocb_buf[NR_CPUS * 5]; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* @@ -52,73 +58,43 @@ static char __initdata nocb_buf[NR_CPUS * 5]; */ static void __init rcu_bootup_announce_oddness(void) { -#ifdef CONFIG_RCU_TRACE - pr_info("\tRCU debugfs-based tracing is enabled.\n"); -#endif -#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); -#endif -#ifdef CONFIG_RCU_FANOUT_EXACT - pr_info("\tHierarchical RCU autobalancing is disabled.\n"); -#endif -#ifdef CONFIG_RCU_FAST_NO_HZ - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); -#endif -#ifdef CONFIG_PROVE_RCU - pr_info("\tRCU lockdep checking is enabled.\n"); -#endif -#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - pr_info("\tRCU torture testing starts during boot.\n"); -#endif -#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); -#endif -#if defined(CONFIG_RCU_CPU_STALL_INFO) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); -#endif -#if NUM_RCU_LVL_4 != 0 - pr_info("\tFour-level hierarchy is enabled.\n"); -#endif + if (IS_ENABLED(CONFIG_RCU_TRACE)) + pr_info("\tRCU debugfs-based tracing is enabled.\n"); + if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || + (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); + if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) + pr_info("\tRCU torture testing starts during boot.\n"); + if 
(IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) + pr_info("\tAdditional per-CPU info printed with stalls.\n"); + if (NUM_RCU_LVL_4 != 0) + pr_info("\tFour-level hierarchy is enabled.\n"); + if (CONFIG_RCU_FANOUT_LEAF != 16) + pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", + CONFIG_RCU_FANOUT_LEAF); if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_NOCB_CPU -#ifndef CONFIG_RCU_NOCB_CPU_NONE - if (!have_rcu_nocb_mask) { - zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); - have_rcu_nocb_mask = true; - } -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - if (have_rcu_nocb_mask) { - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - } -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + if (IS_ENABLED(CONFIG_RCU_BOOST)) + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); } -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node 
*rnp, + bool wake); /* * Tell them what RCU they are running. @@ -130,42 +106,24 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - */ -static void rcu_preempt_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + * As with the other rcu_*_qs() functions, callers to this function + * must disable preemption. + */ +static void rcu_preempt_qs(void) +{ + if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_preempt"), + __this_cpu_read(rcu_preempt_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + current->rcu_read_unlock_special.b.need_qs = false; + } } /* @@ -181,7 +139,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. 
*/ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -189,14 +147,14 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_node *rnp; if (t->rcu_read_lock_nesting > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; /* @@ -217,7 +175,7 @@ static void rcu_preempt_note_context_switch(int cpu) * But first, note that the current CPU must still be * on line! */ - WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); @@ -238,7 +196,7 @@ static void rcu_preempt_note_context_switch(int cpu) : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special) { + t->rcu_read_unlock_special.s) { /* * Complete exit from RCU read-side critical section on @@ -256,9 +214,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. 
*/ - local_irq_save(flags); - rcu_preempt_qs(cpu); - local_irq_restore(flags); + rcu_preempt_qs(); } /* @@ -272,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) } /* - * Record a quiescent state for all tasks that were previously queued - * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with - * irqs disabled, and this lock is released upon return, but irqs remain - * disabled. - */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - struct rcu_node *rnp_p; - - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; /* Still need more quiescent states! */ - } - - rnp_p = rnp->parent; - if (rnp_p == NULL) { - /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. - */ - rcu_report_qs_rsp(&rcu_preempt_state, flags); - return; - } - - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); -} - -/* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. */ @@ -324,22 +243,31 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, } /* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
*/ void rcu_read_unlock_special(struct task_struct *t) { - int empty; - int empty_exp; - int empty_exp_now; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; + bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; - int special; + union rcu_special special; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -349,26 +277,34 @@ void rcu_read_unlock_special(struct task_struct *t) /* * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. + * let it know that we have done so. Because irqs are disabled, + * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) { - rcu_preempt_qs(smp_processor_id()); - if (!t->rcu_read_unlock_special) { + if (special.b.need_qs) { + rcu_preempt_qs(); + t->rcu_read_unlock_special.b.need_qs = false; + if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } } /* Hardware IRQ handlers cannot block, complain if they get here. */ - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { + if (in_irq() || in_serving_softirq()) { + lockdep_rcu_suspicious(__FILE__, __LINE__, + "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); + pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + t->rcu_read_unlock_special.s, + t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; } /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + if (special.b.blocked) { + t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. 
The @@ -383,7 +319,7 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -398,11 +334,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ - if (t->rcu_boost_mutex) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - } + /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -412,7 +345,7 @@ void rcu_read_unlock_special(struct task_struct *t) * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, @@ -420,15 +353,16 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rnp, flags); + rcu_report_unblock_qs_rnp(&rcu_preempt_state, + rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (rbmp) - rt_mutex_unlock(rbmp); + if (drop_boost_mutex) + rt_mutex_unlock(&rnp->boost_mtx); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -442,8 +376,6 @@ void rcu_read_unlock_special(struct task_struct *t) } } -#ifdef CONFIG_RCU_CPU_STALL_VERBOSE - /* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. 
@@ -478,14 +410,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - #ifdef CONFIG_RCU_CPU_STALL_INFO static void rcu_print_task_stall_begin(struct rcu_node *rnp) @@ -546,101 +470,11 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. 
Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. 
- */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -648,17 +482,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * * Caller must disable hard irqs. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(cpu); + rcu_preempt_qs(); return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; + __this_cpu_read(rcu_preempt_data.qs_pending) && + !__this_cpu_read(rcu_preempt_data.passed_quiesce)) + t->rcu_read_unlock_special.b.need_qs = true; } #ifdef CONFIG_RCU_BOOST @@ -701,7 +536,7 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else wait_rcu_gp(call_rcu); @@ -746,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Most callers will set the "wake" flag, but the task initiating the - * expedited grace period need not wake itself. - * * Caller must hold sync_rcu_preempt_exp_mutex. 
*/ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, @@ -783,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure. If there are no such - * tasks, report it up the rcu_node hierarchy. + * grace period for the specified rcu_node structure, phase 1. If there + * are such tasks, set the ->expmask bits up the rcu_node tree and also + * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 + * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex and must exclude - * CPU hotplug operations. + * Caller must hold sync_rcu_preempt_exp_mutex. */ static void -sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; - int must_wait = 0; + unsigned long mask; + struct rcu_node *rnp_up; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + WARN_ON_ONCE(rnp->expmask); + WARN_ON_ONCE(rnp->exp_tasks); + if (!rcu_preempt_has_tasks(rnp)) { + /* No blocked tasks, nothing to do. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { + return; + } + /* Call for Phase 2 and propagate ->expmask bits up the tree. */ + rnp->expmask = 1; + rnp_up = rnp; + while (rnp_up->parent) { + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + if (rnp_up->expmask & mask) + break; + raw_spin_lock(&rnp_up->lock); /* irqs already off */ + smp_mb__after_unlock_lock(); + rnp_up->expmask |= mask; + raw_spin_unlock(&rnp_up->lock); /* irqs still off */ + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure, phase 2. 
If the + * leaf rcu_node structure has its ->expmask field set, check for tasks. + * If there are some, clear ->expmask and set ->exp_tasks accordingly, + * then initiate RCU priority boosting. Otherwise, clear ->expmask and + * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, + * enabling rcu_read_unlock_special() to do the bit-clearing. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void +sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!rnp->expmask) { + /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* Phase 1 is over. */ + rnp->expmask = 0; + + /* + * If there are still blocked tasks, set up ->exp_tasks so that + * rcu_read_unlock_special() will wake us and then boost them. + */ + if (rcu_preempt_has_tasks(rnp)) { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - must_wait = 1; + return; } - if (!must_wait) - rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ + + /* No longer any blocked tasks, so undo bit setting. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_exp_rnp(rsp, rnp, false); } /** @@ -819,15 +707,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) * In fact, if you are using synchronize_rcu_expedited() in a loop, * please restructure your code to batch your updates, and then Use a * single synchronize_rcu() instead. - * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. 
*/ void synchronize_rcu_expedited(void) { - unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; unsigned long snap; @@ -845,7 +727,11 @@ void synchronize_rcu_expedited(void) * being boosted. This simplifies the process of moving tasks * from leaf to root rcu_node structures. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU-hotplug operation in flight, fall back to normal GP. */ + wait_rcu_gp(call_rcu); + return; + } /* * Acquire lock, falling back to synchronize_rcu() if too many @@ -874,19 +760,16 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ - rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - - /* Snapshot current state of ->blkd_tasks lists. */ + /* + * Snapshot current state of ->blkd_tasks lists into ->expmask. + * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() + * to start clearing them. Doing this in one phase leads to + * strange races between setting and clearing bits, so just say "no"! + */ rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init(rsp, rnp); - if (NUM_RCU_NODES > 1) - sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + sync_rcu_preempt_exp_init1(rsp, rnp); + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init2(rsp, rnp); put_online_cpus(); @@ -897,7 +780,8 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ smp_mb(); /* ensure expedited GP seen before counter increment. 
*/ - ACCESS_ONCE(sync_rcu_preempt_exp_count)++; + ACCESS_ONCE(sync_rcu_preempt_exp_count) = + sync_rcu_preempt_exp_count + 1; unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: @@ -941,11 +825,11 @@ void exit_rcu(void) return; t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); } -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *rcu_state_p = &rcu_sched_state; @@ -959,19 +843,10 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { } @@ -984,16 +859,14 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +/* + * Because there is no preemptible RCU, there can be no readers blocked. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + return false; } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. 
@@ -1021,28 +894,11 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { } @@ -1056,20 +912,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). 
@@ -1095,7 +937,7 @@ void exit_rcu(void) { } -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1105,7 +947,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; @@ -1149,11 +991,11 @@ static void rcu_wake_cond(struct task_struct *t, int status) static int rcu_boost(struct rcu_node *rnp) { unsigned long flags; - struct rt_mutex mtx; struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1200,11 +1042,11 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; + rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); raw_spin_unlock_irqrestore(&rnp->lock, flags); - rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + /* Lock only for side effect: boosts task t's priority. */ + rt_mutex_lock(&rnp->boost_mtx); + rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; @@ -1256,6 +1098,7 @@ static int rcu_boost_kthread(void *arg) * about it going away. */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { struct task_struct *t; @@ -1323,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. 
*/ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; unsigned long flags; @@ -1333,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; - if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; rsp->boost = 1; @@ -1347,7 +1190,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_BOOST_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1364,7 +1207,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } @@ -1426,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu) static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rnp->qsmaskinit; + unsigned long mask = rcu_rnp_online_cpus(rnp); cpumask_var_t cm; int cpu; @@ -1437,12 +1280,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1457,26 +1296,19 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = { }; /* - * Spawn all kthreads -- called as 
soon as the scheduler is running. + * Spawn boost kthreads -- called as soon as the scheduler is running. */ -static int __init rcu_spawn_kthreads(void) +static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; int cpu; - rcu_scheduler_fully_active = 1; for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } - return 0; + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } -early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { @@ -1491,6 +1323,7 @@ static void rcu_prepare_kthreads(int cpu) #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1513,12 +1346,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static int __init rcu_scheduler_really_started(void) +static void __init rcu_spawn_boost_kthreads(void) { - rcu_scheduler_fully_active = 1; - return 0; } -early_initcall(rcu_scheduler_really_started); static void rcu_prepare_kthreads(int cpu) { @@ -1538,10 +1368,10 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu, NULL); + return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1549,7 +1379,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up * after it. 
*/ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { } @@ -1557,7 +1387,7 @@ static void rcu_cleanup_after_idle(int cpu) * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { } @@ -1619,7 +1449,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* Exit early if we advanced recently. */ if (jiffies == rdtp->last_advance_all) - return 0; + return false; rdtp->last_advance_all = jiffies; for_each_rcu_flavor(rsp) { @@ -1631,7 +1461,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); @@ -1650,15 +1481,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *dj) +int rcu_needs_cpu(unsigned long *dj) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { *dj = ULONG_MAX; return 0; } @@ -1692,12 +1523,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) * * The caller must have disabled interrupts. 
*/ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; int tne; @@ -1705,7 +1536,7 @@ static void rcu_prepare_for_idle(int cpu) /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu, NULL)) + if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1714,7 +1545,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* @@ -1738,7 +1569,7 @@ static void rcu_prepare_for_idle(int cpu) return; rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; @@ -1757,10 +1588,10 @@ static void rcu_prepare_for_idle(int cpu) * any grace periods that elapsed while the CPU was idle, and if any * callbacks are now ready to invoke, initiate invocation. 
*/ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1842,7 +1673,7 @@ static int rcu_oom_notify(struct notifier_block *self, get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); + cond_resched_rcu_qs(); } put_online_cpus(); @@ -1924,11 +1755,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } @@ -2060,6 +1892,68 @@ bool rcu_is_nocb_cpu(int cpu) #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* + * Kick the leader kthread for this NOCB group. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + struct rcu_data *rdp_leader = rdp->nocb_leader; + + if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) + return; + if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { + /* Prior smp_mb__after_atomic() orders against prior enqueue. */ + ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; + wake_up(&rdp_leader->nocb_wq); + } +} + +/* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? 
+ */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU + struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ + + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load an posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. + */ + ret = atomic_long_read(&rdp->nocb_q_count); + +#ifdef CONFIG_PROVE_RCU + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && + rcu_scheduler_fully_active) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } +#endif /* #ifdef CONFIG_PROVE_RCU */ + + return !!ret; +} + +/* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy @@ -2078,10 +1972,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. 
*/ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); @@ -2093,19 +1989,28 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { - wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + /* ... if queue was empty ... */ + wake_nocb_leader(rdp, false); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = true; + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { - wake_up_process(t); /* ... or if many callbacks queued. */ + /* ... or if many callbacks queued. 
*/ + if (!irqs_disabled_flags(flags)) { + wake_nocb_leader(rdp, true); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvf")); + } else { + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvfIsDeferred")); + } rdp->qlen_last_fqs_check = LONG_MAX / 2; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } @@ -2126,7 +2031,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, { if (!rcu_is_nocb_cpu(rdp->cpu)) - return 0; + return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, @@ -2137,7 +2042,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_callback(rdp->rsp->name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); - return 1; + + /* + * If called from an extended quiescent state with interrupts + * disabled, invoke the RCU core in order to allow the idle-entry + * deferred-wakeup check to function. + */ + if (irqs_disabled_flags(flags) && + !rcu_is_watching() && + cpu_online(smp_processor_id())) + invoke_rcu_core(); + + return true; } /* @@ -2153,7 +2069,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* If this is not a no-CBs CPU, tell the caller to do it the old way. 
*/ if (!rcu_is_nocb_cpu(smp_processor_id())) - return 0; + return false; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2172,7 +2088,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; } - return 1; + return true; } /* @@ -2205,7 +2121,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); if (likely(d)) break; - flush_signals(current); + WARN_ON(signal_pending(current)); trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); @@ -2213,13 +2129,145 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) } /* + * Leaders come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_leader_wait(struct rcu_data *my_rdp) +{ + bool firsttime = true; + bool gotcbs; + struct rcu_data *rdp; + struct rcu_head **tail; + +wait_again: + + /* Wait for callbacks to appear. */ + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + wait_event_interruptible(my_rdp->nocb_wq, + !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); + /* Memory barrier handled by smp_mb() calls below and repoll. */ + } else if (firsttime) { + firsttime = false; /* Don't drown trace log with "Poll"! */ + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + } + + /* + * Each pass through the following loop checks a follower for CBs. + * We are our own first follower. Any CBs found are moved to + * nocb_gp_head, where they await a grace period. + */ + gotcbs = false; + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); + if (!rdp->nocb_gp_head) + continue; /* No CBs here, try next follower. */ + + /* Move callbacks to wait-for-GP list, which is empty. 
*/ + ACCESS_ONCE(rdp->nocb_head) = NULL; + rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + gotcbs = true; + } + + /* + * If there were no callbacks, sleep a bit, rescan after a + * memory barrier, and go retry. + */ + if (unlikely(!gotcbs)) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + "WokeEmpty"); + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(1); + + /* Rescan in case we were a victim of memory ordering. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) + if (ACCESS_ONCE(rdp->nocb_head)) { + /* Found CB, so short-circuit next wait. */ + my_rdp->nocb_leader_sleep = false; + break; + } + goto wait_again; + } + + /* Wait for one grace period. */ + rcu_nocb_wait_gp(my_rdp); + + /* + * We left ->nocb_leader_sleep unset to reduce cache thrashing. + * We set it now, but recheck for new callbacks while + * traversing our follower list. + */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ + + /* Each pass through the following loop wakes a follower, if needed. */ + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + if (ACCESS_ONCE(rdp->nocb_head)) + my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + if (!rdp->nocb_gp_head) + continue; /* No CBs, so no need to wake follower. */ + + /* Append callbacks to follower's "done" list. */ + tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + *tail = rdp->nocb_gp_head; + smp_mb__after_atomic(); /* Store *tail before wakeup. */ + if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { + /* + * List was empty, wake up the follower. + * Memory barriers supplied by atomic_long_add(). + */ + wake_up(&rdp->nocb_wq); + } + } + + /* If we (the leader) don't have CBs, go wait some more. 
*/ + if (!my_rdp->nocb_follower_head) + goto wait_again; +} + +/* + * Followers come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_follower_wait(struct rcu_data *rdp) +{ + bool firsttime = true; + + for (;;) { + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "FollowerSleep"); + wait_event_interruptible(rdp->nocb_wq, + ACCESS_ONCE(rdp->nocb_follower_head)); + } else if (firsttime) { + /* Don't drown trace log with "Poll"! */ + firsttime = false; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); + } + if (smp_load_acquire(&rdp->nocb_follower_head)) { + /* ^^^ Ensure CB invocation follows _head test. */ + return; + } + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + "WokeEmpty"); + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(1); + } +} + +/* * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. + * callbacks queued by the corresponding no-CBs CPU, however, there is + * an optional leader-follower relationship so that the grace-period + * kthreads don't have to do quite so many wakeups. */ static int rcu_nocb_kthread(void *arg) { int c, cl; - bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2227,44 +2275,23 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { - /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); - wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - /* Memory barrier provide by xchg() below. 
*/ - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } - list = ACCESS_ONCE(rdp->nocb_head); - if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); - schedule_timeout_interruptible(1); - flush_signals(current); - continue; - } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); + /* Wait for callbacks. */ + if (rdp->nocb_leader == rdp) + nocb_leader_wait(rdp); + else + nocb_follower_wait(rdp); - /* - * Extract queued callbacks, update counts, and wait - * for a grace period to elapse. - */ - ACCESS_ONCE(rdp->nocb_head) = NULL; - tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - c = atomic_long_xchg(&rdp->nocb_q_count, 0); - cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); - ACCESS_ONCE(rdp->nocb_p_count) += c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - rcu_nocb_wait_gp(rdp); + /* Pull the ready-to-invoke callbacks onto local list. */ + list = ACCESS_ONCE(rdp->nocb_follower_head); + BUG_ON(!list); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + ACCESS_ONCE(rdp->nocb_follower_head) = NULL; + tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2286,15 +2313,16 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) -= c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + smp_mb__before_atomic(); /* _add after CB invocation. */ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; } /* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/ -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return ACCESS_ONCE(rdp->nocb_defer_wakeup); } @@ -2302,11 +2330,69 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) /* Do a deferred wakeup of rcu_nocb_kthread(). */ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { + int ndw; + if (!rcu_nocb_need_deferred_wakeup(rdp)) return; - ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_up(&rdp->nocb_wq); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); + ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); + ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; + wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); +} + +void __init rcu_init_nohz(void) +{ + int cpu; + bool need_rcu_nocb_mask = true; + struct rcu_state *rsp; + +#ifdef CONFIG_RCU_NOCB_CPU_NONE + need_rcu_nocb_mask = false; +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + need_rcu_nocb_mask = true; +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } + have_rcu_nocb_mask = true; + } + if (!have_rcu_nocb_mask) + return; + +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tOffload RCU callbacks from CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tOffload RCU callbacks from all CPUs\n"); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if 
defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_rcu_flavor(rsp) { + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); + rcu_organize_nocb_kthreads(rsp); + } } /* Initialize per-rcu_data variables for no-CBs CPUs. */ @@ -2314,38 +2400,155 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; init_waitqueue_head(&rdp->nocb_wq); + rdp->nocb_follower_tail = &rdp->nocb_follower_head; } -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are + * brought online out of order, this can require re-organizing the + * leader-follower relationships. + */ +static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) { - int cpu; struct rcu_data *rdp; + struct rcu_data *rdp_last; + struct rcu_data *rdp_old_leader; + struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); struct task_struct *t; - if (rcu_nocb_mask == NULL) + /* + * If this isn't a no-CBs CPU or if it already has an rcuo kthread, + * then nothing to do. + */ + if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) return; + + /* If we didn't spawn the leader first, reorganize! 
*/ + rdp_old_leader = rdp_spawn->nocb_leader; + if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { + rdp_last = NULL; + rdp = rdp_old_leader; + do { + rdp->nocb_leader = rdp_spawn; + if (rdp_last && rdp != rdp_spawn) + rdp_last->nocb_next_follower = rdp; + if (rdp == rdp_spawn) { + rdp = rdp->nocb_next_follower; + } else { + rdp_last = rdp; + rdp = rdp->nocb_next_follower; + rdp_last->nocb_next_follower = NULL; + } + } while (rdp); + rdp_spawn->nocb_next_follower = rdp_old_leader; + } + + /* Spawn the kthread for this CPU and RCU flavor. */ + t = kthread_run(rcu_nocb_kthread, rdp_spawn, + "rcuo%c/%d", rsp->abbr, cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthreads, spawn them. + */ +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ + struct rcu_state *rsp; + + if (rcu_scheduler_fully_active) + for_each_rcu_flavor(rsp) + rcu_spawn_one_nocb_kthread(rsp, cpu); +} + +/* + * Once the scheduler is running, spawn rcuo kthreads for all online + * no-CBs CPUs. This assumes that the early_initcall()s happen before + * non-boot CPUs come online -- if this changes, we will need to add + * some mutual exclusion. + */ +static void __init rcu_spawn_nocb_kthreads(void) +{ + int cpu; + + for_each_online_cpu(cpu) + rcu_spawn_all_nocb_kthreads(cpu); +} + +/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_leader_stride = -1; +module_param(rcu_nocb_leader_stride, int, 0444); + +/* + * Initialize leader-follower relationships for all no-CBs CPU. + */ +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) +{ + int cpu; + int ls = rcu_nocb_leader_stride; + int nl = 0; /* Next leader. */ + struct rcu_data *rdp; + struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. 
*/ + struct rcu_data *rdp_prev = NULL; + + if (!have_rcu_nocb_mask) + return; + if (ls == -1) { + ls = int_sqrt(nr_cpu_ids); + rcu_nocb_leader_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure and + * spawns one rcu_nocb_kthread(). + */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, - "rcuo%c/%d", rsp->abbr, cpu); - BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp->nocb_kthread) = t; + if (rdp->cpu >= nl) { + /* New leader, set up for followers & next leader. */ + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_leader = rdp; + rdp_leader = rdp; + } else { + /* Another follower, link to previous leader. */ + rdp->nocb_leader = rdp_leader; + rdp_prev->nocb_next_follower = rdp; + } + rdp_prev = rdp; } } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ static bool init_nocb_callback_list(struct rcu_data *rdp) { - if (rcu_nocb_mask == NULL || - !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + + /* If there are early-boot callbacks, move them to nocb lists. */ + if (rdp->nxtlist) { + rdp->nocb_head = rdp->nxtlist; + rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; + atomic_long_set(&rdp->nocb_q_count, rdp->qlen); + atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); + rdp->nxtlist = NULL; + rdp->qlen = 0; + rdp->qlen_lazy = 0; + } rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. 
*/ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } @@ -2361,21 +2564,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags) { - return 0; + return false; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - return 0; + return false; } static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return false; } @@ -2384,7 +2587,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) { } @@ -2415,16 +2622,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE -/* - * Define RCU flavor that holds sysidle state. This needs to be the - * most active flavor of RCU. - */ -#ifdef CONFIG_PREEMPT_RCU -static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; -#else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ @@ -2438,9 +2635,14 @@ static int full_sysidle_state; /* Current system-idle state. */ * to detect full-system idle states, not RCU quiescent states and grace * periods. The caller must have disabled interrupts. 
*/ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { unsigned long j; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; /* Adjust nesting, check for fully idle. */ if (irq) { @@ -2505,8 +2707,14 @@ void rcu_sysidle_force_exit(void) * usermode execution does -not- count as idle here! The caller must * have disabled interrupts. */ -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + /* Adjust nesting, check for already non-idle. */ if (irq) { rdtp->dynticks_idle_nesting++; @@ -2552,7 +2760,8 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) /* * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts. + * does not count as idle. The caller must have disabled interrupts, + * and must be running on tick_do_timer_cpu. */ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) @@ -2561,16 +2770,20 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long j; struct rcu_dynticks *rdtp = rdp->dynticks; + /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ + if (!tick_nohz_full_enabled()) + return; + /* * If some other CPU has already reported non-idle, if this is * not the flavor of RCU that tracks sysidle state, or if this * is an offline or the timekeeping CPU, nothing to do. 
*/ - if (!*isidle || rdp->rsp != rcu_sysidle_state || + if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - if (rcu_gp_in_progress(rdp->rsp)) - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + /* Verify affinity of current kthread. */ + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. */ cur = atomic_read(&rdtp->dynticks_idle); @@ -2592,7 +2805,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, */ static bool is_sysidle_rcu_state(struct rcu_state *rsp) { - return rsp == rcu_sysidle_state; + return rsp == rcu_state_p; } /* @@ -2670,7 +2883,7 @@ static void rcu_sysidle_cancel(void) static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, unsigned long maxj, bool gpkt) { - if (rsp != rcu_sysidle_state) + if (rsp != rcu_state_p) return; /* Wrong flavor, ignore. */ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) return; /* Running state machine from timekeeping CPU. */ @@ -2687,6 +2900,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + rcu_sysidle_report(rsp, isidle, maxj, true); } @@ -2713,7 +2930,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) /* * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. + * The caller must have disabled interrupts. This is not intended to be + * called unless tick_nohz_full_enabled(). */ bool rcu_sys_is_idle(void) { @@ -2739,13 +2957,12 @@ bool rcu_sys_is_idle(void) /* Scan all the CPUs looking for nonidle CPUs. 
*/ for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rdp = per_cpu_ptr(rcu_state_p->rda, cpu); rcu_sysidle_check_cpu(rdp, &isidle, &maxj); if (!isidle) break; } - rcu_sysidle_report(rcu_sysidle_state, - isidle, maxj, false); + rcu_sysidle_report(rcu_state_p, isidle, maxj, false); oldrss = rss; rss = ACCESS_ONCE(full_sysidle_state); } @@ -2772,7 +2989,7 @@ bool rcu_sys_is_idle(void) * provided by the memory allocator. */ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_sysidle_state) && + !rcu_gp_in_progress(rcu_state_p) && !rsh.inuse && xchg(&rsh.inuse, 1) == 0) call_rcu(&rsh.rh, rcu_sysidle_cb); return false; @@ -2788,11 +3005,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { } -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { } @@ -2843,12 +3060,31 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { -#ifdef CONFIG_NO_HZ_FULL - int cpu = ACCESS_ONCE(tick_do_timer_cpu); + int __maybe_unused cpu; - if (cpu < 0 || cpu >= nr_cpu_ids) + if (!tick_nohz_full_enabled()) return; - if (raw_smp_processor_id() != cpu) +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + cpu = tick_do_timer_cpu; + if (cpu >= 0 && cpu < nr_cpu_ids) set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + housekeeping_affine(current); +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +} + +/* Record the current task on dyntick-idle entry. 
*/ +static void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. */ +static void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -279,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, + seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, ".G"[rnp->gp_tasks != NULL], ".E"[rnp->exp_tasks != NULL], ".T"[!list_empty(&rnp->blkd_tasks)], diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bc7883570530..1f133350da01 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -47,6 +47,8 @@ #include <linux/hardirq.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/kthread.h> +#include <linux/tick.h> #define CREATE_TRACE_POINTS @@ -60,6 +62,63 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#ifndef CONFIG_TINY_RCU + +static atomic_t rcu_expedited_nesting = + ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); + +/* + * Should normal grace-period primitives be expedited? Intended for + * use within RCU. Note that this function takes the rcu_expedited + * sysfs/boot variable into account as well as the rcu_expedite_gp() + * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() + * returns false is a -really- bad idea. + */ +bool rcu_gp_is_expedited(void) +{ + return rcu_expedited || atomic_read(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); + +/** + * rcu_expedite_gp - Expedite future RCU grace periods + * + * After a call to this function, future calls to synchronize_rcu() and + * friends act as if the corresponding synchronize_rcu_expedited() function + * had instead been called. 
+ */ +void rcu_expedite_gp(void) +{ + atomic_inc(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_expedite_gp); + +/** + * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation + * + * Undo a prior call to rcu_expedite_gp(). If all prior calls to + * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), + * and if the rcu_expedited sysfs/boot parameter is not set, then all + * subsequent calls to synchronize_rcu() and friends will return to + * their normal non-expedited behavior. + */ +void rcu_unexpedite_gp(void) +{ + atomic_dec(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); + +#endif /* #ifndef CONFIG_TINY_RCU */ + +/* + * Inform RCU of the end of the in-kernel boot sequence. + */ +void rcu_end_inkernel_boot(void) +{ + if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) + rcu_unexpedite_gp(); +} + #ifdef CONFIG_PREEMPT_RCU /* @@ -90,11 +149,8 @@ void __rcu_read_unlock(void) } else { barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY - udelay(10); /* Make preemption more probable. */ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; @@ -140,6 +196,38 @@ int notrace debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** + * rcu_read_lock_held() - might we be in RCU read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, + * this assumes we are in an RCU read-side critical section unless it can + * prove otherwise. 
This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. + * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. + */ +int rcu_read_lock_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + return lock_is_held(&rcu_lock_map); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_held); + +/** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the @@ -168,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. +/** + * wakeme_after_rcu() - Callback function to awaken a task after grace period + * @head: Pointer to rcu_head member within rcu_synchronize structure + * + * Awaken the corresponding task now that a grace period has elapsed. 
*/ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -275,7 +360,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -350,3 +435,397 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context switch, + * user-space execution, and idle. As such, grace periods can take one good + * long time. There are no read-side primitives similar to rcu_read_lock() + * and rcu_read_unlock() because this implementation is intended to get + * the system into a safe state for some of the manipulations involved in + * tracing and the like. Finally, this implementation does not support + * high call_rcu_tasks() rates from multiple CPUs. If this is required, + * per-CPU callback lists will be needed. + */ + +/* Global list of callbacks and associated lock. */ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. 
*/ +static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +module_param(rcu_task_stall_timeout, int, 0644); + +static void rcu_spawn_tasks_kthread(void); + +/* + * Post an RCU-tasks callback. First call must be from process context + * after the scheduler is fully operational. + */ +void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + needwake = !rcu_tasks_cbs_head; + *rcu_tasks_cbs_tail = rhp; + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + if (needwake) { + rcu_spawn_tasks_kthread(); + wake_up(&rcu_tasks_cbs_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). 
In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + rcu_lockdep_assert(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +/* See if tasks are still holding out, complain if so. 
*/ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!ACCESS_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || + !ACCESS_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + ACCESS_ONCE(t->rcu_tasks_holdout) = false; + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + unsigned long lastreport; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current); + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. 
*/ + if (!list) { + wait_event_interruptible(rcu_tasks_cbs_wq, + rcu_tasks_cbs_head); + if (!rcu_tasks_cbs_head) { + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_sched() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_sched(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_sched() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_sched(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && ACCESS_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); + ACCESS_ONCE(t->rcu_tasks_holdout) = true; + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Wait for tasks that are in the process of exiting. + * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_sched() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. 
+ */ + lastreport = jiffies; + while (!list_empty(&rcu_tasks_holdouts)) { + bool firstreport; + bool needreport; + int rtst; + struct task_struct *t1; + + schedule_timeout_interruptible(HZ); + rtst = ACCESS_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && + time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, &firstreport); + cond_resched(); + } + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_sched() + * to force the needed ordering on all such CPUs. + * + * This synchronize_sched() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_sched() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. + */ + synchronize_sched(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + schedule_timeout_uninterruptible(HZ/10); + } +} + +/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ +static void rcu_spawn_tasks_kthread(void) +{ + static DEFINE_MUTEX(rcu_tasks_kthread_mutex); + static struct task_struct *rcu_tasks_kthread_ptr; + struct task_struct *t; + + if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { + smp_mb(); /* Ensure caller sees full kthread. 
*/ + return; + } + mutex_lock(&rcu_tasks_kthread_mutex); + if (rcu_tasks_kthread_ptr) { + mutex_unlock(&rcu_tasks_kthread_mutex); + return; + } + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + BUG_ON(IS_ERR(t)); + smp_mb(); /* Ensure others see full kthread. */ + ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; + mutex_unlock(&rcu_tasks_kthread_mutex); +} + +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_PROVE_RCU + +/* + * Early boot self test parameters, one for each flavor + */ +static bool rcu_self_test; +static bool rcu_self_test_bh; +static bool rcu_self_test_sched; + +module_param(rcu_self_test, bool, 0444); +module_param(rcu_self_test_bh, bool, 0444); +module_param(rcu_self_test_sched, bool, 0444); + +static int rcu_self_test_counter; + +static void test_callback(struct rcu_head *r) +{ + rcu_self_test_counter++; + pr_info("RCU test callback executed %d\n", rcu_self_test_counter); +} + +static void early_boot_test_call_rcu(void) +{ + static struct rcu_head head; + + call_rcu(&head, test_callback); +} + +static void early_boot_test_call_rcu_bh(void) +{ + static struct rcu_head head; + + call_rcu_bh(&head, test_callback); +} + +static void early_boot_test_call_rcu_sched(void) +{ + static struct rcu_head head; + + call_rcu_sched(&head, test_callback); +} + +void rcu_early_boot_tests(void) +{ + pr_info("Running RCU self tests\n"); + + if (rcu_self_test) + early_boot_test_call_rcu(); + if (rcu_self_test_bh) + early_boot_test_call_rcu_bh(); + if (rcu_self_test_sched) + early_boot_test_call_rcu_sched(); +} + +static int rcu_verify_early_boot_tests(void) +{ + int ret = 0; + int early_boot_test_counter = 0; + + if (rcu_self_test) { + early_boot_test_counter++; + rcu_barrier(); + } + if (rcu_self_test_bh) { + early_boot_test_counter++; + rcu_barrier_bh(); + } + if (rcu_self_test_sched) { + early_boot_test_counter++; + rcu_barrier_sched(); + } + + if (rcu_self_test_counter != early_boot_test_counter) { + WARN_ON(1); + ret = -1; + } + + return ret; +} 
+late_initcall(rcu_verify_early_boot_tests); +#else +void rcu_early_boot_tests(void) {} +#endif /* CONFIG_PROVE_RCU */ diff --git a/kernel/reboot.c b/kernel/reboot.c index a3a9e240fcdb..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* + * Notifier list for kernel code which wants to be called + * to restart the system. + */ +static ATOMIC_NOTIFIER_HEAD(restart_handler_list); + +/** + * register_restart_handler - Register function to be called to reset + * the system + * @nb: Info about handler function to be called + * @nb->priority: Handler priority. Handlers should follow the + * following guidelines for setting priorities. + * 0: Restart handler of last resort, + * with limited restart capabilities + * 128: Default restart handler; use if no other + * restart handler is expected to be available, + * and/or if restart functionality is + * sufficient to restart the entire system + * 255: Highest priority restart handler, will + * preempt all other restart handlers + * + * Registers a function with code to be called to restart the + * system. + * + * Registered functions will be called from machine_restart as last + * step of the restart sequence (if the architecture specific + * machine_restart function calls do_kernel_restart - see below + * for details). + * Registered functions are expected to restart the system immediately. + * If more than one function is registered, the restart handler priority + * selects which function will be called first. + * + * Restart handlers are expected to be registered from non-architecture + * code, typically from drivers. A typical use case would be a system + * where restart functionality is provided through a watchdog. Multiple + * restart handlers may exist; for example, one restart handler might + * restart the entire system, while another only restarts the CPU. 
+ * In such cases, the restart handler which only restarts part of the + * hardware is expected to register with low priority to ensure that + * it only runs if no other means to restart the system is available. + * + * Currently always returns zero, as atomic_notifier_chain_register() + * always returns zero. + */ +int register_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&restart_handler_list, nb); +} +EXPORT_SYMBOL(register_restart_handler); + +/** + * unregister_restart_handler - Unregister previously registered + * restart handler + * @nb: Hook to be unregistered + * + * Unregisters a previously registered restart handler function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&restart_handler_list, nb); +} +EXPORT_SYMBOL(unregister_restart_handler); + +/** + * do_kernel_restart - Execute kernel restart handler call chain + * + * Calls functions registered with register_restart_handler. + * + * Expected to be called from machine_restart as last step of the restart + * sequence. + * + * Restarts the system immediately if a restart handler function has been + * registered. Otherwise does nothing. 
+ */ +void do_kernel_restart(char *cmd) +{ + atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); +} + void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ @@ -306,8 +387,9 @@ void ctrl_alt_del(void) } char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +static const char reboot_cmd[] = "/sbin/reboot"; -static int __orderly_poweroff(bool force) +static int run_cmd(const char *cmd) { char **argv; static char *envp[] = { @@ -316,8 +398,7 @@ static int __orderly_poweroff(bool force) NULL }; int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + argv = argv_split(GFP_KERNEL, cmd, NULL); if (argv) { ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); @@ -325,8 +406,33 @@ static int __orderly_poweroff(bool force) ret = -ENOMEM; } + return ret; +} + +static int __orderly_reboot(void) +{ + int ret; + + ret = run_cmd(reboot_cmd); + + if (ret) { + pr_warn("Failed to start orderly reboot: forcing the issue\n"); + emergency_sync(); + kernel_restart(NULL); + } + + return ret; +} + +static int __orderly_poweroff(bool force) +{ + int ret; + + ret = run_cmd(poweroff_cmd); + if (ret && force) { pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -355,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); * This may be called from any context to trigger a system shutdown. * If the orderly shutdown fails, it will force an immediate shutdown. 
*/ -int orderly_poweroff(bool force) +void orderly_poweroff(bool force) { if (force) /* do not override the pending "true" */ poweroff_force = true; schedule_work(&poweroff_work); - return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); +static void reboot_work_func(struct work_struct *work) +{ + __orderly_reboot(); +} + +static DECLARE_WORK(reboot_work, reboot_work_func); + +/** + * orderly_reboot - Trigger an orderly system reboot + * + * This may be called from any context to trigger a system reboot. + * If the orderly reboot fails, it will force an immediate reboot. + */ +void orderly_reboot(void) +{ + schedule_work(&reboot_work); +} +EXPORT_SYMBOL_GPL(orderly_reboot); + static int __init reboot_setup(char *str) { for (;;) { diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov <xemul@openvz.org> - * - */ - -#include <linux/types.h> -#include <linux/parser.h> -#include <linux/fs.h> -#include <linux/res_counter.h> -#include <linux/uaccess.h> -#include <linux/mm.h> - -void res_counter_init(struct res_counter *counter, struct res_counter *parent) -{ - spin_lock_init(&counter->lock); - counter->limit = RES_COUNTER_MAX; - counter->soft_limit = RES_COUNTER_MAX; - counter->parent = parent; -} - -static u64 res_counter_uncharge_locked(struct res_counter *counter, - unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - -static int res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force) -{ - int ret = 0; - - if (counter->usage + val > counter->limit) { - counter->failcnt++; - ret = -ENOMEM; - if (!force) - return ret; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return ret; -} - 
-static int __res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at, bool force) -{ - int ret, r; - unsigned long flags; - struct res_counter *c, *u; - - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val, force); - spin_unlock(&c->lock); - if (r < 0 && !ret) { - ret = r; - *limit_fail_at = c; - if (!force) - break; - } - } - - if (ret < 0 && !force) { - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); - } - } - local_irq_restore(flags); - - return ret; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, false); -} - -int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, true); -} - -u64 res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) -{ - unsigned long flags; - struct res_counter *c; - u64 ret = 0; - - local_irq_save(flags); - for (c = counter; c != top; c = c->parent) { - u64 r; - spin_lock(&c->lock); - r = res_counter_uncharge_locked(c, val); - if (c == counter) - ret = r; - spin_unlock(&c->lock); - } - local_irq_restore(flags); - return ret; -} - -u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - return res_counter_uncharge_until(counter, NULL, val); -} - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return &counter->failcnt; - case RES_SOFT_LIMIT: - 
return &counter->soft_limit; - }; - - BUG(); - return NULL; -} - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -#if BITS_PER_LONG == 32 -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&counter->lock, flags); - ret = *res_counter_member(counter, member); - spin_unlock_irqrestore(&counter->lock, flags); - - return ret; -} -#else -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} -#endif - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *resp) -{ - char *end; - unsigned long long res; - - /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ - if (*buf == '-') { - int rc = kstrtoull(buf + 1, 10, &res); - - if (rc) - return rc; - if (res != 1) - return -EINVAL; - *resp = RES_COUNTER_MAX; - return 0; - } - - res = memparse(buf, &end); - if (*end != '\0') - return -EINVAL; - - if (PAGE_ALIGN(res) >= res) - res = PAGE_ALIGN(res); - else - res = RES_COUNTER_MAX; - - *resp = res; - - return 0; -} diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -22,6 +22,7 @@ #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> +#include <linux/resource_ext.h> #include <asm/io.h> @@ -59,10 +60,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void 
*v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +73,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +332,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. 
*/ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -339,9 +352,12 @@ static int find_next_system_ram(struct resource *res, char *name) end = res->end; BUG_ON(start >= end); + if (first_level_children_only) + sibling_only = true; + read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +369,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +382,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. 
+ * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +463,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -411,6 +492,42 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +/* + * Search for a resouce entry that fully contains the specified region. + * If found, return 1 if it is RAM, 0 if not. + * If not found, or region is not fully contained, return -1 + * + * Used by the ioremap functions to ensure the user is not remapping RAM and is + * a vast speed up over walking through the resource table page by page. 
+ */ +int region_is_ram(resource_size_t start, unsigned long size) +{ + struct resource *p; + resource_size_t end = start + size - 1; + int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + const char *name = "System RAM"; + int ret = -1; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + if (end < p->start) + continue; + + if (p->start <= start && end <= p->end) { + /* resource fully contains region */ + if ((p->flags != flags) || strcmp(p->name, name)) + ret = 0; + else + ret = 1; + break; + } + if (p->end < start) + break; /* not found */ + } + read_unlock(&resource_lock); + return ret; +} + void __weak arch_remove_reservations(struct resource *avail) { } @@ -917,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) * * request_region creates a new busy region. * - * check_region returns non-zero if the area is already busy. - * * release_region releases a matching busy region. */ @@ -981,36 +1096,6 @@ struct resource * __request_region(struct resource *parent, EXPORT_SYMBOL(__request_region); /** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. 
- */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - free_resource(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - -/** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor * @start: resource start address @@ -1165,6 +1250,76 @@ int release_mem_region_adjustable(struct resource *parent, /* * Managed region resource */ +static void devm_resource_release(struct device *dev, void *ptr) +{ + struct resource **r = ptr; + + release_resource(*r); +} + +/** + * devm_request_resource() - request and reserve an I/O or memory resource + * @dev: device for which to request the resource + * @root: root of the resource tree from which to request the resource + * @new: descriptor of the resource to request + * + * This is a device-managed version of request_resource(). There is usually + * no need to release resources requested by this function explicitly since + * that will be taken care of when the device is unbound from its driver. + * If for some reason the resource needs to be released explicitly, because + * of ordering issues for example, drivers must call devm_release_resource() + * rather than the regular release_resource(). + * + * When a conflict is detected between any existing resources and the newly + * requested resource, an error message will be printed. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new) +{ + struct resource *conflict, **ptr; + + ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + *ptr = new; + + conflict = request_resource_conflict(root, new); + if (conflict) { + dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", + new, conflict->name, conflict); + devres_free(ptr); + return -EBUSY; + } + + devres_add(dev, ptr); + return 0; +} +EXPORT_SYMBOL(devm_request_resource); + +static int devm_resource_match(struct device *dev, void *res, void *data) +{ + struct resource **ptr = res; + + return *ptr == data; +} + +/** + * devm_release_resource() - release a previously requested resource + * @dev: device for which to release the resource + * @new: descriptor of the resource to release + * + * Releases a resource previously requested using devm_request_resource(). + */ +void devm_release_resource(struct device *dev, struct resource *new) +{ + WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, + new)); +} +EXPORT_SYMBOL(devm_release_resource); + struct region_devres { struct resource *parent; resource_size_t start; @@ -1343,6 +1498,30 @@ int iomem_is_exclusive(u64 addr) return err; } +struct resource_entry *resource_list_create_entry(struct resource *res, + size_t extra_size) +{ + struct resource_entry *entry; + + entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); + if (entry) { + INIT_LIST_HEAD(&entry->node); + entry->res = res ? 
res : &entry->__res; + } + + return entry; +} +EXPORT_SYMBOL(resource_list_create_entry); + +void resource_list_free(struct list_head *head) +{ + struct resource_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, node) + resource_list_destroy_entry(entry); +} +EXPORT_SYMBOL(resource_list_free); + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..46be87024875 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = -pg +CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..eae160dd669d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) * so we don't have to move tasks around upon policy change, * or flail around trying to allocate bandwidth on the fly. * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. + * the policy change to proceed. */ free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) if (tg != &root_task_group) return false; - if (p->sched_class != &fair_sched_class) - return false; - /* * We can only assume the task group can't go away on us if * autogroup_move_group() can see us on ->thread_group list. 
@@ -148,11 +144,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); static inline struct sched_clock_data *this_scd(void) { - return &__get_cpu_var(sched_clock_data); + return this_cpu_ptr(&sched_clock_data); } static inline struct sched_clock_data *cpu_sdc(int cpu) @@ -420,3 +420,16 @@ u64 local_clock(void) EXPORT_SYMBOL_GPL(cpu_clock); EXPORT_SYMBOL_GPL(local_clock); + +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +u64 __weak running_clock(void) +{ + return local_clock(); +} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..8d0f35debf35 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. 
It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. @@ -267,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. + */ + if (!READ_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; @@ -287,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; + if (!READ_ONCE(x->done)) + return false; - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + /* + * If ->done, we need to wait for complete() to release ->wait.lock + * otherwise we can end up freeing the completion before complete() + * is done referencing it. + * + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders + * the loads of ->done and ->wait.lock such that we cannot observe + * the lock before complete() acquires it while observing the ->done + * after it's acquired the lock. 
+ */ + smp_rmb(); + spin_unlock_wait(&x->wait.lock); + return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b33449..f9123a82cbb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,22 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#ifdef smp_mb__before_atomic -void __smp_mb__before_atomic(void) -{ - smp_mb__before_atomic(); -} -EXPORT_SYMBOL(__smp_mb__before_atomic); -#endif - -#ifdef smp_mb__after_atomic -void __smp_mb__after_atomic(void) -{ - smp_mb__after_atomic(); -} -EXPORT_SYMBOL(__smp_mb__after_atomic); -#endif - void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -135,10 +119,14 @@ void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update > 0) + lockdep_assert_held(&rq->lock); + + if (rq->clock_skip_update & RQCF_ACT_SKIP) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; rq->clock += delta; update_rq_clock_task(rq, delta); } @@ -243,6 +231,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp; int i; + struct inode *inode; if (cnt > 63) cnt = 63; @@ -253,7 +242,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; cmp = strstrip(buf); + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + mutex_lock(&inode->i_mutex); i = sched_feat_set(cmp); + mutex_unlock(&inode->i_mutex); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -313,59 +306,8 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * __task_rq_lock - lock the rq @p resides on. 
- */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; /* * this_rq_lock - lock this runqueue and disable interrupts. @@ -442,7 +384,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); @@ -485,6 +435,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. 
+ */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -587,30 +542,31 @@ static bool set_nr_if_polling(struct task_struct *p) #endif /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq) { + struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&rq->lock); - if (test_tsk_need_resched(p)) + if (test_tsk_need_resched(curr)) return; - cpu = task_cpu(p); + cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(p); + set_tsk_need_resched(curr); set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(p)) + if (set_nr_and_not_polling(curr)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -623,7 +579,7 @@ void resched_cpu(int cpu) if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; - resched_task(cpu_curr(cpu)); + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -684,10 +640,16 @@ static void wake_up_idle_cpu(int cpu) static bool wake_up_full_nohz_cpu(int cpu) { + /* + * We just need the target to call irq_exit() and re-evaluate + * the next tick. The nohz full kick at least implies that. + * If needed we can still optimize that later with an + * empty IRQ. + */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) - smp_send_reschedule(cpu); + tick_nohz_full_kick_cpu(cpu); return true; } @@ -730,18 +692,32 @@ static inline bool got_nohz_idle_kick(void) #ifdef CONFIG_NO_HZ_FULL bool sched_can_stop_tick(void) { - struct rq *rq; + /* + * FIFO realtime policy runs the highest priority task. 
Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; - rq = this_rq(); + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = ¤t->rt; - /* Make sure rq->nr_running update is visible after the IPI */ - smp_rmb(); + return rt_se->run_list.prev == rt_se->run_list.next; + } - /* More than one running task need preemption */ - if (rq->nr_running > 1) - return false; + /* + * More than one running task need preemption. + * nr_running update is assumed to be visible + * after IPI is sent from wakers. + */ + if (this_rq()->nr_running > 1) + return false; - return true; + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -999,6 +975,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1006,6 +985,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1022,7 +1002,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (class == rq->curr->sched_class) break; if (class == p->sched_class) { - resched_task(rq->curr); + resched_curr(rq); break; } } @@ -1032,8 +1012,15 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. 
*/ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + rq_clock_skip_update(rq, true); +} + +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); } #ifdef CONFIG_SMP @@ -1045,7 +1032,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1066,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -1077,7 +1072,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1203,7 +1198,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1241,7 +1236,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); 
ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1273,7 +1268,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1398,7 +1393,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1467,7 +1463,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1526,7 +1522,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1568,9 +1564,7 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) - && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -1587,7 +1581,6 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. 
*/ irq_enter(); - tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1612,6 +1605,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +out: + rcu_read_unlock(); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1734,7 +1751,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -1768,6 +1785,24 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
* @@ -1783,6 +1818,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1790,11 +1828,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + init_dl_task_timer(&p->dl); + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -1817,12 +1852,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; - INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1969,6 +2002,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -1977,6 +2012,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -1994,25 +2031,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw 
*dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2020,6 +2038,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2087,7 +2108,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2180,7 +2201,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2192,10 +2212,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. 
*/ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2235,6 +2261,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2269,29 +2296,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); - -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); -#endif + if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2325,21 +2345,14 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. 
- */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2358,6 +2371,18 @@ unsigned long nr_running(void) return sum; } +/* + * Check if only the current task is running on the cpu. + */ +bool single_task_running(void) +{ + if (cpu_rq(smp_processor_id())->nr_running == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + unsigned long long nr_context_switches(void) { int i; @@ -2385,6 +2410,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* @@ -2422,39 +2454,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - if (task_current(rq, p)) { - update_rq_clock(rq); - ns = rq_clock_task(rq) - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -2463,7 +2462,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns = 0; + u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* @@ -2474,13 +2473,24 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we race with it leaving cpu, we'll take a lock. 
So we're correct. * If we race with it entering cpu, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. */ - if (!p->on_cpu) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &flags); return ns; @@ -2638,6 +2648,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. Otherwise whine @@ -2727,6 +2740,10 @@ again: * - explicit schedule() call * - return from syscall or exception to user-space * - return from interrupt-handler to user-space + * + * WARNING: all callers must re-check need_resched() afterward and reschedule + * accordingly in case an event triggered the need for rescheduling (such as + * an interrupt waking up a task) while preemption was disabled in __schedule(). 
*/ static void __sched __schedule(void) { @@ -2735,11 +2752,10 @@ static void __sched __schedule(void) struct rq *rq; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); prev = rq->curr; schedule_debug(prev); @@ -2755,6 +2771,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2779,36 +2797,27 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev)) update_rq_clock(rq); next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->skip_clock_update = 0; + rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. 
- */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); post_schedule(rq); sched_preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) @@ -2828,7 +2837,9 @@ asmlinkage __visible void __sched schedule(void) struct task_struct *tsk = current; sched_submit_work(tsk); - __schedule(); + do { + __schedule(); + } while (need_resched()); } EXPORT_SYMBOL(schedule); @@ -2840,10 +2851,14 @@ asmlinkage __visible void __sched schedule_user(void) * or we have been woken up remotely but the IPI has not yet arrived, * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != CONTEXT_USER, but that will trigger + * too frequently to make sense yet. */ - user_exit(); + enum ctx_state prev_state = exception_enter(); schedule(); - user_enter(); + exception_exit(prev_state); } #endif @@ -2859,6 +2874,21 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +static void __sched notrace preempt_schedule_common(void) +{ + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + } while (need_resched()); +} + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -2874,20 +2904,51 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; + preempt_schedule_common(); +} +NOKPROBE_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + do { __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. 
*/ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); barrier(); } while (need_resched()); } -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -2944,7 +3005,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2971,15 +3032,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); - p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -2991,8 +3051,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || (p->pi_top_task && - dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + if (!dl_prio(p->normal_prio) || + (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; p->dl.dl_throttled = 0; enqueue_flag = ENQUEUE_REPLENISH; @@ -3008,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } @@ -3015,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3026,7 +3089,7 @@ 
out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3047,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); @@ -3057,14 +3120,14 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_curr(rq); } out_unlock: task_rq_unlock(rq, p, &flags); @@ -3192,23 +3255,45 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. 
+ * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) { int policy = attr->sched_policy; - if (policy == -1) /* setparam */ + if (policy == SETPARAM_POLICY) policy = p->policy; p->policy = policy; @@ -3317,13 +3402,27 @@ static bool check_same_owner(struct task_struct *p) return match; } +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3445,7 +3544,7 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; - if (dl_policy(policy)) + if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; p->sched_reset_on_fork = reset_on_fork; @@ -3520,19 +3619,19 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). @@ -3557,10 +3656,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, .sched_nice = PRIO_TO_NICE(p->static_prio), }; - /* - * Fixup the legacy SCHED_RESET_ON_FORK hack - */ - if (policy & SCHED_RESET_ON_FORK) { + /* Fixup the legacy SCHED_RESET_ON_FORK hack. 
*/ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; @@ -3730,7 +3827,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { - return do_sched_setscheduler(pid, -1, param); + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); } /** @@ -3958,14 +4055,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -3978,13 +4075,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. */ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; - goto out_unlock; + rcu_read_unlock(); + goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: @@ -4002,7 +4100,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); @@ -4138,17 +4236,10 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static void __cond_resched(void) -{ - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); -} - int __sched _cond_resched(void) { if (should_resched()) { - __cond_resched(); + preempt_schedule_common(); return 1; } return 0; @@ -4173,7 +4264,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); if 
(resched) - __cond_resched(); + preempt_schedule_common(); else cpu_relax(); ret = 1; @@ -4189,7 +4280,7 @@ int __sched __cond_resched_softirq(void) if (should_resched()) { local_bh_enable(); - __cond_resched(); + preempt_schedule_common(); local_bh_disable(); return 1; } @@ -4285,7 +4376,7 @@ again: * fairness. */ if (preempt && rq != p_rq) - resched_task(p_rq->curr); + resched_curr(p_rq); } out_unlock: @@ -4304,36 +4395,29 @@ EXPORT_SYMBOL_GPL(yield_to); * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - long __sched io_schedule_timeout(long timeout) { - struct rq *rq = raw_rq(); + int old_iowait = current->in_iowait; + struct rq *rq; long ret; + current->in_iowait = 1; + if (old_iowait) + blk_schedule_flush_plug(current); + else + blk_flush_plug(current); + delayacct_blkio_start(); + rq = raw_rq(); atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->in_iowait = old_iowait; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); + return ret; } +EXPORT_SYMBOL(io_schedule_timeout); /** * sys_sched_get_priority_max - return maximum RT priority. @@ -4444,9 +4528,10 @@ void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned state; + unsigned long state = p->state; - state = p->state ? __ffs(p->state) + 1 : 0; + if (state) + state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 @@ -4463,8 +4548,10 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + ppid = 0; rcu_read_lock(); - ppid = task_pid_nr(rcu_dereference(p->real_parent)); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), ppid, @@ -4486,7 +4573,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4494,7 +4581,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -4549,7 +4636,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4569,10 +4656,115 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + if (!cpumask_weight(cur)) + return ret; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we 
don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. 
+ */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -4626,14 +4818,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4654,20 +4847,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). 
*/ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4676,16 +4869,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->on_rq) { - dequeue_task(rq_src, p, 0); - set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } @@ -4717,22 +4906,22 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -4752,6 +4941,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 
+ */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; @@ -5160,31 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long flags; - long cpu = (long)hcpu; - switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active(cpu, false); - - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); - bool overflow; - int cpus; - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - if (overflow) - return notifier_from_errno(-EBUSY); - } + set_cpu_active((long)hcpu, false); return NOTIFY_OK; + default: + return NOTIFY_DONE; } - - return NOTIFY_DONE; } static int __init migration_init(void) @@ -5232,9 +5409,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; - char str[256]; - cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -5247,7 +5422,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " @@ -5266,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. 
- */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5292,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - - printk(KERN_CONT " %s", str); + printk(KERN_CONT " %*pbl", + cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); @@ -5650,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -5720,7 +5881,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5731,10 +5892,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). 
*/ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5744,10 +5905,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); @@ -5762,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5985,7 +6144,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6071,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ @@ -6157,7 +6319,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6172,6 +6334,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference 
between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6225,6 +6437,10 @@ static void sched_init_numa(void) if (!sched_debug()) break; } + + if (!level) + return; + /* * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). 
@@ -6304,6 +6520,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -6465,6 +6684,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + } set_domain_attribute(sd, attr); @@ -6765,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, */ case CPU_ONLINE: - case CPU_DOWN_FAILED: cpuset_update_active_cpus(true); break; default: @@ -6777,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + unsigned long flags; + long cpu = (long)hcpu; + struct dl_bw *dl_b; + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (overflow) + return notifier_from_errno(-EBUSY); + } cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: @@ -6865,9 +7119,6 @@ void __init sched_init(void) #ifdef 
CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -6887,13 +7138,13 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } +#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6926,8 +7177,8 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); - init_dl_rq(&rq->dl, rq); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6967,7 +7218,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -7005,6 +7256,11 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* * Make us the idle thread. 
Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7014,11 +7270,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ @@ -7042,6 +7293,24 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ @@ -7061,6 +7330,9 @@ void __might_sleep(const char *file, int line, int preempt_offset) in_atomic(), irqs_disabled(), current->pid, current->comm); + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -7073,7 +7345,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7084,15 +7356,15 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); - resched_task(rq->curr); + resched_curr(rq); } check_class_changed(rq, p, prev_class, old_prio); @@ -7104,12 +7376,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7124,21 +7396,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7278,36 +7545,40 @@ void sched_offline_group(struct task_group *tg) void 
sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgrp_id, - lockdep_is_held(&tsk->sighand->siglock)), + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); @@ -7325,10 +7596,16 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) return 1; - } while_each_thread(g, p); + } return 0; } @@ -7417,6 +7694,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg, { int i, err = 0; + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. 
*/ + if (rt_period == 0) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); @@ -7473,9 +7761,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) rt_period = (u64)rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; - if (rt_period == 0) - return -EINVAL; - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -7532,11 +7817,12 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_constraints(void) +static int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; @@ -7550,13 +7836,16 @@ static int sched_dl_global_constraints(void) * solutions is welcome! */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } @@ -7567,6 +7856,7 @@ static int sched_dl_global_constraints(void) static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7580,11 +7870,14 @@ static void sched_dl_do_global(void) * FIXME: As above... 
*/ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } } @@ -7625,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, if (ret) goto undo; - ret = sched_rt_global_constraints(); + ret = sched_dl_global_validate(); if (ret) goto undo; - ret = sched_dl_global_constraints(); + ret = sched_rt_global_constraints(); if (ret) goto undo; @@ -7714,6 +8007,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -7803,6 +8101,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Prevent race between setting of cfs_rq->runtime_enabled and + * unthrottle_offline_cfs_rqs(). 
+ */ + get_online_cpus(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7828,7 +8131,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) } raw_spin_unlock_irq(&cfs_b->lock); - for_each_possible_cpu(i) { + for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; @@ -7844,6 +8147,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); + put_online_cpus(); return ret; } @@ -7959,7 +8263,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -7970,7 +8274,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = quota; return 0; } @@ -8080,10 +8384,11 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .base_cftypes = cpu_files, + .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .base_cftypes = files, + .legacy_cftypes = files, .early_init = 1, }; diff --git 
a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && @@ -188,6 +187,26 @@ out: } /* + * cpudl_set_freecpu - Set the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_set_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_set_cpu(cpu, cp->free_cpus); +} + +/* + * cpudl_clear_freecpu - Clear the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_clear_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_clear_cpu(cpu, cp->free_cpus); +} + +/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ @@ -205,7 +224,7 @@ int cpudl_init(struct cpudl *cp) if (!cp->elements) return -ENOMEM; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { kfree(cp->elements); return -ENOMEM; } @@ -213,8 +232,6 @@ int cpudl_init(struct cpudl *cp) for_each_possible_cpu(i) cp->elements[i].idx = IDX_INVALID; - cpumask_setall(cp->free_cpus); - return 0; } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -24,10 +24,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); +void 
cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#else -#define cpudl_set(cp, cpu, dl) do { } while (0) -#define cpudl_init() do { } while (0) #endif /* CONFIG_SMP */ #endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#else -#define cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) #endif #endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; + /* Attempt a lockless read on the first round. 
*/ + nextseq = 0; do { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } @@ -550,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. 
- */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1258f..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) dl_b->total_bw = 0; } -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->rb_root = RB_ROOT; @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. 
+ */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -306,7 +352,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * the overrunning entity can't interfere with other entity in the system and * can't make them miss their deadlines. Reasons why this kind of overruns * could happen are, typically, a entity voluntarily trying to overcome its - * runtime, or it just underestimated it during sched_setscheduler_ex(). + * runtime, or it just underestimated it during sched_setattr(). */ static void replenish_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -350,6 +396,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -506,47 +557,76 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); + unsigned long flags; struct rq *rq; -again: - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (rq != task_rq(p)) { - /* Task was moved, retrying. */ - raw_spin_unlock(&rq->lock); - goto again; - } + rq = task_rq_lock(p, &flags); /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). 
+ * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (p->on_rq) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_task(rq->curr); + #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } #endif + + /* + * If the throttle happened during sched-out; like: + * + * schedule() + * deactivate_task() + * dequeue_task_dl() + * update_curr_dl() + * start_dl_timer() + * __dequeue_task_dl() + * prev->on_rq = 0; + * + * We can be both throttled and !queued. Replenish the counter + * but do not enqueue -- wait for our wakeup to do that. + */ + if (!task_on_rq_queued(p)) { + replenish_dl_entity(dl_se, dl_se); + goto unlock; } + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. 
+ */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif unlock: - raw_spin_unlock(&rq->lock); + task_rq_unlock(rq, p, &flags); return HRTIMER_NORESTART; } @@ -555,11 +635,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -567,24 +642,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -625,16 +683,15 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) - resched_task(curr); + resched_curr(rq); } /* @@ -823,10 +880,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. 
*/ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } @@ -847,8 +904,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted @@ -856,7 +924,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. */ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); @@ -901,7 +969,14 @@ static void yield_task_dl(struct rq *rq) rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } + update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. 
+ */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP @@ -914,7 +989,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -964,7 +1039,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; - resched_task(rq->curr); + resched_curr(rq); } static int pull_dl_task(struct rq *this_rq); @@ -979,7 +1054,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -997,10 +1072,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); +} +#else /* !CONFIG_SCHED_HRTICK */ +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ } #endif @@ -1030,7 +1106,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. */ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1055,10 +1131,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) /* Running task will never be pushed. 
*/ dequeue_pushable_dl_task(rq, p); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); -#endif set_post_schedule(rq); @@ -1077,10 +1151,14 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); -#ifdef CONFIG_SCHED_HRTICK - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + /* + * Even when we have runtime, update_curr_dl() might have resulted in us + * not being the leftmost task anymore. In that case NEED_RESCHED will + * be set and schedule() will start a new hrtick for the next task. + */ + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); -#endif } static void task_fork_dl(struct task_struct *p) @@ -1100,6 +1178,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! */ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1124,10 +1203,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } @@ -1158,7 +1235,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); int best_cpu, cpu = task_cpu(task); @@ -1169,6 +1246,10 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. 
+ */ best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1257,7 +1338,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1378,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1311,6 +1393,7 @@ static int push_dl_task(struct rq *rq) { struct task_struct *next_task; struct rq *later_rq; + int ret = 0; if (!rq->dl.overloaded) return 0; @@ -1333,7 +1416,7 @@ retry: if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1356,7 +1439,6 @@ retry: * The task is still there. We don't try * again, some other cpu will pull it when ready. 
*/ - dequeue_pushable_dl_task(rq, next_task); goto out; } @@ -1372,15 +1454,16 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + ret = 1; - resched_task(later_rq->curr); + resched_curr(later_rq); double_unlock_balance(rq, later_rq); out: put_task_struct(next_task); - return 1; + return ret; } static void push_dl_tasks(struct rq *rq) @@ -1443,7 +1526,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1486,7 +1569,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1495,10 +1578,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). 
@@ -1515,8 +1621,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1540,6 +1644,7 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } @@ -1551,6 +1656,7 @@ static void rq_offline_dl(struct rq *rq) dl_clear_overload(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } void init_sched_dl_class(void) @@ -1564,20 +1670,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + /* XXX we should retain the bw until 0-lag */ + cancel_dl_timer(rq, p); + __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. 
*/ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1588,22 +1722,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) { int check_resched = 1; - /* - * If p is throttled, don't consider the possibility - * of preempting rq->curr, the check will be done right - * after its runtime will get replenished. - */ - if (unlikely(p->dl.dl_throttled)) - return; - - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } @@ -1614,7 +1745,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately @@ -1632,14 +1763,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* * Again, we don't know if p has a earlier * or later deadline, so let's blindly set a * (maybe not needed) rescheduling point. 
*/ - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else switched_to_dl(rq, p); @@ -1673,4 +1804,15 @@ const struct sched_class dl_sched_class = { .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, + + .update_curr = update_curr_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.running_avg_sum); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN @@ -150,7 +152,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,16 +160,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, 
flags); + } + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -217,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -264,6 +265,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -302,6 +309,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); + PN(clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); @@ -332,10 +340,9 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -533,8 +540,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? 
(task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); @@ -633,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d3335e1f..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/cpuidle.h> #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> @@ -665,20 +666,22 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -724,6 +727,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity 
*se) { @@ -826,11 +834,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -865,7 +874,6 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; - struct list_head task_list; struct rcu_head rcu; nodemask_t active_nodes; @@ -893,18 +901,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. 
+ */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -912,14 +926,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. 
+ */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -928,11 +1007,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. 
*/ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -940,15 +1020,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) + return 0; + + total_faults = p->numa_group->total_faults; + + if (!total_faults) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1038,7 +1132,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,9 +1157,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = 
min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -1077,6 +1175,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1096,32 +1195,59 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; + long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. + * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. */ - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); + orig_src_load = env->src_stats.load; - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. 
+ * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* @@ -1136,15 +1262,33 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; + int dist = env->dist; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); + + /* + * Because we have preemption enabled we can get migrated around and + * end try selecting ourselves (current == env->p) as a swap candidate. + */ + if (cur == env->p) + goto unlock; /* * "imp" is the fault differential for the source task between the @@ -1163,8 +1307,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. 
*/ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1177,26 +1321,21 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; @@ -1204,20 +1343,34 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* * In the overloaded case, try and keep the load balanced. 
*/ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; if (cur) { load = task_h_load(cur); @@ -1225,10 +1378,16 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. 
+ */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: @@ -1266,7 +1425,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1294,40 +1453,51 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. 
+ */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1336,8 +1506,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. 
*/ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an @@ -1365,7 +1546,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1415,12 +1596,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of @@ -1441,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. 
In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -1477,7 +1660,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1505,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -1514,6 +1697,94 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. 
+ */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group = NODE_MASK_NONE; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. 
*/ + if (!max_faults) + break; + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1541,18 +1812,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1562,21 +1838,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. 
*/ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1595,30 +1877,17 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. 
- */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = preferred_group_nid(p, max_group_nid); } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } @@ -1652,7 +1921,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * @@ -1661,11 +1929,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; - list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); } @@ -1720,13 +1987,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += 
p->total_numa_faults; - list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; grp->nr_tasks++; @@ -1746,27 +2012,23 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; - list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1788,29 +2050,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. 
- */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1849,9 +2097,11 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -1930,8 +2180,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma)) { continue; + } /* * Shared library pages mapped by multiple processes are not @@ -2195,8 +2447,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (n<PERIOD) + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n<PERIOD) * * To achieve constant time decay_load. */ @@ -2266,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2294,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2307,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta_w; delta -= delta_w; @@ -2317,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta * scale_freq + >> 
SCHED_CAPACITY_SHIFT; + sa->avg_period += delta; return decayed; } @@ -2342,11 +2607,13 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2361,6 +2628,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; @@ -2379,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2432,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2450,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = 
scale_load(contrib); } @@ -2469,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2485,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2497,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2583,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += 
se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2601,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -2899,7 +3201,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2923,7 +3225,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -2938,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3063,7 +3366,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. 
*/ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3254,7 +3557,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3360,7 +3663,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3410,14 +3717,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3448,7 +3756,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3494,22 +3802,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. 
- */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. */ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3517,10 +3820,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3631,10 +3934,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3645,7 +3947,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } @@ -3771,10 +4073,27 @@ void 
__start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} + static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -3788,6 +4107,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * there's some valid quota amount */ cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here. + */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3831,6 +4156,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -3854,17 +4180,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. 
- */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } @@ -4049,10 +4367,15 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -4178,7 +4501,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * tg->shares) / W; + wl = (w * (long)tg->shares) / W; else wl = tg->shares; @@ -4241,8 +4564,8 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4285,47 +4608,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. 
*/ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; + balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. 
- */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* @@ -4393,20 +4699,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. + */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } /* @@ -4453,6 +4785,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. 
The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. + * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -4475,14 +4834,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -4669,7 +5022,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible 
from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -4723,7 +5076,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -4904,7 +5257,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } set_skip_buddy(se); @@ -5077,28 +5430,18 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; /* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. - */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - -/* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5108,7 +5451,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5118,7 +5461,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < 
(s64)sysctl_sched_migration_cost; } @@ -5130,7 +5473,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5169,7 +5512,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -5218,6 +5561,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5272,28 +5618,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. 
*/ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } - return 1; } @@ -5302,47 +5636,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + +/* + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. 
*/ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + detach_task(p, env); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). + * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is detach_tasks) + * so we can safely collect stats here rather than + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; } static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. 
*/ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5373,14 +5723,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5400,13 +5752,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); - return pulled; + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); + + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. 
+ */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. + */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5525,6 +5922,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5534,12 +5938,12 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? 
*/ - int group_has_free_capacity; + enum group_type group_type; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5576,6 +5980,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5608,35 +6014,23 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return SCHED_CAPACITY_SCALE; -} + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) -{ - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -5645,52 +6039,35 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; 
- } else { - available = total - avg; - } - - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + used = div_u64(avg, total); - total >>= SCHED_CAPACITY_SHIFT; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - } - - sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); + capacity *= arch_scale_cpu_capacity(sd, cpu); else - capacity *= default_scale_capacity(sd, cpu); + capacity *= default_scale_cpu_capacity(sd, cpu); capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -5705,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5717,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5737,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. 
- * - * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -5760,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -5830,31 +6188,62 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. 
+ * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. + */ +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running < sgs->group_weight) + return true; + + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; + + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. 
*/ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running <= sgs->group_weight) + return false; + + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + return false; +} - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_no_capacity) + return group_overloaded; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if (sg_imbalanced(group)) + return group_imbalanced; - return capacity_factor; + return group_other; } /** @@ -5864,10 +6253,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. 
*/ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5884,7 +6275,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->group_usage += get_cpu_usage(i); + sgs->sum_nr_running += rq->cfs.h_nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5903,11 +6299,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity_factor = sg_capacity_factor(env, group); - - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -5928,13 +6321,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5942,8 +6341,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. 
*/ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -5995,6 +6393,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6015,24 +6414,28 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). 
*/ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6049,6 +6452,13 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6179,7 +6589,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6199,17 +6609,17 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + /* + * If there aren't any idle cpus, avoid creating some. 
+ */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6272,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6288,16 +6699,16 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* - * If the local group is more busy than the selected busiest group + * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ if (local->avg_load >= busiest->avg_load) @@ -6312,13 +6723,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. 
The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* @@ -6351,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6380,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6390,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* @@ -6438,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. 
+ */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6490,7 +6914,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, @@ -6501,6 +6925,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6536,6 +6961,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6545,28 +6973,33 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); - local_irq_restore(flags); + cur_ld_moved = detach_tasks(&env); /* - * some other cpu did the load balance for us. + * We've detached some tasks from busiest_rq. Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. 
*/ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + + local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -6616,10 +7049,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6630,7 +7061,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6695,7 +7126,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -6704,6 +7135,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. 
+ */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -6767,7 +7215,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -6864,6 +7313,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6883,9 +7333,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6906,16 +7353,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } @@ -7219,22 +7672,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. 
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7248,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } @@ -7295,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? 
CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* @@ -7325,6 +7791,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) @@ -7398,7 +7866,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7413,7 +7881,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7423,7 +7891,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7438,11 +7906,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. 
*/ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7469,15 +7937,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7486,7 +7954,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } @@ -7523,7 +7991,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7542,7 +8010,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7553,14 +8021,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 
*/ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP @@ -7783,6 +8251,8 @@ const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .update_curr = update_curr_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. 
+ */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> +#include <linux/suspend.h> #include <asm/tlb.h> @@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!tif_need_resched() && + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -79,7 +81,8 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool broadcast; + unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -103,25 +106,37 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in iterrupts (if any). In that case bypass + * the cpuidle governor and go stratight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. 
*/ - next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { local_irq_enable(); - else - arch_cpu_idle(); + goto exit_idle; + } - goto exit_idle; + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); } - + /* Fall back to the default arch idle method on errors. */ + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -135,7 +150,7 @@ use_default: goto exit_idle; } - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; /* * Tell the time framework to switch to a broadcast timer @@ -143,11 +158,11 @@ use_default: * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + if (broadcast && tick_broadcast_enter()) goto use_default; - trace_cpu_idle_rcuidle(next_state, dev->cpu); + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); /* * Enter the idle state previously returned by the governor decision. @@ -156,15 +171,17 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + /* The cpu is no longer idle or about to enter idle. 
*/ + idle_set_state(this_rq(), NULL); if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + tick_broadcast_exit(); /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); @@ -177,8 +194,23 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } +DEFINE_PER_CPU(bool, cpu_dead_idle); + /* * Generic idle loop implementation * @@ -203,8 +235,13 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(smp_processor_id())) { + rcu_cpu_notify(NULL, CPU_DYING_IDLE, + (void *)(long)smp_processor_id()); + smp_mb(); /* all activity before dead. */ + this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); + } local_irq_disable(); arch_cpu_idle_enter(); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..c65dac8c97cd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) */ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->idle); + resched_curr(rq); } static struct task_struct * @@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task return 0; } +static void update_curr_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, + .update_curr = update_curr_idle, }; diff --git a/kernel/sched/proc.c 
b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a49083192c64..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include <linux/slab.h> +#include <linux/irq_work.h> int sched_rr_timeslice = RR_TIMESLICE; @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +#ifdef CONFIG_SMP +static void push_irq_work_func(struct irq_work *work); +#endif + +void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; int i; @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); #endif +#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } @@ -463,9 +475,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; 
@@ -476,7 +489,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + resched_curr(rq); } } @@ -566,7 +579,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) return; enqueue_top_rt_rq(rt_rq); - resched_task(rq->curr); + resched_curr(rq); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -740,6 +753,9 @@ balanced: rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); + + /* Make rt_rq available for pick_next_task() */ + sched_rt_rq_enqueue(rt_rq); } } @@ -827,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled check_preempt_curr() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. 
*/ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_skip_update(rq, false); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -948,7 +967,7 @@ static void update_curr_rt(struct rq *rq) raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -1297,9 +1316,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (p->nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1336,7 +1352,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); @@ -1347,23 +1368,29 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. 
+ */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule * to try and push current away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } #endif /* CONFIG_SMP */ @@ -1374,7 +1401,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1444,7 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. */ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1464,8 +1491,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); @@ -1522,7 +1548,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1608,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. 
+ */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* @@ -1620,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1654,7 +1690,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1690,7 +1726,7 @@ retry: * just reschedule current. */ if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1737,7 +1773,7 @@ retry: activate_task(lowest_rq, next_task, 0); ret = 1; - resched_task(lowest_rq->curr); + resched_curr(lowest_rq); double_unlock_balance(rq, lowest_rq); @@ -1754,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) ; } +#ifdef HAVE_RT_PUSH_IPI +/* + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. + * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. + */ +static int rto_next_cpu(struct rq *rq) +{ + int prev_cpu = rq->rt.push_cpu; + int cpu; + + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + + /* + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. + */ + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + + } else if (cpu >= nr_cpu_ids) { + /* + * We passed the end of the mask, start at the beginning. 
+ * If the result is greater or equal to the rq's CPU, then + * the loop is finished. + */ + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; + + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; +} + +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu; + + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. 
+ */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } + + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; + + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + + irq_work_queue_on(&rq->rt.push_work, cpu); +} + +/* Called from hardirq context */ +static void try_to_push_tasks(void *arg) +{ + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; + int cpu; + + this_cpu = rt_rq->push_cpu; + + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); + push_rt_task(rq); + raw_spin_unlock(&rq->lock); + } + + /* Pass the IPI to the next rt overloaded queue */ + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. + */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } + + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. 
+ */ + if (unlikely(cpu == rq->cpu)) + goto again; + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); +} +#endif /* HAVE_RT_PUSH_IPI */ + static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; @@ -1769,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) */ smp_rmb(); +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return 0; + } +#endif + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1805,7 +2006,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1866,7 +2067,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1932,11 +2133,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) - resched_task(rq->curr); + resched_curr(rq); } void __init init_sched_rt_class(void) @@ -1966,7 +2167,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1974,7 +2175,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -1985,7 +2186,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { @@ -2003,11 +2204,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * Only reschedule if p is still on the same runqueue. */ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* For UP simply resched on drop of prio */ if (oldprio < p->prio) - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else { /* @@ -2016,7 +2217,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * then reschedule. 
*/ if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -2069,7 +2270,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } @@ -2125,6 +2326,8 @@ const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + + .update_curr = update_curr_rt, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02ebc54e..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/irq_work.h> #include <linux/tick.h> #include <linux/slab.h> @@ -14,6 +15,11 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -126,6 +132,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: @@ -168,6 +177,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -184,7 +212,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 
runtime_expires; int idle, timer_active; @@ -335,8 +363,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; @@ -391,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* RT IPI pull logic requires IRQ_WORK */ +#ifdef CONFIG_IRQ_WORK +# define HAVE_RT_PUSH_IPI +#endif + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -408,7 +447,13 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; #endif +#endif /* CONFIG_SMP */ int rt_queued; int rt_throttled; @@ -477,6 +522,9 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* Indicate more than one runnable task for any CPU */ + bool overload; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). 
@@ -528,8 +576,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; - /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -558,6 +604,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; + unsigned int clock_skip_update; u64 clock; u64 clock_task; @@ -568,6 +615,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ @@ -633,6 +681,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -644,25 +697,62 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) +#define this_rq() this_cpu_ptr(&runqueues) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(&runqueues) + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + rq->clock_skip_update |= RQCF_REQ_SKIP; + else + rq->clock_skip_update &= ~RQCF_REQ_SKIP; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type 
sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -736,7 +826,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -884,20 +974,10 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct static_key *key) -{ - return static_key_true(key); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct static_key *key) -{ - return static_key_false(key); /* Out of line branch. 
*/ -} - #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ - return static_branch__##enabled(key); \ + return static_key_##enabled(key); \ } #include "features.h" @@ -949,6 +1029,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -960,7 +1049,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -998,35 +1086,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. 
- */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ @@ -1142,6 +1201,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serialized by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1150,6 +1214,8 @@ struct sched_class { unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); + void (*update_curr) (struct rq *rq); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif @@ -1187,6 +1253,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1196,7 +1286,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; @@ -1218,15 +1308,26 @@ static inline void add_nr_running(struct rq
*rq, unsigned count) rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(rq->cpu)) { - /* Order rq->nr_running write against the IPI */ - smp_wmb(); - smp_send_reschedule(rq->cpu); + /* + * Tick is needed if more than one task runs on a CPU. + * Send the target an IPI to kick it out of nohz mode. + * + * We assume that IPI implies full memory barrier and the + * new value of rq->nr_running is visible on reception + * from the target. + */ + tick_nohz_full_kick_cpu(rq->cpu); } - } #endif + } } static inline void sub_nr_running(struct rq *rq, unsigned count) @@ -1286,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else @@ -1298,6 +1408,82 @@ static inline void sched_avg_update(struct rq *rq) { } extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 
+ */ +static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +static inline void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1482,10 +1668,11 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); extern void 
cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index a476bea17fbc..87e2c9f0c33e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -15,11 +15,6 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; if (v == (void *)1) { seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); @@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); + seq_printf(seq, "domain%d %*pb", dcount++, + cpumask_pr_args(sched_domain_span(sd))); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", @@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v) rcu_read_unlock(); #endif } - kfree(mask_str); return 0; } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..79ffec45a6ac 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); @@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) return 0; } +static void update_curr_stop(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU stop tasks: */ @@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, + .update_curr = update_curr_stop, }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20ae657b..852143a79f36 100644 --- 
a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. 
+ */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; @@ -319,14 +385,14 @@ EXPORT_SYMBOL(wake_bit_function); */ int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { int ret = 0; do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); + ret = (*action)(&q->key); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -334,7 +400,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -343,9 +409,21 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + wait.key.timeout = jiffies + timeout; + return __wait_on_bit(wq, 
&wait, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { do { int ret; @@ -353,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(q->key.flags); + ret = action(&q->key); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -365,7 +443,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -502,3 +580,45 @@ void wake_up_atomic_t(atomic_t *p) __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); } EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + io_schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + 
io_schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc24739c..4f44028943e6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,15 +18,18 @@ #include <linux/compat.h> #include <linux/sched.h> #include <linux/seccomp.h> +#include <linux/slab.h> +#include <linux/syscalls.h> -/* #define SECCOMP_DEBUG 1 */ +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +#include <asm/syscall.h> +#endif #ifdef CONFIG_SECCOMP_FILTER -#include <asm/syscall.h> #include <linux/filter.h> +#include <linux/pid.h> #include <linux/ptrace.h> #include <linux/security.h> -#include <linux/slab.h> #include <linux/tracehook.h> #include <linux/uaccess.h> @@ -54,7 +57,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - struct sk_filter *prog; + struct bpf_prog *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -87,7 +90,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) * @filter: filter to verify * @flen: length of filter * - * Takes a previously checked filter (by sk_chk_filter) and + * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. @@ -170,53 +173,189 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(int syscall) +static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f; - struct seccomp_data sd; + struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); + struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. 
*/ - if (WARN_ON(current->seccomp.filter == NULL)) + if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - populate_seccomp_data(&sd); + /* Make sure cross-thread synced filter points somewhere sane. */ + smp_read_barrier_depends(); + + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + for (; f; f = f->prev) { + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } return ret; } +#endif /* CONFIG_SECCOMP_FILTER */ + +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ + assert_spin_locked(&current->sighand->siglock); + + if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) + return false; + + return true; +} + +static inline void seccomp_assign_mode(struct task_struct *task, + unsigned long seccomp_mode) +{ + assert_spin_locked(&task->sighand->siglock); + + task->seccomp.mode = seccomp_mode; + /* + * Make sure TIF_SECCOMP cannot be set before the mode (and + * filter) is set. + */ + smp_mb__before_atomic(); + set_tsk_thread_flag(task, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, + struct seccomp_filter *child) +{ + /* NULL is the root ancestor. */ + if (parent == NULL) + return 1; + for (; child; child = child->prev) + if (child == parent) + return 1; + return 0; +} /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held.
+ * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. + */ +static inline pid_t seccomp_can_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); + assert_spin_locked(&current->sighand->siglock); + + /* Validate all threads being eligible for synchronization. */ + caller = current; + for_each_thread(caller, thread) { + pid_t failed; + + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || + (thread->seccomp.mode == SECCOMP_MODE_FILTER && + is_ancestor(thread->seccomp.filter, + caller->seccomp.filter))) + continue; + + /* Return the first thread that cannot be synchronized. */ + failed = task_pid_vnr(thread); + /* If the pid cannot be resolved, then return -ESRCH */ + if (unlikely(WARN_ON(failed == 0))) + failed = -ESRCH; + return failed; + } + + return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ + struct task_struct *thread, *caller; + + BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); + assert_spin_locked(&current->sighand->siglock); + + /* Synchronize all threads. */ + caller = current; + for_each_thread(caller, thread) { + /* Skip current, since it needs no changes. */ + if (thread == caller) + continue; + + /* Get a task reference for the new leaf node. */ + get_seccomp_filter(caller); + /* + * Drop the task reference to the shared ancestor since + * current's path will hold a reference. (This also + * allows a put before the assignment.)
+ */ + put_seccomp_filter(thread); + smp_store_release(&thread->seccomp.filter, + caller->seccomp.filter); + /* + * Opt the other thread into seccomp if needed. + * As threads are considered to be trust-realm + * equivalent (see ptrace_may_access), it is safe to + * allow one thread to transition the other. + */ + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + } + } +} + +/** + * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure. */ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size = fprog->len * sizeof(struct sock_filter); - unsigned long total_insns = fprog->len; + unsigned long fp_size; struct sock_filter *fp; int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) - return -EINVAL; - - for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ - if (total_insns > MAX_INSNS_PER_PATH) - return -ENOMEM; + return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); + fp_size = fprog->len * sizeof(struct sock_filter); /* * Installing a seccomp filter requires that the task has @@ -224,14 +363,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. 
*/ - if (!current->no_new_privs && + if (!task_no_new_privs(current) && security_capable_noaudit(current_cred(), current_user_ns(), CAP_SYS_ADMIN) != 0) - return -EACCES; + return ERR_PTR(-EACCES); fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); if (!fp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ ret = -EFAULT; @@ -239,7 +378,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(fp, fprog->len); + ret = bpf_check_classic(fp, fprog->len); if (ret) goto free_prog; @@ -248,8 +387,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (ret) goto free_prog; - /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ - ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + /* Convert 'sock_filter' insns to 'bpf_insn' insns */ + ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) goto free_prog; @@ -260,48 +399,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(sk_filter_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; - ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; - sk_filter_select_runtime(filter->prog); + bpf_prog_select_runtime(filter->prog); - /* - * If there is an existing filter, make it the prev and don't drop its - * task reference. 
- */ - filter->prev = current->seccomp.filter; - current->seccomp.filter = filter; - return 0; + return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); free_filter: kfree(filter); free_prog: kfree(fp); - return ret; + return ERR_PTR(ret); } /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ -static long seccomp_attach_user_filter(char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; - long ret = -EFAULT; + struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (is_compat_task()) { @@ -314,9 +447,56 @@ static long seccomp_attach_user_filter(char __user *user_filter) #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; - ret = seccomp_attach_filter(&fprog); + filter = seccomp_prepare_filter(&fprog); out: - return ret; + return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags: flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Caller must be holding current->sighand->siglock lock. + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, + struct seccomp_filter *filter) +{ + unsigned long total_insns; + struct seccomp_filter *walker; + + assert_spin_locked(&current->sighand->siglock); + + /* Validate resulting filter length. */ + total_insns = filter->prog->len; + for (walker = current->seccomp.filter; walker; walker = walker->prev) + total_insns += walker->prog->len + 4; /* 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* If thread sync has been requested, check that it is possible. 
*/ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + int ret; + + ret = seccomp_can_sync_threads(); + if (ret) + return ret; + } + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + + /* Now that the new filter is in place, synchronize to all threads. */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + seccomp_sync_threads(); + + return 0; } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -329,6 +509,14 @@ void get_seccomp_filter(struct task_struct *tsk) atomic_inc(&orig->usage); } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_free(filter->prog); + kfree(filter); + } +} + /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ void put_seccomp_filter(struct task_struct *tsk) { @@ -337,8 +525,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - sk_filter_free(freeme->prog); - kfree(freeme); + seccomp_filter_free(freeme); } } @@ -380,92 +567,185 @@ static int mode1_syscalls_32[] = { }; #endif -int __secure_computing(int this_syscall) +static void __secure_computing_strict(int this_syscall) +{ + int *syscall_whitelist = mode1_syscalls; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + syscall_whitelist = mode1_syscalls_32; +#endif + do { + if (*syscall_whitelist == this_syscall) + return; + } while (*++syscall_whitelist); + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + do_exit(SIGKILL); +} + +#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER +void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - int exit_sig = 0; - int *syscall; - u32 ret; + + if (mode == 0) + return; + else if (mode == SECCOMP_MODE_STRICT) + __secure_computing_strict(this_syscall); + else 
+ BUG(); +} +#else +int __secure_computing(void) +{ + u32 phase1_result = seccomp_phase1(NULL); + + if (likely(phase1_result == SECCOMP_PHASE1_OK)) + return 0; + else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) + return -1; + else + return seccomp_phase2(phase1_result); +} + +#ifdef CONFIG_SECCOMP_FILTER +static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +{ + u32 filter_ret, action; + int data; + + /* + * Make sure that any changes to mode from another thread have + * been seen after TIF_SECCOMP was seen. + */ + rmb(); + + filter_ret = seccomp_run_filters(sd); + data = filter_ret & SECCOMP_RET_DATA; + action = filter_ret & SECCOMP_RET_ACTION; + + switch (action) { + case SECCOMP_RET_ERRNO: + /* Set low-order bits as an errno, capped at MAX_ERRNO. */ + if (data > MAX_ERRNO) + data = MAX_ERRNO; + syscall_set_return_value(current, task_pt_regs(current), + -data, 0); + goto skip; + + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, task_pt_regs(current)); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + + case SECCOMP_RET_TRACE: + return filter_ret; /* Save the rest for phase 2. */ + + case SECCOMP_RET_ALLOW: + return SECCOMP_PHASE1_OK; + + case SECCOMP_RET_KILL: + default: + audit_seccomp(this_syscall, SIGSYS, action); + do_exit(SIGSYS); + } + + unreachable(); + +skip: + audit_seccomp(this_syscall, 0, action); + return SECCOMP_PHASE1_SKIP; +} +#endif + +/** + * seccomp_phase1() - run fast path seccomp checks on the current syscall + * @arg sd: The seccomp_data or NULL + * + * This only reads pt_regs via the syscall_xyz helpers. The only change + * it will make to pt_regs is via syscall_set_return_value, and it will + * only do that if it returns SECCOMP_PHASE1_SKIP. + * + * If sd is provided, it will not read pt_regs at all. + * + * It may also call do_exit or force a signal; these actions must be + * safe. 
+ * + * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should + * be processed normally. + * + * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be + * invoked. In this case, seccomp_phase1 will have set the return value + * using syscall_set_return_value. + * + * If it returns anything else, then the return value should be passed + * to seccomp_phase2 from a context in which ptrace hooks are safe. + */ +u32 seccomp_phase1(struct seccomp_data *sd) +{ + int mode = current->seccomp.mode; + int this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; -#endif - do { - if (*syscall == this_syscall) - return 0; - } while (*++syscall); - exit_sig = SIGKILL; - ret = SECCOMP_RET_KILL; - break; + __secure_computing_strict(this_syscall); /* may call do_exit */ + return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER - case SECCOMP_MODE_FILTER: { - int data; - struct pt_regs *regs = task_pt_regs(current); - ret = seccomp_run_filters(this_syscall); - data = ret & SECCOMP_RET_DATA; - ret &= SECCOMP_RET_ACTION; - switch (ret) { - case SECCOMP_RET_ERRNO: - /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, - -data, 0); - goto skip; - case SECCOMP_RET_TRAP: - /* Show the handler the original registers. */ - syscall_rollback(current, regs); - /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); - goto skip; - case SECCOMP_RET_TRACE: - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - goto skip; - } - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. 
- * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. */ - - return 0; - case SECCOMP_RET_ALLOW: - return 0; - case SECCOMP_RET_KILL: - default: - break; - } - exit_sig = SIGSYS; - break; - } + case SECCOMP_MODE_FILTER: + return __seccomp_phase1_filter(this_syscall, sd); #endif default: BUG(); } +} -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - audit_seccomp(this_syscall, exit_sig, ret); - do_exit(exit_sig); -#ifdef CONFIG_SECCOMP_FILTER -skip: - audit_seccomp(this_syscall, exit_sig, ret); -#endif - return -1; +/** + * seccomp_phase2() - finish slow path seccomp work for the current syscall + * @phase1_result: The return value from seccomp_phase1() + * + * This must be called from a context in which ptrace hooks can be used. + * + * Returns 0 if the syscall should be processed or -1 to skip the syscall. + */ +int seccomp_phase2(u32 phase1_result) +{ + struct pt_regs *regs = task_pt_regs(current); + u32 action = phase1_result & SECCOMP_RET_ACTION; + int data = phase1_result & SECCOMP_RET_DATA; + + BUG_ON(action != SECCOMP_RET_TRACE); + + audit_seccomp(syscall_get_nr(current, regs), 0, action); + + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); + return -1; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + do_exit(SIGSYS); + if (syscall_get_nr(current, regs) < 0) + return -1; /* Explicit request to skip. 
*/ + + return 0; } +#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { @@ -473,47 +753,152 @@ long prctl_get_seccomp(void) } /** - * prctl_set_seccomp: configures current->seccomp.mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * seccomp_set_mode_strict: internal function for setting strict seccomp * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters. Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_strict(void) +{ + const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; + long ret = -EINVAL; + + spin_lock_irq(&current->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + +#ifdef TIF_NOTSC + disable_TSC(); +#endif + seccomp_assign_mode(current, seccomp_mode); + ret = 0; + +out: + spin_unlock_irq(&current->sighand->siglock); + + return ret; +} + +#ifdef CONFIG_SECCOMP_FILTER +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags: flags to change filter behavior + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes. * * Once current->seccomp.mode is non-zero, it may not be changed. * * Returns 0 on success or -EINVAL on failure. 
*/ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) { + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; + struct seccomp_filter *prepared = NULL; long ret = -EINVAL; - if (current->seccomp.mode && - current->seccomp.mode != seccomp_mode) + /* Validate flags. */ + if (flags & ~SECCOMP_FILTER_FLAG_MASK) + return -EINVAL; + + /* Prepare the new filter before holding any locks. */ + prepared = seccomp_prepare_user_filter(filter); + if (IS_ERR(prepared)) + return PTR_ERR(prepared); + + /* + * Make sure we cannot change seccomp or nnp state via TSYNC + * while another thread is in the middle of calling exec. + */ + if (flags & SECCOMP_FILTER_FLAG_TSYNC && + mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_free; + + spin_lock_irq(&current->sighand->siglock); + + if (!seccomp_may_assign_mode(seccomp_mode)) + goto out; + + ret = seccomp_attach_filter(flags, prepared); + if (ret) goto out; + /* Do not free the successfully attached filter. */ + prepared = NULL; + + seccomp_assign_mode(current, seccomp_mode); +out: + spin_unlock_irq(&current->sighand->siglock); + if (flags & SECCOMP_FILTER_FLAG_TSYNC) + mutex_unlock(&current->signal->cred_guard_mutex); +out_free: + seccomp_filter_free(prepared); + return ret; +} +#else +static inline long seccomp_set_mode_filter(unsigned int flags, + const char __user *filter) +{ + return -EINVAL; +} +#endif + +/* Common entry point for both prctl and syscall. 
*/ +static long do_seccomp(unsigned int op, unsigned int flags, + const char __user *uargs) +{ + switch (op) { + case SECCOMP_SET_MODE_STRICT: + if (flags != 0 || uargs != NULL) + return -EINVAL; + return seccomp_set_mode_strict(); + case SECCOMP_SET_MODE_FILTER: + return seccomp_set_mode_filter(flags, uargs); + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, + const char __user *, uargs) +{ + return do_seccomp(op, flags, uargs); +} + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ + unsigned int op; + char __user *uargs; switch (seccomp_mode) { case SECCOMP_MODE_STRICT: - ret = 0; -#ifdef TIF_NOTSC - disable_TSC(); -#endif + op = SECCOMP_SET_MODE_STRICT; + /* + * Setting strict mode through prctl always ignored filter, + * so make sure it is always NULL here to pass the internal + * check in do_seccomp(). + */ + uargs = NULL; break; -#ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - ret = seccomp_attach_user_filter(filter); - if (ret) - goto out; + op = SECCOMP_SET_MODE_FILTER; + uargs = filter; break; -#endif default: - goto out; + return -EINVAL; } - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -out: - return ret; + /* prctl interface doesn't have flags, so they are always zero. */ + return do_seccomp(op, 0, uargs); } diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..d51c5ddd855c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, struct sighand_struct *sighand; for (;;) { + /* + * Disable interrupts early to avoid deadlocks. + * See rcu_read_unlock() comment header for details. 
+ */ local_irq_save(*flags); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); @@ -1271,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, local_irq_restore(*flags); break; } - + /* + * This sighand can be already freed and even reused, but + * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * initializes ->siglock: this slab can't go away, it has + * the same object type, ->siglock can't be reinitialized. + * + * We need to ensure that tsk->sighand is still the same + * after we take the lock, we can race with de_thread() or + * __exit_signal(). In the latter case the next iteration + * must see ->sighand == NULL. + */ spin_lock(&sighand->siglock); if (likely(sighand == tsk->sighand)) { rcu_read_unlock(); @@ -1327,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) int error = -ESRCH; struct task_struct *p; - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. - */ - goto retry; - } - rcu_read_unlock(); + for (;;) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); + if (p) + error = group_send_sig_info(sig, info, p); + rcu_read_unlock(); + if (likely(!p || error != -ESRCH)) + return error; - return error; + /* + * The task was unhashed in between, try again. If it + * is dead, pid_task() will return NULL, if we race with + * de_thread() it will find the new leader. 
+ */ + } } int kill_proc_info(int sig, struct siginfo *info, pid_t pid) @@ -2166,8 +2178,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2237,13 +2248,13 @@ relock: goto relock; } - signr = dequeue_signal(current, &current->blocked, info); + signr = dequeue_signal(current, &current->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2251,13 +2262,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2307,7 +2318,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2328,7 +2339,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2338,34 +2349,32 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. 
*/ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2375,11 +2384,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. 
*/ clear_restore_sigmask(); - sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -2387,8 +2396,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* @@ -2493,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = &current_thread_info()->restart_block; + struct restart_block *restart = &current->restart_block; return restart->fn(restart); } @@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif +#ifdef SEGV_BNDERR + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); @@ -2980,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; /* POSIX.1b doesn't mention process groups. 
*/ @@ -3029,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && - (task_pid_vnr(current) != pid)) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); + if ((info->si_code >= 0 || info->si_code == SI_TKILL) && + (task_pid_vnr(current) != pid)) return -EPERM; - } + info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); @@ -3538,7 +3546,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { - current->state = TASK_INTERRUPTIBLE; + __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; @@ -3551,7 +3559,7 @@ int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - current->state = TASK_INTERRUPTIBLE; + __set_current_state(TASK_INTERRUPTIBLE); schedule(); set_restore_sigmask(); return -ERESTARTNOHAND; diff --git a/kernel/smp.c b/kernel/smp.c index 80c33f8de14f..07854477c164 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,7 @@ * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ +#include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> @@ -12,12 +13,13 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/sched.h> #include "smpboot.h" enum { CSD_FLAG_LOCK = 0x01, - CSD_FLAG_WAIT = 0x02, + CSD_FLAG_SYNCHRONOUS = 0x02, }; struct call_function_data { @@ -105,7 +107,7 @@ void __init call_function_init(void) */ static void csd_lock_wait(struct call_single_data *csd) { - while (csd->flags & CSD_FLAG_LOCK) + while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) cpu_relax(); } @@ -119,19 +121,17 @@ static void csd_lock(struct call_single_data *csd) * to ->flags with any 
subsequent assignments to other * fields of the specified call_single_data structure: */ - smp_mb(); + smp_wmb(); } static void csd_unlock(struct call_single_data *csd) { - WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ - smp_mb(); - - csd->flags &= ~CSD_FLAG_LOCK; + smp_store_release(&csd->flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); @@ -142,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, struct call_single_data *csd, - smp_call_func_t func, void *info, int wait) + smp_call_func_t func, void *info) { - struct call_single_data csd_stack = { .flags = 0 }; - unsigned long flags; - - if (cpu == smp_processor_id()) { + unsigned long flags; + + /* + * We can unlock early even for the synchronous on-stack case, + * since we're doing this from the same CPU.. 
+ */ + csd_unlock(csd); local_irq_save(flags); func(info); local_irq_restore(flags); @@ -156,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { + csd_unlock(csd); return -ENXIO; - - - if (!csd) { - csd = &csd_stack; - if (!wait) - csd = &__get_cpu_var(csd_data); } - csd_lock(csd); - csd->func = func; csd->info = info; - if (wait) - csd->flags |= CSD_FLAG_WAIT; - /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -188,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) arch_send_call_function_single_ipi(cpu); - if (wait) - csd_lock_wait(csd); - return 0; } @@ -228,7 +218,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) WARN_ON(!irqs_disabled()); - head = &__get_cpu_var(call_single_queue); + head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); @@ -248,9 +238,26 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } llist_for_each_entry_safe(csd, csd_next, entry, llist) { - csd->func(csd->info); - csd_unlock(csd); + smp_call_func_t func = csd->func; + void *info = csd->info; + + /* Do we wait until *after* callback? */ + if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + func(info); + csd_unlock(csd); + } else { + csd_unlock(csd); + func(info); + } } + + /* + * Handle irq works queued remotely by irq_work_queue_on(). + * Smp functions above are typically synchronous so they + * better run first since some other CPUs may be busy waiting + * for them. 
+ */ + irq_work_run(); } /* @@ -264,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { + struct call_single_data *csd; + struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; int this_cpu; int err; @@ -282,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - err = generic_exec_single(cpu, NULL, func, info, wait); + csd = &csd_stack; + if (!wait) { + csd = this_cpu_ptr(&csd_data); + csd_lock(csd); + } + + err = generic_exec_single(cpu, csd, func, info); + + if (wait) + csd_lock_wait(csd); put_cpu(); @@ -311,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd) int err = 0; preempt_disable(); - err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + + /* We could deadlock if we have to wait here with interrupts disabled! 
*/ + if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) + csd_lock_wait(csd); + + csd->flags = CSD_FLAG_LOCK; + smp_wmb(); + + err = generic_exec_single(cpu, csd, csd->func, csd->info); preempt_enable(); return err; @@ -410,7 +436,7 @@ void smp_call_function_many(const struct cpumask *mask, return; } - cfd = &__get_cpu_var(cfd_data); + cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, cfd->cpumask); @@ -423,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask, struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); + if (wait) + csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); @@ -661,7 +689,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } @@ -690,3 +718,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. 
+ */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/smp.h> +#include <linux/delay.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> @@ -110,7 +111,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +137,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } @@ -279,6 +281,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -291,6 +294,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: 
mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); @@ -311,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) put_online_cpus(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); + +static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); + +/* + * Called to poll specified CPU's state, for example, when waiting for + * a CPU to come online. + */ +int cpu_report_state(int cpu) +{ + return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +} + +/* + * If CPU has died properly, set its state to CPU_UP_PREPARE and + * return success. Otherwise, return -EBUSY if the CPU died after + * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN + * if cpu_wait_death() timed out and the CPU still hasn't gotten around + * to dying. In the latter two cases, the CPU might not be set up + * properly, but it is up to the arch-specific code to decide. + * Finally, -EIO indicates an unanticipated problem. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. + */ +int cpu_check_up_prepare(int cpu) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + } + + switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { + + case CPU_POST_DEAD: + + /* The CPU died properly, so just start it up again. */ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + + case CPU_DEAD_FROZEN: + + /* + * Timeout during CPU death, so let caller know. + * The outgoing CPU completed its processing, but after + * cpu_wait_death() timed out and reported the error. The + * caller is free to proceed, in which case the state + * will be reset properly by cpu_set_state_online(). 
+ * Proceeding despite this -EBUSY return makes sense + * for systems where the outgoing CPUs take themselves + * offline, with no post-death manipulation required from + * a surviving CPU. + */ + return -EBUSY; + + case CPU_BROKEN: + + /* + * The most likely reason we got here is that there was + * a timeout during CPU death, and the outgoing CPU never + * did complete its processing. This could happen on + * a virtualized system if the outgoing VCPU gets preempted + * for more than five seconds, and the user attempts to + * immediately online that same CPU. Trying again later + * might return -EBUSY above, hence -EAGAIN. + */ + return -EAGAIN; + + default: + + /* Should not happen. Famous last words. */ + return -EIO; + } +} + +/* + * Mark the specified CPU online. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. + */ +void cpu_set_state_online(int cpu) +{ + (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Wait for the specified CPU to exit the idle loop and die. + */ +bool cpu_wait_death(unsigned int cpu, int seconds) +{ + int jf_left = seconds * HZ; + int oldstate; + bool ret = true; + int sleep_jf = 1; + + might_sleep(); + + /* The outgoing CPU will normally get done quite quickly. */ + if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) + goto update_state; + udelay(5); + + /* But if the outgoing CPU dawdles, wait increasingly long times. */ + while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { + schedule_timeout_uninterruptible(sleep_jf); + jf_left -= sleep_jf; + if (jf_left <= 0) + break; + sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); + } +update_state: + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate == CPU_DEAD) { + /* Outgoing CPU died normally, update state. */ + smp_mb(); /* atomic_read() before update. 
*/ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); + } else { + /* Outgoing CPU still hasn't died, set state accordingly. */ + if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, CPU_BROKEN) != oldstate) + goto update_state; + ret = false; + } + return ret; +} + +/* + * Called by the outgoing CPU to report its successful death. Return + * false if this report follows the surviving CPU's timing out. + * + * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU + * timed out. This approach allows architectures to omit calls to + * cpu_check_up_prepare() and cpu_set_state_online() without defeating + * the next cpu_wait_death()'s polling loop. + */ +bool cpu_report_death(void) +{ + int oldstate; + int newstate; + int cpu = smp_processor_id(); + + do { + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate != CPU_BROKEN) + newstate = CPU_DEAD; + else + newstate = CPU_DEAD_FROZEN; + } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, newstate) != oldstate); + return newstate == CPU_DEAD; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..479e4436f787 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == cnt) + if (preempt_count() == cnt) { +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); +#endif trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } } EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -278,7 +282,7 @@ restart: pending >>= softirq_bit; } - rcu_bh_qs(smp_processor_id()); + rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); @@ -485,7 +489,7 @@ static void tasklet_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_vec.head); 
__this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); local_irq_enable(); while (list) { @@ -521,7 +525,7 @@ static void tasklet_hi_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); + __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); local_irq_enable(); while (list) { @@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(cpu); local_irq_enable(); - cond_resched(); + cond_resched_rcu_qs(); return; } local_irq_enable(); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + int i; + unsigned long ip; + int generated; + int total = 0; + + if (WARN_ON(!trace->entries)) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + ip = trace->entries[i]; + generated = snprintf(buf, size, "%*c[<%p>] %pS\n", + 1 + spaces, ' ', (void *) ip, (void *) ip); + + total += generated; + + /* Assume that generated isn't a negative number */ + if (generated >= size) { + buf += size; + size = 0; + } else { + buf += generated; + size -= generated; + } + } + + return total; +} +EXPORT_SYMBOL_GPL(snprint_stack_trace); + /* * Architectures that do not implement save_stack_trace_tsk or * save_stack_trace_regs get this weak alias and a once-per-bootup warning diff --git a/kernel/sys.c b/kernel/sys.c index 66a751ebf9d9..a4e372b798a5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -62,28 
+62,28 @@ #include <asm/unistd.h> #ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) +# define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) +# define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) +# define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) +# define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) +# define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) +# define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) +# define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) +# define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) @@ -91,6 +91,18 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif +#ifndef MPX_ENABLE_MANAGEMENT +# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +#endif +#ifndef MPX_DISABLE_MANAGEMENT +# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +#endif +#ifndef GET_FP_MODE +# define GET_FP_MODE(a) (-EINVAL) +#endif +#ifndef SET_FP_MODE +# define SET_FP_MODE(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -182,39 +194,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = 
cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* For find_user() */ - break; + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* For find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -244,47 +257,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = 
cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) + goto out_unlock; /* No processes for this user */ + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* for find_user() */ - break; + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* for find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -306,11 +320,12 @@ out_unlock: * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -364,7 +379,7 @@ error: } /* - * setgid() is implemented like SysV w/ SAVED_IDS + * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. 
*/ @@ -442,7 +457,7 @@ static int set_user(struct cred *new) * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { @@ -503,17 +518,17 @@ error: abort_creds(new); return retval; } - + /* - * setuid() is implemented like SysV with SAVED_IDS - * + * setuid() is implemented like SysV with SAVED_IDS + * * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal + * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. + * regain them by swapping the real and effective uid. 
*/ SYSCALL_DEFINE1(setuid, uid_t, uid) { @@ -637,10 +652,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(ruid, ruidp)) && - !(retval = put_user(euid, euidp))) - retval = put_user(suid, suidp); - + retval = put_user(ruid, ruidp); + if (!retval) { + retval = put_user(euid, euidp); + if (!retval) + return put_user(suid, suidp); + } return retval; } @@ -709,9 +726,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(rgid, rgidp)) && - !(retval = put_user(egid, egidp))) - retval = put_user(sgid, sgidp); + retval = put_user(rgid, rgidp); + if (!retval) { + retval = put_user(egid, egidp); + if (!retval) + retval = put_user(sgid, sgidp); + } return retval; } @@ -796,6 +816,7 @@ change_okay: commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process @@ -862,11 +883,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); @@ -1091,6 +1110,7 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. 
*/ static int override_release(char __user *release, size_t len) { @@ -1110,7 +1130,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); @@ -1284,7 +1304,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) /* * Back compatibility for getrlimit. Needed for some apps. */ - SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { @@ -1299,7 +1318,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; + return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #endif @@ -1527,7 +1546,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; - memset((char *) r, 0, sizeof *r); + memset((char *)r, 0, sizeof (*r)); utime = stime = 0; if (who == RUSAGE_THREAD) { @@ -1541,41 +1560,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) return; switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += 
p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - } while_each_thread(p, t); + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; + + if (who == RUSAGE_CHILDREN) break; - default: - BUG(); + case RUSAGE_SELF: + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); + break; + + default: + BUG(); } unlock_task_sighand(p, &flags); @@ -1585,6 +1604,7 @@ out: if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); + if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); @@ -1596,6 +1616,7 @@ out: int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -1631,6 +1652,7 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; + struct file *old_exe, *exe_file; struct inode *inode; int err; @@ -1654,20 +1676,25 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - down_write(&mm->mmap_sem); - /* * Forbid mm->exe_file change if old file still mapped. 
*/ + exe_file = get_mm_exe_file(mm); err = -EBUSY; - if (mm->exe_file) { + if (exe_file) { struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_file && - path_equal(&vma->vm_file->f_path, - &mm->exe_file->f_path)) - goto exit_unlock; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_err; + } + + up_read(&mm->mmap_sem); + fput(exe_file); } /* @@ -1678,29 +1705,220 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EPERM; if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit_unlock; + goto exit; err = 0; - set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ -exit_unlock: - up_write(&mm->mmap_sem); - + /* set the new file, lockless */ + get_file(exe.file); + old_exe = xchg(&mm->exe_file, exe.file); + if (old_exe) + fput(old_exe); exit: fdput(exe); return err; +exit_err: + up_read(&mm->mmap_sem); + fput(exe_file); + goto exit; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. 
+ */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. 
+ */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + if (prctl_map.exe_fd != (u32)-1) + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); + down_read(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but 
kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. + */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; } +#endif /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - unsigned long rlim = rlimit(RLIMIT_DATA); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -1733,9 +1951,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, + mm->end_data, mm->start_data)) goto 
out; mm->start_brk = addr; @@ -1745,9 +1962,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, + mm->end_data, mm->start_data)) goto out; mm->brk = addr; @@ -1990,12 +2206,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; @@ -2011,6 +2227,22 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, me->mm->def_flags &= ~VM_NOHUGEPAGE; up_write(&me->mm->mmap_sem); break; + case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = MPX_ENABLE_MANAGEMENT(me); + break; + case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = MPX_DISABLE_MANAGEMENT(me); + break; + case PR_SET_FP_MODE: + error = SET_FP_MODE(me, arg2); + break; + case PR_GET_FP_MODE: + error = GET_FP_MODE(me); + break; default: error = -EINVAL; break; @@ -2023,6 +2255,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, { int err = 0; int cpu = raw_smp_processor_id(); + if (cpup) err |= put_user(cpu, cpup); if (nodep) @@ -2135,7 +2368,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { + if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { diff --git a/kernel/sys_ni.c 
b/kernel/sys_ni.c index 36441b51b5df..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); @@ -155,6 +156,23 @@ cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_uselib); +cond_syscall(sys_fadvise64); +cond_syscall(sys_fadvise64_64); +cond_syscall(sys_madvise); +cond_syscall(sys_setuid); +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_capget); +cond_syscall(sys_capset); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); @@ -165,6 +183,8 @@ cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); cond_syscall(sys_subpage_prot); +cond_syscall(sys_s390_pci_mmio_read); +cond_syscall(sys_s390_pci_mmio_write); /* mmu depending weak syscall entries */ cond_syscall(sys_mprotect); @@ -197,6 +217,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); @@ -213,3 +234,12 @@ cond_syscall(compat_sys_open_by_handle_at); /* compare kernel pointers */ cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); + +/* access BPF programs and maps */ +cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/kernel/sysctl.c b/kernel/sysctl.c 
index 75b22e22a72c..2082b1a88fb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include <linux/module.h> +#include <linux/aio.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> @@ -92,11 +93,9 @@ #include <linux/nmi.h> #endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP extern int core_uses_pid; @@ -387,7 +386,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", @@ -622,6 +622,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_KEXEC { @@ -701,10 +708,10 @@ static struct ctl_table kern_table[] = { #endif { .procname = "threads-max", - .data = &max_threads, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_max_threads, }, { .procname = "random", @@ -838,7 +845,7 @@ static struct ctl_table kern_table[] = { .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, }, @@ -847,11 +854,33 @@ static struct ctl_table kern_table[] = { .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog, + .proc_handler = proc_watchdog_thresh, .extra1 = &zero, .extra2 = &sixty, }, { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_nmi_watchdog, + .extra1 = &zero, +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || 
defined(CONFIG_HARDLOCKUP_DETECTOR) + .extra2 = &one, +#else + .extra2 = &zero, +#endif + }, + { + .procname = "soft_watchdog", + .data = &soft_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), @@ -871,15 +900,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_SMP */ - { - .procname = "nmi_watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { @@ -1055,15 +1075,6 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif -#ifdef CONFIG_RCU_TORTURE_TEST - { - .procname = "rcutorture_runnable", - .data = &rcutorture_runnable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PERF_EVENTS /* * User-space scripts rely on the existence of this file @@ -1112,6 +1123,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -1220,6 +1240,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = &zero, + }, + { .procname = "nr_pdflush_threads", .mode = 0444 /* read-only */, .proc_handler = pdflush_proc_obsolete, @@ -1240,8 +1268,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void 
*)&hugetlb_infinity, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1276,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, }, #endif { @@ -1274,8 +1298,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, }, #endif { @@ -1311,6 +1333,15 @@ static struct ctl_table vm_table[] = { .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_COMPACTION */ { @@ -1463,13 +1494,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif - { - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = scan_unevictable_handler, - }, #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", @@ -1957,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - *valp = *negp ? 
-*lvalp : *lvalp; + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + *valp = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + *valp = *lvalp; + } } else { int val = *valp; if (val < 0) { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, {} }; @@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, {} }; diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70263f4..875f64e8935b 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void) pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", PTR_ERR(key)); } else { + set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); pr_notice("Loaded X.509 cert '%s'\n", 
key_ref_to_ptr(key)->description); key_ref_put(key); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..21f82c29c914 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); - int rc; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return rc; - } + genlmsg_end(skb, reply); return genlmsg_reply(skb, info); } @@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb, void *reply = genlmsg_data(genlhdr); int rc, delcount = 0; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return; - } + genlmsg_end(skb, reply); rc = 0; down_read(&listeners->sem); @@ -459,7 +450,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, f.file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; @@ -638,7 +629,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) fill_tgid_exit(tsk); } - listeners = __this_cpu_ptr(&listener_array); + listeners = raw_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. 
*/ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test 
failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct 
kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - 
printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513a45ed..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL config GENERIC_TIME_VSYSCALL_OLD bool -# ktime_t scalar 64bit nsec representation -config KTIME_SCALAR - bool - # Old style timekeeping config ARCH_USES_GETTIMEOFFSET bool @@ -32,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET config GENERIC_CLOCKEVENTS bool -# Migration helper. 
Builds, but does not invoke -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS - # Architecture can handle broadcast in a driver-agnostic way config ARCH_HAS_TICK_BROADCAST bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413fd0ebf..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,14 +1,31 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) obj-y += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o -obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o -obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) + diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fe75444ae7ec..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) return ret; } - +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int 
alarmtimer_rtc_add_device(struct device *dev, struct class_interface *class_intf) @@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { + unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } /* Re-add periodic timers */ if (ptr->it.alarm.interval.tv64) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); - return ALARMTIMER_RESTART; + result = ALARMTIMER_RESTART; } - return ALARMTIMER_NORESTART; + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; } /** @@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @new_timer: k_itimer pointer * @cur_setting: itimerspec data to fill * - * Copies the itimerspec data out from the k_itimer + * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - memset(cur_setting, 0, sizeof(struct itimerspec)); + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); + + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); } /** @@ -776,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = 
&current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..11dc22a6983b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); @@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); +static int __clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + /* Transition with legacy set_mode() callback */ + if (dev->set_mode) { + /* Legacy callback doesn't support new modes */ + if (state > CLOCK_EVT_STATE_ONESHOT) + return -ENOSYS; + /* + * 'clock_event_state' and 'clock_event_mode' have 1-to-1 + * mapping until *_ONESHOT, and so a simple cast will work. + */ + dev->set_mode((enum clock_event_mode)state, dev); + dev->mode = (enum clock_event_mode)state; + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: + /* + * This is an internal state, which is guaranteed to go from + * SHUTDOWN to DETACHED. No driver interaction required. 
+ */ + return 0; + + case CLOCK_EVT_STATE_SHUTDOWN: + return dev->set_state_shutdown(dev); + + case CLOCK_EVT_STATE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + return dev->set_state_periodic(dev); + + case CLOCK_EVT_STATE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + return dev->set_state_oneshot(dev); + + default: + return -ENOSYS; + } +} + /** - * clockevents_set_mode - set the operating mode of a clock event device + * clockevents_set_state - set the operating state of a clock event device * @dev: device to modify - * @mode: new mode + * @state: new state * * Must be called with interrupts disabled ! */ -void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) +void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->mode != mode) { - dev->set_mode(mode, dev); - dev->mode = mode; + if (dev->state != state) { + if (__clockevents_set_state(dev, state)) + return; + + dev->state = state; /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (mode == CLOCK_EVT_MODE_ONESHOT) { + if (state == CLOCK_EVT_STATE_ONESHOT) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->set_mode) { + dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); + dev->mode = CLOCK_EVT_MODE_RESUME; + } else if (dev->tick_resume) { + ret = dev->tick_resume(dev); + } + + 
return ret; +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffie */ @@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; /* Shortcut for clockevent devices that can deal with ktime. */ @@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) continue; if (!tick_check_replacement(newdev, dev)) @@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. 
Device is unused */ - if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + if (ced->state == CLOCK_EVT_STATE_DETACHED) { list_del_init(&ced->list); return 0; } @@ -371,7 +440,38 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) mutex_unlock(&clockevents_mutex); return ret; } -EXPORT_SYMBOL_GPL(clockevents_unbind); +EXPORT_SYMBOL_GPL(clockevents_unbind_device); + +/* Sanity check of state transition callbacks */ +static int clockevents_sanity_check(struct clock_event_device *dev) +{ + /* Legacy set_mode() callback */ + if (dev->set_mode) { + /* We shouldn't be supporting new modes now */ + WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || + dev->set_state_shutdown || dev->tick_resume); + + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* New state-specific callbacks */ + if (!dev->set_state_shutdown) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !dev->set_state_periodic) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && + !dev->set_state_oneshot) + return -EINVAL; + + return 0; +} /** * clockevents_register_device - register a clock event device @@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(clockevents_sanity_check(dev)); + + /* Initialize state to DETACHED */ + dev->state = CLOCK_EVT_STATE_DETACHED; + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); @@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + if (dev->state == CLOCK_EVT_STATE_ONESHOT) return clockevents_program_event(dev, dev->next_event, false); - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + if (dev->state == 
CLOCK_EVT_STATE_PERIODIC) + return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) * @old: device to release (can be NULL) * @new: device to request (can be NULL) * - * Called from the notifier chain. clockevents_lock is held already + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { - unsigned long flags; - - local_irq_save(flags); /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); - clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); clockevents_shutdown(new); } - local_irq_restore(flags); } /** @@ -541,74 +642,40 @@ void clockevents_resume(void) dev->resume(dev); } -#ifdef CONFIG_GENERIC_CLOCKEVENTS +#ifdef CONFIG_HOTPLUG_CPU /** - * clockevents_notify - notification about relevant events - * Returns 0 on success, any other value on error + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ -int clockevents_notify(unsigned long reason, void *arg) +void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); - switch (reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, arg); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - ret = tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - 
tick_handover_do_timer(arg); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(arg); - tick_shutdown_broadcast(arg); - tick_shutdown(arg); - /* - * Unregister the clock event devices which were - * released from the users in the notify chain. - */ - list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + tick_shutdown_broadcast_oneshot(cpu); + tick_shutdown_broadcast(cpu); + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); list_del(&dev->list); - /* - * Now check whether the CPU has left unused per cpu devices - */ - cpu = *((int *)arg); - list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { - if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1 && - !tick_is_broadcast_device(dev)) { - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - list_del(&dev->list); - } } - break; - default: - break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); - return ret; } -EXPORT_SYMBOL_GPL(clockevents_notify); +#endif #ifdef CONFIG_SYSFS struct bus_type clockevents_subsys = { @@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */ - -#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502c955a..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ 
b/kernel/time/clocksource.c @@ -32,82 +32,7 @@ #include <linux/kthread.h> #include "tick-internal.h" - -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. 
- */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); +#include "timekeeping_internal.h" /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks @@ -217,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) schedule_work(&watchdog_work); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - /** * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable @@ -249,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow; + cycle_t csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -282,11 +200,14 @@ static void clocksource_watchdog(unsigned long data) continue; } - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; cs->cs_last = csnow; cs->wd_last = wdnow; @@ -295,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. 
*/ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); continue; } @@ -543,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) + * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger then what the math used can handle without overflows. */ -u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) - * which is equivalent to the below. 
- * max_cycles < (2^63)/(mult + maxadj) - * max_cycles < 2^(log2((2^63)/(mult + maxadj))) - * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) - * max_cycles < 2^(63 - log2(mult + maxadj)) - * max_cycles < 1 << (63 - log2(mult + maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. + * cyc2ns() function without overflowing a 64-bit result. */ - max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); /* * The actual maximum number of cycles we can defer the clocksource is @@ -573,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + return max_nsecs; } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs: Pointer to clocksource + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static inline void clocksource_update_max_deferment(struct clocksource *cs) { - u64 max_nsecs; - - max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, - cs->mask); - /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. 
- */ - return max_nsecs - (max_nsecs >> 3); + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -722,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) } /** - * __clocksource_updatefreq_scale - Used update clocksource with new freq + * __clocksource_update_freq_scale - Used update clocksource with new freq * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale @@ -730,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; + /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. We apply the same 12.5% - * margin as we do in clocksource_max_deferment() + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. 
*/ - sec = (cs->mask - (cs->mask >> 3)); - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); - + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin - * + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { cs->mult >>= 1; cs->shift--; cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. 
+ */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -788,9 +728,9 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); + __clocksource_update_freq_scale(cs, scale, freq); - /* Add clocksource to the clcoksource list */ + /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); @@ -800,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); - -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); - - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); - - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); - static void __clocksource_change_rating(struct clocksource *cs, int rating) { list_del(&cs->list); diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c index 3ab28993f6e0..76d4bd962b19 100644 --- a/kernel/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,6 +54,8 @@ #include <trace/events/timer.h> +#include "tick-internal.h" + /* * The timer bases: * @@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - s32 tai_offset; + ktime_t xtim, mono, boot, tai; + ktime_t off_real, off_boot, off_tai; - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - tai_offset = timekeeping_get_tai_offset(); + mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); + boot = ktime_add(mono, off_boot); + xtim = ktime_add(mono, off_real); + tai = ktime_add(mono, off_tai); - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = - ktime_add(xtim, ktime_set(tai_offset, 0)); + 
base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; } /* @@ -264,64 +263,10 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * too large for inlining: */ #if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - /* Make sure nsec fits into long */ - if (unlikely(nsec > KTIME_SEC_MAX)) - return (ktime_t){ .tv64 = KTIME_MAX }; - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* @@ -494,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct 
hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -542,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. 
Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -602,12 +554,17 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. + * * Called with interrupts disabled and base->cpu_base.lock held */ static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -636,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, return 0; /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry * which we enforced in the hang detection. We want the system @@ -662,25 +628,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * When High resolution timers are active, try to reprogram. 
Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} - static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); } /* @@ -690,7 +644,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (!hrtimer_hres_active()) return; @@ -755,8 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { return 0; } @@ -964,7 +918,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) */ debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * We must preserve the CALLBACK state flag here, * otherwise we could move the timer base in 
@@ -1013,14 +967,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, leftmost = enqueue_hrtimer(timer, new_base); - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) - && hrtimer_enqueue_reprogram(timer, new_base)) { + if (!leftmost) { + unlock_hrtimer_base(timer, &flags); + return ret; + } + + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && + hrtimer_reprogram(timer, new_base)) { + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ if (wakeup) { /* * We need to drop cpu_base->lock to avoid a @@ -1153,30 +1118,15 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = 
ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1194,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; @@ -1237,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) struct hrtimer_cpu_base *cpu_base; int base = hrtimer_clockid_to_base(which_clock); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; @@ -1292,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) */ void hrtimer_interrupt(struct clock_event_device *dev) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1303,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1341,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. 
*/ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ @@ -1426,7 +1368,7 @@ static void __hrtimer_peek_ahead_timers(void) if (!hrtimer_hres_active()) return; - td = &__get_cpu_var(tick_cpu_device); + td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } @@ -1490,7 +1432,7 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1641,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; @@ -1680,6 +1622,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } @@ -1728,7 +1671,7 @@ static void migrate_hrtimers(int scpu) local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); + new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * 
takes two locks at once, deadlock is not possible. @@ -1764,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - case CPU_DYING_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); - break; case CPU_DEAD: case CPU_DEAD_FROZEN: - { - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); migrate_hrtimers(scpu); break; - } #endif default: @@ -1825,7 +1761,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } diff --git a/kernel/itimer.c b/kernel/time/itimer.c index 8d262b467573..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/time/itimer.c diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,7 +25,7 @@ #include <linux/module.h> #include <linux/init.h> -#include "tick-internal.h" +#include "timekeeping.h" /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); @@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); static int __init init_jiffies_clocksource(void) { - return clocksource_register(&clocksource_jiffies); + return __clocksource_register(&clocksource_jiffies); } core_initcall(init_jiffies_clocksource); @@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; - clocksource_register(&refined_jiffies); + __clocksource_register(&refined_jiffies); return 0; } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a39515..7a681003001c 100644 --- a/kernel/time/ntp.c +++ 
b/kernel/time/ntp.c @@ -17,7 +17,6 @@ #include <linux/module.h> #include <linux/rtc.h> -#include "tick-internal.h" #include "ntp_internal.h" /* @@ -459,6 +458,16 @@ out: return leap; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + struct timespec now; + + now = timespec64_to_timespec(now64); + return update_persistent_clock(now); +} +#endif + #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); @@ -466,7 +475,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { - struct timespec now, next; + struct timespec64 now; + struct timespec next; int fail = 1; /* @@ -485,16 +495,17 @@ static void sync_cmos_clock(struct work_struct *work) return; } - getnstimeofday(&now); + getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = now; + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock64(adjust); #endif + #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) fail = rtc_set_ntp_time(adjust); @@ -531,7 +542,7 @@ void ntp_notify_cmos_timer(void) { } /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -554,7 +565,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) static inline void process_adjtimex_modes(struct timex *txc, - struct timespec *ts, + struct timespec64 *ts, s32 *time_tai) { if (txc->modes & ADJ_STATUS) @@ -632,6 +643,17 @@ int ntp_validate_timex(struct timex 
*txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + return 0; } @@ -640,7 +662,7 @@ int ntp_validate_timex(struct timex *txc) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { int result; @@ -684,7 +706,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = ts->tv_sec; + txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4ca2a4..bbd102ad9df7 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec *, const struct timespec *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..0075da74abf0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - 
unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) @@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -1348,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - &current_thread_info()->restart_block; + struct restart_block *restart_block = &current->restart_block; struct itimerspec it; int error; diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4265c9..31ea01f42e1f 100644 --- a/kernel/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,8 @@ #include <linux/export.h> #include <linux/hashtable.h> +#include "timekeeping.h" + /* * Management arrays for POSIX timers. Timers are now kept in static hash table * with 512 entries. 
@@ -634,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,5 +1,6 @@ /* - * sched_clock.c: support for extending counters to full 64-bit ns counter + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,15 +19,53 @@ #include <linux/seqlock.h> #include <linux/bitops.h> -struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; +}; + +/** + * struct clock_data - all data needed for sched_clock() (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. 
The lowest + * bit is the index for @read_data. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + + u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; @@ -34,12 +73,6 @@ static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, +}; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq; - - if (cd.suspended) - return cd.epoch_ns; + struct clock_read_data *rd; do { - seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); + + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, 
rd->shift); } while (read_seqcount_retry(&cd.seq, seq)); - cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; +} + +/* + * Updating the data required to read the clock. + * + * sched_clock() will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); } /* - * Atomically update the sched_clock epoch. + * Atomically update the sched_clock() epoch. 
*/ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data rd; + + rd = cd.read_data[0]; + + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init sched_clock_register(u64 (*read)(void), int bits, - unsigned long rate) +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_read_data rd; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - /* calculate the mult/shift to convert counter ticks to ns. */ + /* Calculate the mult/shift to convert counter ticks to ns. 
*/ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; + + /* Calculate how many nanosecs until we risk wrapping */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); + cd.wrap_kt = ns_to_ktime(wrap); - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + rd = cd.read_data[0]; - /* update epoch for new counter and update epoch_ns from old counter*/ + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + cd.actual_read_sched_clock = read; - raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { r /= 1000000; r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate the ns resolution of this counter */ + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - /* Enable IRQ time accounting if we have a fast enough sched_clock */ + /* Enable IRQ time 
accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); @@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, void __init sched_clock_postinit(void) { /* - * If no sched_clock function has been provided at that point, + * If no sched_clock() function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of 'epoch_cyc'. 
+ */ +static u64 notrace suspended_sched_clock_read(void) +{ + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; +} + static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data[0]; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; + return 0; } static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); + return 0; } device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c new file mode 100644 index 000000000000..e622ba365a13 --- /dev/null +++ b/kernel/time/test_udelay.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + struct timespec ts1, ts2; + int time_passed; + + ktime_get_ts(&ts1); + udelay(usecs); + ktime_get_ts(&ts2); + time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec ts; + + ktime_get_ts(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", + loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " 
DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { + int bc_moved; /* * We try to 
cancel the timer first. If the callback is on * flight on some other cpu then we let it handle it. If we @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * restart the timer because we are in the callback, but we * can set the expiry time and let the callback return * HRTIMER_RESTART. + * + * Since we are in the idle loop at this point and because + * hrtimer_{start/cancel} functions call into tracing, + * calls to these functions must be bound within RCU_NONIDLE. */ - if (hrtimer_try_to_cancel(&bctimer) >= 0) { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? + !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : + 0); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); } else if (bc->bound_on == smp_processor_id()) { diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); -static int tick_broadcast_force; +static int tick_broadcast_forced; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif /* @@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * The device is in periodic mode. 
No reprogramming necessary: */ - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + if (dev->state == CLOCK_EVT_STATE_PERIODIC) goto unlock; /* @@ -324,49 +326,54 @@ unlock: raw_spin_unlock(&tick_broadcast_lock); } -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. */ -static void tick_do_broadcast_on_off(unsigned long *reason) +void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; int cpu, bc_stopped; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; - bc = tick_broadcast_device.evtdev; /* * Is the device not affected by the powerstate ? 
*/ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; if (!tick_device_is_functional(dev)) - goto out; + return; + raw_spin_lock(&tick_broadcast_lock); + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); - switch (*reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - tick_broadcast_force = 1; break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (tick_broadcast_force) + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); if (!tick_device_is_functional(dev)) @@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) else tick_broadcast_setup_oneshot(bc); } -out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop. 
- */ -void tick_broadcast_on_off(unsigned long reason, int *oncpu) -{ - if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) - printk(KERN_ERR "tick-broadcast: ignoring broadcast for " - "offline CPU #%d\n", *oncpu); - else - tick_do_broadcast_on_off(&reason); + raw_spin_unlock(&tick_broadcast_lock); } +EXPORT_SYMBOL_GPL(tick_broadcast_control); /* * Set the periodic handler depending on broadcast on/off @@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) dev->event_handler = tick_handle_periodic_broadcast; } +#ifdef CONFIG_HOTPLUG_CPU /* * Remove a CPU from broadcasting */ -void tick_shutdown_broadcast(unsigned int *cpup) +void tick_shutdown_broadcast(unsigned int cpu) { struct clock_event_device *bc; unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif void tick_suspend_broadcast(void) { @@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -int tick_resume_broadcast(void) +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. 
+ */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) { struct clock_event_device *bc; unsigned long flags; - int broadcast = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + clockevents_tick_resume(bc); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); - broadcast = cpumask_test_cpu(smp_processor_id(), - tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) - broadcast = tick_resume_broadcast_oneshot(bc); + tick_resume_broadcast_oneshot(bc); break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); - - return broadcast; } - #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask; @@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, { int ret; - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + if (bc->state != CLOCK_EVT_STATE_ONESHOT) + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); ret = clockevents_program_event(bc, expires, force); if (!ret) @@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, return ret; } -int tick_resume_broadcast_oneshot(struct clock_event_device *bc) +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return 0; + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -554,7 +558,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { - struct 
tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from @@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_mode(td->evtdev, - CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); } } } @@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -static void broadcast_move_bc(int deadcpu) -{ - struct clock_event_device *bc = tick_broadcast_device.evtdev; - - if (!bc || !broadcast_needs_cpu(bc, deadcpu)) - return; - /* This moves the broadcast assignment to this cpu */ - clockevents_program_event(bc, bc->next_event, 1); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. 
*/ -int tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; - ktime_t now; int cpu, ret = 0; + ktime_t now; /* * Periodic mode does not care about the enter/exit of power @@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) * We are called with preemtion disabled from the depth of the * idle code, so we can't be moved away. */ - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; + raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (state == TICK_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); broadcast_shutdown_local(bc, dev); @@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) } } out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock(&tick_broadcast_lock); return ret; } +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu @@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! 
*/ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(bc, cpu, tick_next_period, 1); @@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} /* * Remove a dead CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +void tick_shutdown_broadcast_oneshot(unsigned int cpu) { unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - broadcast_move_bc(cpu); - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif /* * Check, whether the broadcast device is in one shot mode diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,7 +102,7 @@ void 
tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + if (dev->state != CLOCK_EVT_STATE_ONESHOT) return; for (;;) { /* @@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, void tick_install_replacement(struct clock_event_device *newdev) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); @@ -332,14 +332,16 @@ out_bc: tick_install_broadcast_device(newdev); } +#ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. * - * Called with interrupts disabled. + * Called with interrupts disabled. Not locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. */ -void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(void) { - if (*cpup == tick_do_timer_cpu) { + if (tick_do_timer_cpu == smp_processor_id()) { int cpu = cpumask_first(cpu_online_mask); tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : @@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. 
*/ -void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; @@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) * Prevent that the clock events layer tries to call * the set mode function! */ + dev->state = CLOCK_EVT_STATE_DETACHED; dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } } +#endif -void tick_suspend(void) +/** + * tick_suspend_local - Suspend the local tick device + * + * Called from the local cpu for freeze with interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend_local(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } -void tick_resume(void) +/** + * tick_resume_local - Resume the local tick device + * + * Called from the local CPU for unfreeze or XEN resume magic. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume_local(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - int broadcast = tick_resume_broadcast(); - - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool broadcast = tick_resume_check_broadcast(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(td->evtdev, 0); @@ -395,9 +412,87 @@ void tick_resume(void) } /** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. 
Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). + */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) + timekeeping_suspend(); + else + tick_suspend_local(); + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). 
+ */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) + timekeeping_resume(); + else + tick_resume_local(); + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} + +/** * tick_init - initialize the tick control */ void __init tick_init(void) { tick_broadcast_init(); + tick_nohz_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b19965a..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,14 +4,13 @@ #include <linux/hrtimer.h> #include <linux/tick.h> -extern seqlock_t jiffies_lock; +#include "timekeeping.h" +#include "tick-sched.h" -#define CS_NAME_LEN 32 +#ifdef CONFIG_GENERIC_CLOCKEVENTS -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD - -#define TICK_DO_TIMER_NONE -1 -#define TICK_DO_TIMER_BOOT -2 +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; @@ -21,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); -extern void tick_handover_do_timer(int *cpup); -extern void tick_shutdown(unsigned int *cpup); +extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); -extern void clockevents_shutdown(struct clock_event_device *dev); +extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int 
tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} +extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); +extern void clockevents_handle_noop(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); -/* - * NO_HZ / high resolution timer shared code - */ +/* Broadcasting support */ +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern void tick_install_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_shutdown_broadcast(unsigned int cpu); +extern void tick_suspend_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); +extern void tick_broadcast_init(void); +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } 
+static inline void tick_shutdown_broadcast(unsigned int cpu) { } +static inline void tick_suspend_broadcast(void) { } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } + +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ #ifdef CONFIG_TICK_ONESHOT extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -44,120 +94,46 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) 
{ return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ + +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); -extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); -# else /* BROADCAST */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); +#else /* !(BROADCAST && ONESHOT): */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } -static inline bool tick_broadcast_oneshot_available(void) { return true; } -# endif /* !BROADCAST */ - -#else /* !ONESHOT */ -static inline -void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t nextevt) -{ - BUG(); -} -static inline void tick_resume_oneshot(void) -{ - BUG(); -} -static inline int tick_program_event(ktime_t expires, int 
force) -{ - return 0; -} -static inline void tick_oneshot_notify(void) { } -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } -static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) -{ - return 0; -} -static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline bool tick_broadcast_oneshot_available(void) { return false; } -#endif /* !TICK_ONESHOT */ - -/* - * Broadcasting support - */ -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern void tick_install_broadcast_device(struct clock_event_device *dev); -extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); -extern void tick_shutdown_broadcast(unsigned int *cpup); -extern void tick_suspend_broadcast(void); -extern int tick_resume_broadcast(void); -extern void tick_broadcast_init(void); -extern void -tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); -int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); - -#else /* !BROADCAST */ - -static inline void tick_install_broadcast_device(struct clock_event_device *dev) -{ -} - -static inline int tick_is_broadcast_device(struct clock_event_device *dev) -{ - return 0; -} -static inline int tick_device_uses_broadcast(struct clock_event_device *dev, - int cpu) -{ - return 0; -} -static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } -static inline void tick_shutdown_broadcast(unsigned int *cpup) { } -static inline void tick_suspend_broadcast(void) { } -static inline int tick_resume_broadcast(void) { return 0; } -static 
inline void tick_broadcast_init(void) { } -static inline int tick_broadcast_update_freq(struct clock_event_device *dev, - u32 freq) { return -ENODEV; } - -/* - * Set the periodic handler in non broadcast mode - */ -static inline void tick_set_periodic_handler(struct clock_event_device *dev, - int broadcast) -{ - dev->event_handler = tick_handle_periodic; -} -#endif /* !BROADCAST */ - -/* - * Check, if the device is functional or a dummy for broadcast - */ -static inline int tick_device_is_functional(struct clock_event_device *dev) -{ - return !(dev->features & CLOCK_EVT_FEAT_DUMMY); -} - -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); - +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !(BROADCAST && ONESHOT) */ + +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } #endif - -extern void do_timer(unsigned long ticks); -extern void update_wall_time(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -38,7 +38,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { - struct tick_device *td = 
&__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || @@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7ac112d..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -34,7 +34,7 @@ /* * Per cpu nohz control structure */ -DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* * The time, when the last jiffy update happened. Protected by jiffies_lock. @@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static bool can_stop_full_tick(void) @@ -204,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); */ void __tick_nohz_full_check(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_nohz_full_cpu(smp_processor_id())) { if (ts->tick_stopped && !is_idle_task(current)) { @@ -224,13 +225,29 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { }; /* - * Kick the current CPU if it's full dynticks in order to force it to + * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. 
*/ void tick_nohz_full_kick(void) { - if (tick_nohz_full_cpu(smp_processor_id())) - irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); +} + +/* + * Kick the CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_cpu(int cpu) +{ + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void nohz_full_kick_ipi(void *info) @@ -278,19 +295,12 @@ out: /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { - int cpu; - alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } - - cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); - } tick_nohz_full_running = true; return 1; @@ -316,25 +326,17 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, return NOTIFY_OK; } -/* - * Worst case string length in chunks of CPU range seems 2 steps - * separations: 0,2,4,6,... 
- * This is NR_CPUS + sizeof('\0') - */ -static char __initdata nohz_full_buf[NR_CPUS + 1]; - static int tick_nohz_init_all(void) { int err = -1; #ifdef CONFIG_NO_HZ_FULL_ALL if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); + WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; cpumask_setall(tick_nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); tick_nohz_full_running = true; #endif return err; @@ -349,12 +351,43 @@ void __init tick_nohz_init(void) return; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " + "support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + cpumask_copy(housekeeping_mask, cpu_possible_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); cpu_notifier(tick_nohz_cpu_down_callback, 0); - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); - pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); + pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", + cpumask_pr_args(tick_nohz_full_mask)); } #endif @@ -383,6 +416,11 @@ static int __init 
setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); +int tick_nohz_tick_stopped(void) +{ + return __this_cpu_read(tick_cpu_sched.tick_stopped); +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @@ -533,7 +571,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, ret = { .tv64 = 0 }; unsigned long rcu_delta_jiffies; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 time_delta; time_delta = timekeeping_max_deferment(); @@ -545,8 +583,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, last_jiffies = jiffies; } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || - arch_needs_cpu(cpu) || irq_work_needs_cpu()) { + if (rcu_needs_cpu(&rcu_delta_jiffies) || + arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -801,13 +839,12 @@ void tick_nohz_idle_enter(void) local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); + ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; __tick_nohz_idle_enter(ts); local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); /** * tick_nohz_irq_exit - update next tick event from interrupt exit @@ -819,7 +856,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); */ void tick_nohz_irq_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) __tick_nohz_idle_enter(ts); @@ -834,7 +871,7 @@ void tick_nohz_irq_exit(void) */ ktime_t tick_nohz_get_sleep_length(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->sleep_length; } @@ -912,7 +949,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void 
tick_nohz_idle_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -934,7 +971,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { @@ -947,7 +983,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) */ static void tick_nohz_handler(struct clock_event_device *dev) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); @@ -956,6 +992,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(now); tick_sched_handle(ts, regs); + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -967,7 +1007,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) */ static void tick_nohz_switch_to_nohz(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) @@ -1029,7 +1069,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) @@ -1083,6 +1123,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (regs) tick_sched_handle(ts, regs); + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; @@ -1103,7 +1147,7 @@ early_param("skew_tick", 
skew_tick); */ void tick_setup_sched_timer(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* @@ -1172,7 +1216,7 @@ void tick_clock_notify(void) */ void tick_oneshot_notify(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } @@ -1187,7 +1231,7 @@ void tick_oneshot_notify(void) */ int tick_check_oneshot_change(int allow_nohz) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,74 @@ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include <linux/hrtimer.h> + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. 
+ * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_waketime: Time when the idle was interrupted + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @sleep_length: Duration of the current idle sleep + * @do_timer_lst: CPU was the last one doing do_timer before going idle + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + ktime_t last_tick; + int inidle; + int tick_stopped; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + int idle_active; + ktime_t idle_entrytime; + ktime_t idle_waketime; + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + ktime_t sleep_length; + unsigned long last_jiffies; + unsigned long next_jiffies; + ktime_t idle_expires; + int do_timer_last; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#endif diff --git a/kernel/time.c b/kernel/time/time.c index 7c7964c33ae7..2c85b7724af4 100644 --- a/kernel/time.c +++ b/kernel/time/time.c @@ -42,6 +42,7 @@ #include <asm/unistd.h> #include "timeconst.h" +#include "timekeeping.h" /* * The timezone where the local system is located. 
Used as a default by some @@ -195,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } @@ -303,7 +308,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. +/* + * mktime64 - Converts date to seconds. + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * @@ -313,15 +320,10 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; @@ -331,15 +333,14 @@ mktime(const unsigned int year0, const unsigned int mon0, year -= 1; } - return ((((unsigned long) + return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } - -EXPORT_SYMBOL(mktime); +EXPORT_SYMBOL(mktime64); /** * set_normalized_timespec - set timespec sec and nsec parts and normalize @@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 
nsec) } EXPORT_SYMBOL(ns_to_timeval); +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif /* * When we convert to jiffies then we interpret incoming values * the following way: @@ -496,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. 
+ * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -517,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -533,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. 
+ * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); @@ -676,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n) return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } +EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies @@ -694,6 +764,7 @@ unsigned long nsecs_to_jiffies(u64 n) { return (unsigned long)nsecs_to_jiffies64(n); } +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* * Add two timespec values and do a safety check for overflow. 
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..511bdf2cafda 100644 --- a/kernel/timeconst.bc +++ b/kernel/time/timeconst.bc diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..4687b3104bae --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,112 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. 
+ */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. 
+ */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6aaedb8..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,63 +32,200 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) -static struct timekeeper timekeeper; +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned; + static DEFINE_RAW_SPINLOCK(timekeeper_lock); -static seqcount_t timekeeper_seq; static struct timekeeper shadow_timekeeper; +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. 
+ */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; +static struct tk_fast tk_fast_raw ____cacheline_aligned; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -/* Flag for if there is a persistent clock on this platform */ -bool __read_mostly persistent_clock_exist = false; - static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { - tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } } -static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } -static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } -static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { - struct timespec tmp; + struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ - set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + set_normalized_timespec64(&tmp, 
-tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); - WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); tk->wall_to_monotonic = wtm; - set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) +{ + tk->offs_boot = ktime_add(tk->offs_boot, delta); +} + +#ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ +/* + * These simple flag variables are managed + * without locks, which is racy, but ok since + * we don't really care about being super + * precise about how many events were seen, + * just that a problem was observed. 
+ */ +static int timekeeping_underflow_seen; +static int timekeeping_overflow_seen; + +/* last_warning is only modified under the timekeeping lock */ +static long timekeeping_last_warning; + +static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ + + cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } + + if (timekeeping_underflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_underflow_seen = 0; + } + + if (timekeeping_overflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_overflow_seen = 0; + } +} + +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { - /* Verify consistency before modifying */ - 
WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + cycle_t now, last, mask, max, delta; + unsigned int seq; + + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tkr->read(tkr->clock); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + delta = clocksource_delta(now, last, mask); + + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & mask) < (mask >> 3))) { + timekeeping_underflow_seen = 1; + delta = 0; + } - tk->total_sleep_time = t; - tk->offs_boot = timespec_to_ktime(t); + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > max)) { + timekeeping_overflow_seen = 1; + delta = tkr->clock->max_cycles; + } + + return delta; +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ } +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} +#endif /** * tk_setup_internals - Set up internals to use clocksource clock. 
@@ -107,9 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->clock; - tk->clock = clock; - tk->cycle_last = clock->cycle_last = clock->read(clock); + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.read = clock->read; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + + tk->tkr_raw.clock = clock; + tk->tkr_raw.read = clock->read; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -133,77 +277,236 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->xtime_nsec >>= -shift_change; + tk->tkr_mono.xtime_nsec >>= -shift_change; else - tk->xtime_nsec <<= shift_change; + tk->tkr_mono.xtime_nsec <<= shift_change; } - tk->shift = clock->shift; + tk->tkr_raw.xtime_nsec = 0; + + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->mult = clock->mult; + tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; + tk->ntp_err_mult = 0; } /* Timekeeper helper functions. 
*/ #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 (*arch_gettimeoffset)(void); - -u32 get_arch_timeoffset(void) -{ - if (likely(arch_gettimeoffset)) - return arch_gettimeoffset(); - return 0; -} +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; #else -static inline u32 get_arch_timeoffset(void) { return 0; } +static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct timekeeper *tk) +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + cycle_t delta; s64 nsec; - /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + delta = timekeeping_get_delta(tkr); - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - nsec = cycle_delta * tk->mult + tk->xtime_nsec; - nsec >>= tk->shift; + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tkr: Timekeeping readout base from which we take the update + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. 
The + * update side does: + * + * smp_wmb(); <- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[0], tkr); + * smp_wmb(); <- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[1], tkr); + * + * The reader side does: + * + * do { + * seq = tkf->seq; + * smp_rmb(); + * idx = seq & 0x01; + * now = now(tkf->base[idx]); + * smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; - s64 nsec; + struct tk_read_base *base = tkf->base; - /* read clocksource: */ - clock = tk->clock; - cycle_now = clock->read(clock); + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tkf->seq); - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + /* Update base[0] */ + memcpy(base, tkr, sizeof(*base)); - /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tkf->seq); - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + get_arch_timeoffset(); + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); +} + +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. 
+ * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + +/* Suspend-time cycles value for halted fast timekeeper. */ +static cycle_t cycles_at_suspend; + +static cycle_t dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +/** + * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. + * @tk: Timekeeper to snapshot. + * + * It generally is unsafe to access the clocksource after timekeeping has been + * suspended, so take a snapshot of the readout base of @tk and use it as the + * fast timekeeper's readout base while suspended. 
It will return the same + * number of cycles every time until timekeeping is resumed at which time the + * proper readout base for the fast timekeeper will be restored automatically. + */ +static void halt_fast_timekeeper(struct timekeeper *tk) +{ + static struct tk_read_base tkr_dummy; + struct tk_read_base *tkr = &tk->tkr_mono; + + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + cycles_at_suspend = tkr->read(tkr->clock); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); +} + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ + struct timespec xt, wm; + + xt = timespec64_to_timespec(tk_xtime(tk)); + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, + tk->tkr_mono.cycle_last); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. 
+ */ + remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; } +#else +#define old_vsyscall_fixup(tk) +#endif static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -217,7 +520,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; int ret; @@ -247,6 +550,39 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + u64 seconds; + u32 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + + /* Update the monotonic raw base */ + tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. 
+ */ + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; +} + /* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { @@ -254,11 +590,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } + + tk_update_ktime_data(tk); + update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); + + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** @@ -270,49 +613,49 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; + struct clocksource *clock = tk->tkr_mono.clock; + cycle_t cycle_now, delta; s64 nsec; - clock = tk->clock; - cycle_now = clock->read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - tk->cycle_last = clock->cycle_last = cycle_now; + cycle_now = tk->tkr_mono.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; - tk->xtime_nsec += cycle_delta * tk->mult; + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&tk->raw_time, nsec); + nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); + timespec64_add_ns(&tk->raw_time, nsec); } /** 
- * __getnstimeofday - Returns the time of day in a timespec. + * __getnstimeofday64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Updates the time of day in the timespec. * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -int __getnstimeofday(struct timespec *ts) +int __getnstimeofday64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs = 0; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(&tk->tkr_mono); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); + timespec64_add_ns(ts, nsecs); /* * Do not bail out early, in case there were callers still using @@ -322,116 +665,186 @@ int __getnstimeofday(struct timespec *ts) return -EAGAIN; return 0; } -EXPORT_SYMBOL(__getnstimeofday); +EXPORT_SYMBOL(__getnstimeofday64); /** - * getnstimeofday - Returns the time of day in a timespec. - * @ts: pointer to the timespec to be set + * getnstimeofday64 - Returns the time of day in a timespec64. + * @ts: pointer to the timespec64 to be set * - * Returns the time of day in a timespec (WARN if suspended). + * Returns the time of day in a timespec64 (WARN if suspended). 
*/ -void getnstimeofday(struct timespec *ts) +void getnstimeofday64(struct timespec64 *ts) { - WARN_ON(__getnstimeofday(ts)); + WARN_ON(__getnstimeofday64(ts)); } -EXPORT_SYMBOL(getnstimeofday); +EXPORT_SYMBOL(getnstimeofday64); ktime_t ktime_get(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - s64 secs, nsecs; + ktime_t base; + s64 nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); - secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); - } while (read_seqcount_retry(&timekeeper_seq, seq)); - /* - * Use ktime_set/ktime_add_ns to create a proper ktime on - * 32-bit architectures without CONFIG_KTIME_SCALAR. - */ - return ktime_add_ns(ktime_set(secs, 0), nsecs); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); + +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); + /** - * ktime_get_ts - get the monotonic clock in timespec format + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. 
+ * @offs: which offset to use + */ +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) +{ + ktime_t *offset = offsets[offs]; + unsigned long seq; + ktime_t tconv; + + do { + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); + +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); + +/** + * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. + * in normalized timespec64 format in the variable pointed to by @ts. 
*/ -void ktime_get_ts(struct timespec *ts) +void ktime_get_ts64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; - struct timespec tomono; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec); + timespec64_add_ns(ts, nsec + tomono.tv_nsec); } -EXPORT_SYMBOL_GPL(ktime_get_ts); - +EXPORT_SYMBOL_GPL(ktime_get_ts64); /** - * timekeeping_clocktai - Returns the TAI time of day in a timespec - * @ts: pointer to the timespec to be set + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * - * Returns the time of day in a timespec. + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. 
*/ -void timekeeping_clocktai(struct timespec *ts) +time64_t ktime_get_seconds(void) { - struct timekeeper *tk = &timekeeper; - unsigned long seq; - u64 nsecs; + struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&timekeeper_seq); - - ts->tv_sec = tk->xtime_sec + tk->tai_offset; - nsecs = timekeeping_get_ns(tk); - - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - ts->tv_nsec = 0; - timespec_add_ns(ts, nsecs); - + return tk->ktime_sec; } -EXPORT_SYMBOL(timekeeping_clocktai); - +EXPORT_SYMBOL_GPL(ktime_get_seconds); /** - * ktime_get_clocktai - Returns the TAI time of day in a ktime + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * - * Returns the time of day in a ktime. + * Returns the wall clock seconds since 1970. This replaces the + * get_seconds() interface which is not y2038 safe on 32bit systems. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. 
*/ -ktime_t ktime_get_clocktai(void) +time64_t ktime_get_real_seconds(void) { - struct timespec ts; + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); - timekeeping_clocktai(&ts); - return timespec_to_ktime(ts); + return seconds; } -EXPORT_SYMBOL(ktime_get_clocktai); +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); #ifdef CONFIG_NTP_PPS @@ -446,23 +859,23 @@ EXPORT_SYMBOL(ktime_get_clocktai); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = tk->raw_time; + *ts_raw = timespec64_to_timespec(tk->raw_time); ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(tk); + nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); + nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -479,45 +892,45 @@ EXPORT_SYMBOL(getnstime_raw_and_real); */ void do_gettimeofday(struct timeval *tv) { - struct timespec now; + struct timespec64 now; - getnstimeofday(&now); + getnstimeofday64(&now); tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } EXPORT_SYMBOL(do_gettimeofday); /** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time + * do_settimeofday64 - Sets the time of day. 
+ * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(const struct timespec *tv) +int do_settimeofday64(const struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; - struct timespec ts_delta, xt; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt; unsigned long flags; - if (!timespec_valid_strict(tv)) + if (!timespec64_valid_strict(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; + ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(tk, tv); + tk_set_xtime(tk, ts); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -525,7 +938,7 @@ int do_settimeofday(const struct timespec *tv) return 0; } -EXPORT_SYMBOL(do_settimeofday); +EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. 
@@ -535,33 +948,35 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec tmp; + struct timespec64 ts64, tmp; int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; + ts64 = timespec_to_timespec64(*ts); + raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid_strict(&tmp)) { + tmp = timespec64_add(tk_xtime(tk), ts64); + if (!timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, ts); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); + tk_xtime_add(tk, &ts64); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ @@ -578,14 +993,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ s32 timekeeping_get_tai_offset(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; s32 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); ret = tk->tai_offset; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -606,14 +1021,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) */ void timekeeping_set_tai_offset(s32 tai_offset) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long 
flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __timekeeping_set_tai_offset(tk, tai_offset); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); } @@ -625,14 +1040,14 @@ void timekeeping_set_tai_offset(s32 tai_offset) */ static int change_clocksource(void *data) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* @@ -641,7 +1056,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->clock; + old = tk->tkr_mono.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -652,7 +1067,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; @@ -667,68 +1082,56 @@ static int change_clocksource(void *data) */ int timekeeping_notify(struct clocksource *clock) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - if (tk->clock == clock) + if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->clock == clock ? 
0 : -1; -} - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); + return tk->tkr_mono.clock == clock ? 0 : -1; } -EXPORT_SYMBOL_GPL(ktime_get_real); /** - * getrawmonotonic - Returns the raw monotonic time in a timespec - * @ts: pointer to the timespec to be set + * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic(struct timespec *ts) +void getrawmonotonic64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts64; unsigned long seq; s64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); - nsecs = timekeeping_get_ns_raw(tk); - *ts = tk->raw_time; + seq = read_seqcount_begin(&tk_core.seq); + nsecs = timekeeping_get_ns(&tk->tkr_raw); + ts64 = tk->raw_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts, nsecs); + timespec64_add_ns(&ts64, nsecs); + *ts = ts64; } -EXPORT_SYMBOL(getrawmonotonic); +EXPORT_SYMBOL(getrawmonotonic64); + /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; int ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -738,16 +1141,16 @@ int timekeeping_valid_for_hres(void) */ u64 
timekeeping_max_deferment(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - ret = tk->clock->max_idle_ns; + ret = tk->tkr_mono.clock->max_idle_ns; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } @@ -767,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_persistent_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_persistent_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + /** * read_boot_clock - Return time of the system start. * @@ -782,28 +1193,41 @@ void __weak read_boot_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_boot_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_boot_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + +/* Flag for if timekeeping_resume() has injected sleeptime */ +static bool sleeptime_injected; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot, tmp; - - read_persistent_clock(&now); + struct timespec64 now, boot, tmp; - if (!timespec_valid_strict(&now)) { + read_persistent_clock64(&now); + if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; now.tv_nsec = 0; } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; - read_boot_clock(&boot); - if (!timespec_valid_strict(&boot)) { + read_boot_clock64(&boot); + 
if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -811,7 +1235,7 @@ void __init timekeeping_init(void) } raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); ntp_init(); clock = clocksource_default_clock(); @@ -825,21 +1249,17 @@ void __init timekeeping_init(void) if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); - set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - tmp.tv_sec = 0; - tmp.tv_nsec = 0; - tk_set_sleep_time(tk, tmp); - - memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -/* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; +/* time in seconds when suspend began for persistent clock */ +static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval @@ -849,44 +1269,74 @@ static struct timespec timekeeping_suspend_time; * adds the sleep offset to the timekeeping variables. 
*/ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - struct timespec *delta) + struct timespec64 *delta) { - if (!timespec_valid_strict(delta)) { + if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); - tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /** - * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values - * @delta: pointer to a timespec delta value + * We have three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return sleeptime_injected; +} + +/** + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). * - * This hook is for architectures that cannot support read_persistent_clock + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. 
+ */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + +/** + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value + * + * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ -void timekeeping_inject_sleeptime(struct timespec *delta) +void timekeeping_inject_sleeptime64(struct timespec64 *delta) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - /* - * Make sure we don't set the clock twice, as timekeeping_resume() - * already did it - */ - if (has_persistent_clock()) - return; - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); @@ -894,36 +1344,33 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); } +#endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. 
*/ -static void timekeeping_resume(void) +void timekeeping_resume(void) { - struct timekeeper *tk = &timekeeper; - struct clocksource *clock = tk->clock; + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; - struct timespec ts_new, ts_delta; + struct timespec64 ts_new, ts_delta; cycle_t cycle_now, cycle_delta; - bool suspendtime_found = false; - read_persistent_clock(&ts_new); + sleeptime_injected = false; + read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and @@ -937,15 +1384,16 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = clock->read(clock); + cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > clock->cycle_last) { + cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -960,40 +1408,40 @@ static void timekeeping_resume(void) } nsec += ((u64) cycle_delta * mult) >> shift; - ts_delta = ns_to_timespec(nsec); - suspendtime_found = true; - } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { - ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); - suspendtime_found = true; + ts_delta = ns_to_timespec64(nsec); + sleeptime_injected = true; + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, 
timekeeping_suspend_time); + sleeptime_injected = true; } - if (suspendtime_found) + if (sleeptime_injected) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->cycle_last = clock->cycle_last = cycle_now; + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ + tick_resume(); hrtimers_resume(); } -static int timekeeping_suspend(void) +int timekeeping_suspend(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec delta, delta_delta; - static struct timespec old_delta; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; - read_persistent_clock(&timekeeping_suspend_time); + read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at @@ -1001,38 +1449,41 @@ static int timekeeping_suspend(void) * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; - /* - * To avoid drift caused by repeated suspend/resumes, - * which each can add ~1 second drift error, - * try to compensate so the difference in system time - * and persistent_clock time stays close to constant. 
- */ - delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec_sub(delta, old_delta); - if (abs(delta_delta.tv_sec) >= 2) { + if (persistent_clock_exists) { /* - * if delta_delta is too large, assume time correction - * has occured and set old_delta to the current delta. + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. */ - old_delta = delta; - } else { - /* Otherwise try to adjust old_system to compensate */ - timekeeping_suspend_time = - timespec_add(timekeeping_suspend_time, delta_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } } timekeeping_update(tk, TK_MIRROR); - write_seqcount_end(&timekeeper_seq); + halt_fast_timekeeper(tk); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + tick_suspend(); clocksource_suspend(); clockevents_suspend(); @@ -1050,125 +1501,34 @@ static int __init timekeeping_init_ops(void) register_syscore_ops(&timekeeping_syscore_ops); return 0; } - device_initcall(timekeeping_init_ops); /* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. 
+ * Apply a multiplier adjustment to the timekeeper */ -static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, - s64 error, s64 *interval, - s64 *offset) +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + bool negative, + int adj_scale) { - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; + s64 interval = tk->cycle_interval; + s32 mult_adj = 1; - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); - tick_error -= tk->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; + if (negative) { + mult_adj = -mult_adj; + interval = -interval; + offset = -offset; } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. 
- */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - s64 error, interval = tk->cycle_interval; - int adj; + mult_adj <<= adj_scale; + interval <<= adj_scale; + offset <<= adj_scale; /* - * The point of this is to check if the error is greater than half - * an interval. - * - * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. - * - * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is - * larger than half an interval. - * - * Note: It does not "save" on aggravation when reading the code. - */ - error = tk->ntp_error >> (tk->ntp_error_shift - 1); - if (error > interval) { - /* - * We now divide error by 4(via shift), which checks if - * the error is greater than twice the interval. - * If it is greater, we need a bigadjust, if its smaller, - * we can adjust by 1. - */ - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } else { - if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else { - adj = timekeeping_bigadjust(tk, error, &interval, &offset); - } - } else { - goto out_adjust; - } - } - - if (unlikely(tk->clock->maxadj && - (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_deferred_once(KERN_WARNING - "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->clock->name, (long)tk->mult + adj, - (long)tk->clock->mult + tk->clock->maxadj); - } - /* * So the following can be confusing. * - * To keep things simple, lets assume adj == 1 for now. + * To keep things simple, lets assume mult_adj == 1 for now. 
* - * When adj != 1, remember that the interval and offset values + * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier @@ -1212,12 +1572,85 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - tk->mult += adj; + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + + tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; - tk->xtime_nsec -= offset; + tk->tkr_mono.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, + s64 offset) +{ + s64 interval = tk->cycle_interval; + s64 xinterval = tk->xtime_interval; + s64 tick_error; + bool negative; + u32 adj; + + /* Remove any current error adj from freq calculation */ + if (tk->ntp_err_mult) + xinterval -= tk->cycle_interval; + + tk->ntp_tick = ntp_tick_length(); + + /* Calculate current error per tick */ + tick_error = ntp_tick_length() >> tk->ntp_error_shift; + tick_error -= (xinterval + tk->xtime_remainder); + + /* Don't worry about correcting it if its small */ + if (likely((tick_error >= 0) && (tick_error <= interval))) + return; + + /* preserve the direction of correction */ + negative = (tick_error < 0); + + /* Sort out the magnitude of the correction */ + tick_error = abs(tick_error); + for (adj = 0; tick_error > interval; adj++) + tick_error >>= 1; + + /* scale the corrections */ + timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
+ */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + /* Correct for the current frequency error */ + timekeeping_freqadjust(tk, offset); + + /* Next make a small adjustment to fix any cumulative error */ + if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { + tk->ntp_err_mult = 1; + timekeeping_apply_adjustment(tk, offset, 0, 0); + } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { + /* Undo any existing error adjustment */ + timekeeping_apply_adjustment(tk, offset, 1, 0); + tk->ntp_err_mult = 0; + } + + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); + } -out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1232,12 +1665,11 @@ out_adjust: * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
*/ - if (unlikely((s64)tk->xtime_nsec < 0)) { - s64 neg = -(s64)tk->xtime_nsec; - tk->xtime_nsec = 0; + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr_mono.xtime_nsec; + tk->tkr_mono.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } - } /** @@ -1250,26 +1682,26 @@ out_adjust: */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; - while (tk->xtime_nsec >= nsecps) { + while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; - tk->xtime_nsec -= nsecps; + tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { - struct timespec ts; + struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, - timespec_sub(tk->wall_to_monotonic, ts)); + timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); @@ -1301,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->cycle_last += interval; + tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; - tk->xtime_nsec += tk->xtime_interval << shift; + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1317,48 +1750,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -static inline void 
old_vsyscall_fixup(struct timekeeper *tk) -{ - s64 remainder; - - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD - * users are removed, this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; -} -#else -#define old_vsyscall_fixup(tk) -#endif - - - /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { - struct clocksource *clock; - struct timekeeper *real_tk = &timekeeper; + struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; @@ -1371,18 +1776,20 @@ void update_wall_time(void) if (unlikely(timekeeping_suspended)) goto out; - clock = real_tk->clock; - #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval) goto out; + /* Do some additional sanity checking */ + timekeeping_check_update(real_tk, offset); + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. 
To do this efficiently, @@ -1418,9 +1825,7 @@ void update_wall_time(void) */ clock_set |= accumulate_nsecs_to_secs(tk); - write_seqcount_begin(&timekeeper_seq); - /* Update clock->cycle_last with the new value */ - clock->cycle_last = tk->cycle_last; + write_seqcount_begin(&tk_core.seq); /* * Update the real timekeeper. * @@ -1428,12 +1833,12 @@ void update_wall_time(void) * requires changes to all other timekeeper usage sites as * well, i.e. move the timekeeper pointer getter into the * spinlocked/seqcount protected sections. And we trade this - * memcpy under the timekeeper_seq against one before we start + * memcpy under the tk_core.seq against one before we start * updating. */ memcpy(real_tk, tk, sizeof(*tk)); timekeeping_update(real_tk, clock_set); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) @@ -1442,95 +1847,28 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). 
*/ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { - struct timekeeper *tk = &timekeeper; - struct timespec boottime = { - .tv_sec = tk->wall_to_monotonic.tv_sec + - tk->total_sleep_time.tv_sec, - .tv_nsec = tk->wall_to_monotonic.tv_nsec + - tk->total_sleep_time.tv_nsec - }; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); - -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts: pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. - * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. - */ -void get_monotonic_boottime(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; - struct timespec tomono, sleep; - s64 nsec; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&timekeeper_seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(tk); - tomono = tk->wall_to_monotonic; - sleep = tk->total_sleep_time; - - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - ts->tv_sec += tomono.tv_sec + sleep.tv_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - -/** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ - struct timespec ts; - - get_monotonic_boottime(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. 
- * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - struct timekeeper *tk = &timekeeper; - - *ts = timespec_add(*ts, tk->total_sleep_time); -} -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } @@ -1538,42 +1876,43 @@ EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; - return tk_xtime(tk); + return timespec64_to_timespec(tk_xtime(tk)); } struct timespec current_kernel_time(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - return now; + return timespec64_to_timespec(now); } EXPORT_SYMBOL(current_kernel_time); -struct timespec get_monotonic_coarse(void) +struct timespec64 get_monotonic_coarse64(void) { - struct timekeeper *tk = &timekeeper; - struct timespec now, mono; + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; unsigned long seq; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); + return now; } @@ -1587,29 +1926,38 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, 
wall_to_monotonic, - * and sleep offsets. - * @xtim: pointer to timespec to be set with xtime - * @wtom: pointer to timespec to be set with wall_to_monotonic - * @sleep: pointer to timespec to be set with time in suspend + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, - struct timespec *wtom, struct timespec *sleep) +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - unsigned long seq; + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); - *xtim = tk_xtime(tk); - *wtom = tk->wall_to_monotonic; - *sleep = tk->total_sleep_time; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr_mono.base; + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); } #ifdef CONFIG_HIGH_RES_TIMERS /** - * ktime_get_update_offsets - hrtimer helper + * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset @@ -1617,57 +1965,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t 
ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { - struct timekeeper *tk = &timekeeper; - ktime_t now; + struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; - u64 secs, nsecs; + ktime_t base; + u64 nsecs; do { - seq = read_seqcount_begin(&timekeeper_seq); + seq = read_seqcount_begin(&tk_core.seq); - secs = tk->xtime_sec; - nsecs = timekeeping_get_ns(tk); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&timekeeper_seq, seq)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - now = ktime_add_ns(ktime_set(secs, 0), nsecs); - now = ktime_sub(now, *offs_real); - return now; + return ktime_add_ns(base, nsecs); } #endif /** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ - struct timekeeper *tk = &timekeeper; - unsigned long seq; - struct timespec wtom; - - do { - seq = read_seqcount_begin(&timekeeper_seq); - wtom = tk->wall_to_monotonic; - } while (read_seqcount_retry(&timekeeper_seq, seq)); - - return timespec_to_ktime(wtom); -} -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - -/** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) { - struct timekeeper *tk = &timekeeper; + struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec ts; + struct timespec64 ts; s32 orig_tai, tai; int ret; @@ -1687,10 +2015,10 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday(&ts); + getnstimeofday64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); @@ -1699,7 +2027,7 @@ int do_adjtimex(struct timex *txc) 
__timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (tai != orig_tai) @@ -1719,11 +2047,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&timekeeper_seq); + write_seqcount_begin(&tk_core.seq); __hardpps(phase_ts, raw_ts); - write_seqcount_end(&timekeeper_seq); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000000..ead8794b9a4e --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,29 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97558df..f6bd65236712 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -67,7 +67,7 @@ static int __init 
tk_debug_sleep_time_init(void) } late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec *t) +void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; } diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea08ffa..4ea005a7f9da 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -3,12 +3,27 @@ /* * timekeeping debug functions */ +#include <linux/clocksource.h> #include <linux/time.h> #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec *t); +extern void tk_debug_account_sleep_time(struct timespec64 *t); #else #define tk_debug_account_sleep_time(x) #endif +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + cycle_t ret = (now - last) & mask; + + return (s64) ret > 0 ? ret : 0; +} +#else +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + return (now - last) & mask; +} +#endif + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/timer.c b/kernel/time/timer.c index 3bb01a323b2a..2ece3aa5069c 100644 --- a/kernel/timer.c +++ b/kernel/time/timer.c @@ -82,6 +82,7 @@ struct tvec_base { unsigned long next_timer; unsigned long active_timers; unsigned long all_timers; + int cpu; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -89,8 +90,18 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; +/* + * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've + * made NULL special, hint: lock_timer_base()) and we cannot get a compile time + * pointer to per-cpu entries because we don't know where we'll map the section, + * even for the boot cpu. + * + * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the + * rest of them. 
+ */ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); + static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ @@ -409,6 +420,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; } base->all_timers++; + + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); } #ifdef CONFIG_TIMER_STATS @@ -638,7 +665,7 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + struct tvec_base *base = raw_cpu_read(tvec_bases); timer->entry.next = NULL; timer->base = (void *)((unsigned long)base | flags); @@ -948,22 +975,6 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_set_base(timer, base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); - /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. 
- * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. - */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) - wake_up_nohz_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1026,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); + /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1376,15 +1389,14 @@ unsigned long get_next_timer_interrupt(unsigned long now) void update_process_times(int user_tick) { struct task_struct *p = current; - int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) - irq_work_run(); + irq_work_tick(); #endif scheduler_tick(); run_posix_cpu_timers(p); @@ -1532,63 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kzalloc_node(sizeof(*base), GFP_KERNEL, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ - if (WARN_ON(base != tbase_get_base(base))) { - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * 
initialised either. - */ - boot_done = 1; - base = &boot_tvec_bases; - } - spin_lock_init(&base->lock); - tvec_base_done[cpu] = 1; - } else { - base = per_cpu(tvec_bases, cpu); - } - - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; - base->active_timers = 0; - base->all_timers = 0; - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) { @@ -1630,55 +1585,86 @@ static void migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } + old_base->active_timers = 0; + old_base->all_timers = 0; + spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } -#endif /* CONFIG_HOTPLUG_CPU */ static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - long cpu = (long)hcpu; - int err; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = init_timers_cpu(cpu); - if (err < 0) - return notifier_from_errno(err); - break; -#ifdef CONFIG_HOTPLUG_CPU + switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: - migrate_timers(cpu); + migrate_timers((long)hcpu); break; -#endif default: break; } + return NOTIFY_OK; } -static struct notifier_block timers_nb = { - .notifier_call = timer_cpu_notify, -}; +static inline void timer_register_cpu_notifier(void) +{ + cpu_notifier(timer_cpu_notify, 0); +} +#else +static inline void timer_register_cpu_notifier(void) { } +#endif /* CONFIG_HOTPLUG_CPU */ +static void __init init_timer_cpu(struct tvec_base *base, int cpu) +{ + int j; -void __init init_timers(void) + BUG_ON(base != tbase_get_base(base)); + + base->cpu = cpu; + per_cpu(tvec_bases, cpu) = base; + spin_lock_init(&base->lock); 
+ + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; +} + +static void __init init_timer_cpus(void) { - int err; + struct tvec_base *base; + int local_cpu = smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == local_cpu) + base = &boot_tvec_bases; +#ifdef CONFIG_SMP + else + base = per_cpu_ptr(&__tvec_bases, cpu); +#endif + init_timer_cpu(base, cpu); + } +} + +void __init init_timers(void) +{ /* ensure there are enough low bits for flags in timer->base pointer */ BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - BUG_ON(err != NOTIFY_OK); - + init_timer_cpus(); init_timer_stats(); - register_cpu_notifier(&timers_nb); + timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,10 +16,10 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> -#include <linux/tick.h> #include <asm/uaccess.h> +#include "tick-internal.h" struct timer_list_iter { int cpu; @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); + if (dev->set_mode) { + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + } else { + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, 
dev->set_state_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); + } + } SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511cca48..dd70993c266c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. */ -char *torture_onoff_stats(char *page) +void torture_onoff_stats(void) { #ifdef CONFIG_HOTPLUG_CPU - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return page; } EXPORT_SYMBOL_GPL(torture_onoff_stats); @@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); * * This must be called before the caller starts shutting down its own * kthreads. + * + * Both torture_cleanup_begin() and torture_cleanup_end() must be paired, + * in order to correctly perform the cleanup. They are separated because + * threads can still need to reference the torture_type type, thus nullify + * only after completing all other relevant calls. 
*/ -bool torture_cleanup(void) +bool torture_cleanup_begin(void) { mutex_lock(&fullstop_mutex); if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { @@ -651,12 +654,17 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup_begin); + +void torture_cleanup_end(void) +{ mutex_lock(&fullstop_mutex); torture_type = NULL; mutex_unlock(&fullstop_mutex); - return false; } -EXPORT_SYMBOL_GPL(torture_cleanup); +EXPORT_SYMBOL_GPL(torture_cleanup_end); /* * Is it time for the current torture test to stop? @@ -708,7 +716,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, s); + *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); VERBOSE_TOROUT_ERRSTRING(f); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4409356f40d..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_TRACE_MCOUNT_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help @@ -437,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + config PROBE_EVENTS def_bool n @@ -604,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config TRACE_ENUM_MAP_FILE + bool "Show enum mappings for trace events" + depends on TRACING + help + The "print fmt" of the trace events will show the enum names instead + of their values. 
This can cause problems for user space tools that + use this string to parse the raw data as user space does not know + how to convert the string to its value. + + To fix this, there's a special macro in the kernel that can be used + to convert the enum into its value. If this macro is used, then the + print fmt strings will have the enums converted to their values. + + If something does not get converted properly, this option can be + used to show what enums the kernel tried to convert. + + This option is for debugging the enum conversions. A file is created + in the tracing directory called "enum_map" that will show the enum + names matched with their values and what trace event system they + belong too. + + Normally, the mapping of the strings to values will be freed after + boot up or module load. With this option, they will not be freed, as + they are needed for the "enum_map" file. Enabling this option will + increase the memory footprint of the running kernel. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2611613f14f1..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,11 +3,11 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation -CFLAGS_trace_selftest_dynamic.o = -pg +CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) obj-y += trace_selftest_dynamic.o endif endif @@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o @@ -52,9 +53,10 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif 
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o -ifeq ($(CONFIG_PM_RUNTIME),y) +ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o endif ifeq ($(CONFIG_TRACING),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ada2a04..483cecfa5c17 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); -static int blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, iter->ent->pid, act, rwbs); + trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static int blk_log_dump_pdu(struct 
trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { const unsigned char *pdu_buf; int pdu_len; - int i, end, ret; + int i, end; pdu_buf = pdu_start(ent); pdu_len = te_blk_io_trace(ent)->pdu_len; if (!pdu_len) - return 1; + return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) @@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) break; end++; - if (!trace_seq_putc(s, '(')) - return 0; + trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { - ret = trace_seq_printf(s, "%s%02x", - i == 0 ? "" : " ", pdu_buf[i]); - if (!ret) - return ret; + trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); /* * stop when the rest is just zeroes and indicate so * with a ".." appended */ - if (i == end && end != pdu_len - 1) - return trace_seq_puts(s, " ..) "); + if (i == end && end != pdu_len - 1) { + trace_seq_puts(s, " ..) 
"); + return; + } } - return trace_seq_puts(s, ") "); + trace_seq_puts(s, ") "); } -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = trace_seq_printf(s, "%u ", t_bytes(ent)); - if (!ret) - return 0; - ret = blk_log_dump_pdu(s, ent); - if (!ret) - return 0; - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "%u ", t_bytes(ent)); + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", + trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + else + trace_seq_printf(s, "[%s]\n", cmd); } } -static int blk_log_with_error(struct trace_seq *s, +static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = blk_log_dump_pdu(s, ent); - if (ret) - return trace_seq_printf(s, "[%d]\n", t_error(ent)); - return 0; + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", - t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", - t_sector(ent), t_error(ent)); + trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + else + trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); } } -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), t_sec(ent), - MAJOR(r.device_from), 
MINOR(r.device_from), - (unsigned long long)r.sector_from); + trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), cmd); } -static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) { - int ret; const struct blk_io_trace *t = te_blk_io_trace(ent); - ret = trace_seq_putmem(s, t + 1, t->pdu_len); - if (ret) - return trace_seq_putc(s, '\n'); - return ret; + trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putc(s, '\n'); } /* @@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, 
blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; - int ret; bool long_act; blk_log_action_t *log_action; @@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { - ret = log_action(iter, long_act ? "message" : "m"); - if (ret) - ret = blk_log_msg(s, iter->ent); - goto out; + log_action(iter, long_act ? "message" : "m"); + blk_log_msg(s, iter->ent); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Unknown action %x\n", what); + trace_seq_printf(s, "Unknown action %x\n", what); else { - ret = log_action(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act]); + what2act[what].print(s, iter->ent); } -out: - return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, @@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, return print_one_line(iter, false); } -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; @@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) .time = iter->ts, }; - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); + trace_seq_putmem(s, &old, offset); + trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { - return blk_trace_synthesize_old_trace(iter) ? 
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + blk_trace_synthesize_old_trace(iter); + + return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) @@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,222 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. 
+ * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * 
bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? 
(long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac9d1dad630b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,7 +18,7 @@ #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/suspend.h> 
-#include <linux/debugfs.h> +#include <linux/tracefs.h> #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> @@ -65,24 +65,27 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE -#define INIT_REGEX_LOCK(opsname) \ - .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else -#define INIT_REGEX_LOCK(opsname) +#define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) }; /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* Quick disabling of function tracer. 
*/ -int function_trace_stop __read_mostly; - /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ @@ -110,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); + #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); @@ -143,7 +149,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { - mutex_init(&ops->regex_lock); + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif @@ -242,23 +249,42 @@ static void update_function_graph_func(void); static inline void update_function_graph_func(void) { } #endif + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + static void update_ftrace_function(void) { ftrace_func_t func; /* + * Prepare the ftrace_ops that the arch callback will use. + * If there's only one ftrace_ops registered, the ftrace_ops_list + * will point to the ops we want. 
+ */ + set_function_trace_op = ftrace_ops_list; + + /* If there's no ftrace_ops registered, just call the stub function */ + if (ftrace_ops_list == &ftrace_list_end) { + func = ftrace_stub; + + /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && - !FTRACE_FORCE_LIST_FUNC)) { - /* Set the ftrace_ops that the arch callback uses */ - set_function_trace_op = ftrace_ops_list; - func = ftrace_ops_list->func; + } else if (ftrace_ops_list->next == &ftrace_list_end) { + func = ftrace_ops_get_list_func(ftrace_ops_list); + } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; @@ -374,6 +400,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, return ret; } +static void ftrace_update_trampoline(struct ftrace_ops *ops); + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_DELETED) @@ -403,9 +431,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); + /* The control_ops needs the trampoline update */ + ops = &control_ops; } else add_ftrace_ops(&ftrace_ops_list, ops); + ftrace_update_trampoline(ops); + if (ftrace_enabled) update_ftrace_function(); @@ -552,13 +584,13 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function " - "Hit Time Avg s^2\n" - " -------- " - "--- ---- --- ---\n"); + seq_puts(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); #else - seq_printf(m, " Function Hit\n" - " 
-------- ---\n"); + seq_puts(m, " Function Hit\n" + " -------- ---\n"); #endif return 0; } @@ -585,7 +617,7 @@ static int function_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " "); + seq_puts(m, " "); avg = rec->time; do_div(avg, rec->counter); @@ -902,7 +934,7 @@ static void unregister_ftrace_profiler(void) static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(ftrace_profile_ops) + INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -989,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -1025,23 +1057,37 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } } - entry = debugfs_create_file("function_profile_enabled", 0644, + entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ static struct pid * const ftrace_swapper_pid = &init_struct_pid; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_graph_active; +#else +# define ftrace_graph_active 0 +#endif + #ifdef CONFIG_DYNAMIC_FTRACE +static struct ftrace_ops *removed_ops; + +/* + * Set when doing a global update, like enabling all recs or disabling them. 
+ * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -1082,13 +1128,51 @@ static const struct ftrace_hash empty_hash = { #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) static struct ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, }; +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + struct ftrace_ops *op; + bool ret = false; + + /* + * Some of the ops may be dynamically allocated, + * they are freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This is to check for dynamically allocated trampolines. + * Trampolines that are in kernel text will have + * core_kernel_text() return true. 
+ */ + if (op->trampoline && op->trampoline_size) + if (addr >= op->trampoline && + addr < op->trampoline + op->trampoline_size) { + ret = true; + goto out; + } + } while_for_each_ftrace_op(op); + + out: + preempt_enable_notrace(); + + return ret; +} + struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; @@ -1227,8 +1311,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); - free_ftrace_hash(ops->filter_hash); - free_ftrace_hash(ops->notrace_hash); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); } static struct ftrace_hash *alloc_ftrace_hash(int size_bits) @@ -1289,9 +1373,12 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) } static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash); static int ftrace_hash_move(struct ftrace_ops *ops, int enable, @@ -1300,29 +1387,23 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_func_entry *entry; struct hlist_node *tn; struct hlist_head *hhd; - struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; int size = src->count; int bits = 0; int ret; int i; - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(ops, enable); + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; /* * If the new source is empty, just free dst and assign it * the empty_hash. 
*/ if (!src->count) { - free_ftrace_hash_rcu(*dst); - rcu_assign_pointer(*dst, EMPTY_HASH); - /* still need to update the function records */ - ret = 0; - goto out; + new_hash = EMPTY_HASH; + goto update; } /* @@ -1335,10 +1416,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; - ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - goto out; + return -ENOMEM; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1349,20 +1429,43 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } - old_hash = *dst; - rcu_assign_pointer(*dst, new_hash); - free_ftrace_hash_rcu(old_hash); +update: + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ + if (enable) { + /* IPMODIFY should be updated only when filter_hash updating */ + ret = ftrace_hash_ipmodify_update(ops, new_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return ret; + } + } - ret = 0; - out: /* - * Enable regardless of ret: - * On success, we enable the new hash. - * On failure, we re-enable the original hash. + * Remove the current set, update the hash and add + * them back. */ - ftrace_hash_rec_enable(ops, enable); + ftrace_hash_rec_disable_modify(ops, enable); - return ret; + rcu_assign_pointer(*dst, new_hash); + + ftrace_hash_rec_enable_modify(ops, enable); + + return 0; +} + +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an emty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. 
+ */ + return (ftrace_hash_empty(hash->filter_hash) || + ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -1380,8 +1483,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_ops_hash hash; int ret; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS @@ -1394,13 +1496,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); - if ((ftrace_hash_empty(filter_hash) || - ftrace_lookup_ip(filter_hash, ip)) && - (ftrace_hash_empty(notrace_hash) || - !ftrace_lookup_ip(notrace_hash, ip))) + if (hash_contains_ip(ip, &hash)) ret = 1; else ret = 0; @@ -1492,6 +1591,26 @@ int ftrace_text_reserved(const void *start, const void *end) return (int)!!ret; } +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + bool keep_regs = false; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(ops, rec->ip, rec)) { + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + keep_regs = true; + break; + } + } + } + + return keep_regs; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1519,14 +1638,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * gets inversed. 
*/ if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; /* * If the notrace hash has no items, * then there's nothing to do. @@ -1572,8 +1691,25 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return; + + /* + * If there's only a single callback registered to a + * function, and the ops has a trampoline registered + * for it, then we can call it directly. + */ + if (ftrace_rec_count(rec) == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + /* + * If we are adding another function callback + * to this function, and the previous had a + * custom trampoline in use, then we need to go + * back to the default trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + /* * If any ops wants regs saved for this function * then all ops will get saved regs. @@ -1581,9 +1717,38 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) rec->flags |= FTRACE_FL_REGS; } else { - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) return; rec->flags--; + + /* + * If the rec had REGS enabled and the ops that is + * being removed had REGS set, then see if there is + * still any ops for this record that wants regs. + * If not, we can stop recording them. 
+ */ + if (ftrace_rec_count(rec) > 0 && + rec->flags & FTRACE_FL_REGS && + ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + if (!test_rec_ops_needs_regs(rec)) + rec->flags &= ~FTRACE_FL_REGS; + } + + /* + * If the rec had TRAMP enabled, then it needs to + * be cleared. As TRAMP can only be enabled iff + * there is only a single ops attached to it. + * In otherwords, always disable it on decrementing. + * In the future, we may set it if rec count is + * decremented to one, and the ops that is left + * has a trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* + * flags will be cleared in ftrace_check_record() + * if rec count is zero. + */ } count++; /* Shortcut, if we handled all records, we are done. */ @@ -1604,6 +1769,149 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + +/* + * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK + * or no-needed to update, -EBUSY if it detects a conflict of the flag + * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. 
+ * Note that old_hash and new_hash has below meanings + * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) + * - If the hash is EMPTY_HASH, it hits nothing + * - Anything else hits the recs which match the hash entries. + */ +static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, + struct ftrace_hash *old_hash, + struct ftrace_hash *new_hash) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec, *end = NULL; + int in_old, in_new; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + /* + * Since the IPMODIFY is a very address sensitive action, we do not + * allow ftrace_ops to set all functions to new hash. + */ + if (!new_hash || !old_hash) + return -EINVAL; + + /* Update rec->flags */ + do_for_each_ftrace_rec(pg, rec) { + /* We need to update only differences of filter_hash */ + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) { + /* New entries must ensure no others are using it */ + if (rec->flags & FTRACE_FL_IPMODIFY) + goto rollback; + rec->flags |= FTRACE_FL_IPMODIFY; + } else /* Removed entry */ + rec->flags &= ~FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + + return 0; + +rollback: + end = rec; + + /* Roll back what we did above */ + do_for_each_ftrace_rec(pg, rec) { + if (rec == end) + goto err_out; + + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) + rec->flags &= ~FTRACE_FL_IPMODIFY; + else + rec->flags |= FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + +err_out: + return -EBUSY; +} + +static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + return 
__ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); +} + +/* Disabling always succeeds */ +static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); +} + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash) +{ + struct ftrace_hash *old_hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(old_hash)) + old_hash = NULL; + + if (ftrace_hash_empty(new_hash)) + new_hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1614,10 +1922,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); + /** * ftrace_bug - report and shutdown function tracer * @failed: The failed type (EFAULT, EINVAL, EPERM) - * @ip: The address that failed + * @rec: The record that failed * * The arch code that enables or disables the function tracing * can call ftrace_bug() when it has detected a problem in @@ -1626,8 +1937,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p) * EINVAL - if what is read at @ip is not what was expected * EPERM - if the problem happens on writting to the @ip address */ -void ftrace_bug(int failed, unsigned long ip) +void ftrace_bug(int failed, struct dyn_ftrace *rec) { + unsigned long ip = rec ? 
rec->ip : 0; + switch (failed) { case -EFAULT: FTRACE_WARN_ON_ONCE(1); @@ -1639,7 +1952,7 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1651,6 +1964,24 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + if (rec) { + struct ftrace_ops *ops = NULL; + + pr_info("ftrace record flags: %lx\n", rec->flags); + pr_cont(" (%ld)%s", ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + pr_cont("\ttramp: %pS", + (void *)ops->trampoline); + else + pr_cont("\ttramp: ERROR!"); + + } + ip = ftrace_get_addr_curr(rec); + pr_cont(" expected tramp: %lx\n", ip); + } } static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) @@ -1668,17 +1999,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && ftrace_rec_count(rec)) flag = FTRACE_FL_ENABLED; /* - * If enabling and the REGS flag does not match the REGS_EN, then - * do not ignore this record. Set flags to fail the compare against - * ENABLED. + * If enabling and the REGS flag does not match the REGS_EN, or + * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore + * this record. Set flags to fail the compare against ENABLED. 
*/ - if (flag && - (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) - flag |= FTRACE_FL_REGS; + if (flag) { + if (!(rec->flags & FTRACE_FL_REGS) != + !(rec->flags & FTRACE_FL_REGS_EN)) + flag |= FTRACE_FL_REGS; + + if (!(rec->flags & FTRACE_FL_TRAMP) != + !(rec->flags & FTRACE_FL_TRAMP_EN)) + flag |= FTRACE_FL_TRAMP; + } /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1696,6 +2033,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) else rec->flags &= ~FTRACE_FL_REGS_EN; } + if (flag & FTRACE_FL_TRAMP) { + if (rec->flags & FTRACE_FL_TRAMP) + rec->flags |= FTRACE_FL_TRAMP_EN; + else + rec->flags &= ~FTRACE_FL_TRAMP_EN; + } } /* @@ -1704,7 +2047,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert * from the save regs, to a non-save regs function or - * vice versa. + * vice versa, or from a trampoline call. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; @@ -1714,11 +2057,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (update) { /* If there's no more users, clear all flags */ - if (!(rec->flags & ~FTRACE_FL_MASK)) + if (!ftrace_rec_count(rec)) rec->flags = 0; else - /* Just disable the record (keep REGS state) */ - rec->flags &= ~FTRACE_FL_ENABLED; + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. 
+ */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN); } return FTRACE_UPDATE_MAKE_NOP; @@ -1751,6 +2098,109 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) + return removed_ops; + } + + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. 
+ */ + if (op->flags & FTRACE_OPS_FL_ADDING) + continue; + + + /* + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. + */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) + return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! + */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; + + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* pass rec in as regs to have non-NULL val */ + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -1763,6 +2213,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP) { + ops = ftrace_find_tramp_ops_new(rec); + if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS) return (unsigned long)FTRACE_REGS_ADDR; else @@ -1781,6 +2245,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { + struct ftrace_ops *ops; + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_curr(rec); + if 
(FTRACE_WARN_ON(!ops)) { + pr_warning("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + if (rec->flags & FTRACE_FL_REGS_EN) return (unsigned long)FTRACE_REGS_ADDR; else @@ -1809,7 +2287,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, ftrace_addr); + return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); @@ -1830,7 +2308,7 @@ void __weak ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { failed = __ftrace_replace_code(rec, enable); if (failed) { - ftrace_bug(failed, rec->ip); + ftrace_bug(failed, rec); /* Stop processing */ return; } @@ -1912,17 +2390,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { - unsigned long ip; int ret; - ip = rec->ip; - if (unlikely(ftrace_disabled)) return 0; ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { - ftrace_bug(ret, ip); + ftrace_bug(ret, rec); return 0; } return 1; @@ -2031,11 +2506,6 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; - /* - * Do not call function tracer while we update the code. - * We are in stop machine. - */ - function_trace_stop++; /* * By default we use stop_machine() to modify the code. 
@@ -2045,15 +2515,28 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - function_trace_stop--; - ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); } +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_ops_hash *old_hash) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash->filter_hash; + ops->old_hash.notrace_hash = old_hash->notrace_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; +} + static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; -static int global_start_up; + +void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ +} static void control_ops_free(struct ftrace_ops *ops) { @@ -2073,6 +2556,13 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + static int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2087,12 +2577,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - ops->flags |= FTRACE_OPS_FL_ENABLED; + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. 
+ */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; + + ret = ftrace_hash_ipmodify_enable(ops); + if (ret < 0) { + /* Rollback registration process */ + __unregister_ftrace_function(ops); + ftrace_start_up--; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + return ret; + } ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); + ops->flags &= ~FTRACE_OPS_FL_ADDING; + return 0; } @@ -2115,10 +2624,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); + /* Disabling ipmodify never fails */ + ftrace_hash_ipmodify_disable(ops); ftrace_hash_rec_disable(ops, 1); - if (!global_start_up) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -2139,9 +2649,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } + /* + * If the ops uses a trampoline, then it needs to be + * tested first on update. + */ + ops->flags |= FTRACE_OPS_FL_REMOVING; + removed_ops = ops; + + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + ftrace_run_update_code(command); /* + * If there's no more ops registered with ftrace, run a + * sanity check to make sure all rec flags are cleared. + */ + if (ftrace_ops_list == &ftrace_list_end) { + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (FTRACE_WARN_ON_ONCE(rec->flags)) + pr_warn(" %pS flags:%lx\n", + (void *)rec->ip, rec->flags); + } while_for_each_ftrace_rec(); + } + + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + + removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; + + /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. 
* The same goes for freeing the per_cpu data of the control @@ -2158,6 +2700,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { schedule_on_each_cpu(ftrace_sync); + arch_ftrace_trampoline_free(ops); + if (ops->flags & FTRACE_OPS_FL_CONTROL) control_ops_free(ops); } @@ -2167,24 +2711,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) static void ftrace_startup_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } } static void ftrace_shutdown_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_DISABLE_CALLS); + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } } static cycle_t ftrace_update_time; @@ -2196,8 +2752,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) * Filter_hash being empty will default to trace module. * But notrace hash requires a test of individual module functions. 
*/ - return ftrace_hash_empty(ops->filter_hash) && - ftrace_hash_empty(ops->notrace_hash); + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); } /* @@ -2219,12 +2775,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 0; /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->filter_hash) && - !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) return 0; return 1; @@ -2310,7 +2866,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) - ftrace_bug(failed, p->ip); + ftrace_bug(failed, p); } } } @@ -2398,7 +2954,8 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - while (start_pg) { + pg = start_pg; + while (pg) { order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); start_pg = pg->next; @@ -2544,10 +3101,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } else { rec = &iter->pg->records[iter->idx++]; if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -2595,8 +3152,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. 
*/ - if (iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) { + if ((iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->func_hash->filter_hash)) || + (iter->flags & FTRACE_ITER_NOTRACE && + ftrace_hash_empty(ops->func_hash->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -2632,6 +3191,22 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&ftrace_lock); } +void * __weak +arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return NULL; +} + +static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + void *ptr; + + ptr = arch_ftrace_trampoline_func(ops, rec); + if (ptr) + seq_printf(m, " ->%pS", ptr); +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -2641,7 +3216,10 @@ static int t_show(struct seq_file *m, void *v) return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { - seq_printf(m, "#### all functions enabled ####\n"); + if (iter->flags & FTRACE_ITER_NOTRACE) + seq_puts(m, "#### no functions disabled ####\n"); + else + seq_puts(m, "#### all functions enabled ####\n"); return 0; } @@ -2651,11 +3229,26 @@ static int t_show(struct seq_file *m, void *v) return 0; seq_printf(m, "%ps", (void *)rec->ip); - if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)%s", - rec->flags & ~FTRACE_FL_MASK, - rec->flags & FTRACE_FL_REGS ? " R" : ""); - seq_printf(m, "\n"); + if (iter->flags & FTRACE_ITER_ENABLED) { + struct ftrace_ops *ops = NULL; + + seq_printf(m, " (%ld)%s%s", + ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_IPMODIFY ? 
" I" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + seq_printf(m, "\ttramp: %pS", + (void *)ops->trampoline); + else + seq_puts(m, "\ttramp: ERROR!"); + + } + add_trampoline_func(m, ops, rec); + } + + seq_putc(m, '\n'); return 0; } @@ -2689,9 +3282,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - if (unlikely(ftrace_disabled)) - return -ENODEV; - iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (iter) { iter->pg = ftrace_pages_start; @@ -2702,13 +3292,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) return iter ? 0 : -ENOMEM; } -static void ftrace_filter_reset(struct ftrace_hash *hash) -{ - mutex_lock(&ftrace_lock); - ftrace_hash_clear(hash); - mutex_unlock(&ftrace_lock); -} - /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -2750,15 +3333,21 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; + hash = ops->func_hash->notrace_hash; else - hash = ops->filter_hash; + hash = ops->func_hash->filter_hash; if (file->f_mode & FMODE_WRITE) { - iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + iter->hash = alloc_ftrace_hash(size_bits); + else + iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); @@ -2767,10 +3356,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_filter_reset(iter->hash); - if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -2788,7 +3373,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, 
file->private_data = iter; out_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); return ret; } @@ -3026,12 +3611,12 @@ static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(trace_probe_ops) + INIT_OPS_HASH(trace_probe_ops) }; static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) { int ret; int i; @@ -3039,7 +3624,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3088,8 +3674,10 @@ int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3106,9 +3694,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) { count = -ENOMEM; goto out; @@ -3167,15 +3759,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, 
hash); - if (ret < 0) - count = ret; - __enable_ftrace_function_probe(); + __enable_ftrace_function_probe(&old_hash_ops); + + if (!ret) + free_ftrace_hash_rcu(old_hash); + else + count = ret; out_unlock: mutex_unlock(&ftrace_lock); out: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); return count; @@ -3193,7 +3788,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3201,6 +3797,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int type = MATCH_FULL; int i, len = 0; char *search; + int ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; @@ -3215,7 +3812,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3259,8 +3856,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. 
*/ - ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); synchronize_sched(); + if (!ret) + free_ftrace_hash_rcu(old_hash); + list_for_each_entry_safe(entry, p, &free_list, free_list) { list_del(&entry->free_list); ftrace_free_entry(entry); @@ -3268,7 +3868,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); } @@ -3447,10 +4047,35 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. 
+ */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); } static int @@ -3458,27 +4083,31 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (enable) - orig_hash = &ops->filter_hash; + orig_hash = &ops->func_hash->filter_hash; else - orig_hash = &ops->notrace_hash; + orig_hash = &ops->func_hash->notrace_hash; + + if (reset) + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + else + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { ret = -ENOMEM; goto out_regex_unlock; } - if (reset) - ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { ret = -EINVAL; goto out_regex_unlock; @@ -3490,14 +4119,18 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) - ftrace_ops_update_code(ops); - + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(hash); return ret; @@ -3630,8 +4263,12 @@ 
__setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static unsigned long save_global_trampoline; +static unsigned long save_global_flags; + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -3639,16 +4276,29 @@ static int __init set_graph_function(char *str) } __setup("ftrace_graph_filter=", set_graph_function); -static void __init set_ftrace_early_graph(char *buf) +static int __init set_graph_notrace_function(char *str) +{ + strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; + unsigned long *table = ftrace_graph_funcs; + int *count = &ftrace_graph_count; + + if (!enable) { + table = ftrace_graph_notrace_funcs; + count = &ftrace_graph_notrace_count; + } while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3677,15 +4327,19 @@ static void __init set_ftrace_early_filters(void) ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) - set_ftrace_early_graph(ftrace_graph_buf); + set_ftrace_early_graph(ftrace_graph_buf, 1); + if (ftrace_graph_notrace_buf[0]) + set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file 
*)file->private_data; + struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -3704,26 +4358,30 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); - mutex_lock(&iter->ops->regex_lock); + mutex_lock(&iter->ops->func_hash->regex_lock); if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); if (filter_hash) - orig_hash = &iter->ops->filter_hash; + orig_hash = &iter->ops->func_hash->filter_hash; else - orig_hash = &iter->ops->notrace_hash; + orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); + old_hash = *orig_hash; + old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret) - ftrace_ops_update_code(iter->ops); - + if (!ret) { + ftrace_ops_update_code(iter->ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); } - mutex_unlock(&iter->ops->regex_lock); + mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); @@ -3819,7 +4477,12 @@ static int g_show(struct seq_file *m, void *v) return 0; if (ptr == (unsigned long *)1) { - seq_printf(m, "#### all functions enabled ####\n"); + struct ftrace_graph_data *fgd = m->private; + + if (fgd->table == ftrace_graph_funcs) + seq_puts(m, "#### all functions enabled ####\n"); + else + seq_puts(m, "#### no functions disabled ####\n"); return 0; } @@ -4062,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); } -static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { trace_create_file("available_filter_functions", 0444, @@ -4330,12 +4993,37 @@ void __init 
ftrace_init(void) ftrace_disabled = 1; } +/* Do nothing if arch does not support this */ +void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ + +/* + * Currently there's no safe way to free a trampoline when the kernel + * is configured with PREEMPT. That is because a task could be preempted + * when it jumped to the trampoline, it may be preempted for a long time + * depending on the system load, and currently there's no way to know + * when it will be off the trampoline. If the trampoline is freed + * too early, when the task runs again, it will be executing on freed + * memory and crash. + */ +#ifdef CONFIG_PREEMPT + /* Currently, only non dynamic ops can have a trampoline */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + return; +#endif + + arch_ftrace_update_trampoline(ops); +} + #else static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4345,8 +5033,9 @@ static int __init ftrace_nodyn_init(void) } core_initcall(ftrace_nodyn_init); -static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } +static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ @@ -4372,6 +5061,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 1; } +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + #endif /* CONFIG_DYNAMIC_FTRACE */ __init void ftrace_init_global_array_ops(struct trace_array *tr) @@ -4437,7 +5130,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = 
ftrace_ops_control_func, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(control_ops) + INIT_OPS_HASH(control_ops) }; static inline void @@ -4447,9 +5140,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - if (function_trace_stop) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -4461,9 +5151,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip, regs)) { - if (WARN_ON(!op->func)) { - function_trace_stop = 1; - printk("op=%p %pS\n", op, op); + if (FTRACE_WARN_ON(!op->func)) { + pr_warn("op=%p %pS\n", op, op); goto out; } op->func(ip, parent_ip, op, regs); @@ -4500,6 +5189,49 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) } #endif +/* + * If there's only one function registered but it does not support + * recursion, this function will be called by the mcount trampoline. + * This function will handle recursion protection. + */ +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + int bit; + + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; + + op->func(ip, parent_ip, op, regs); + + trace_clear_recursion(bit); +} + +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_recurs_func() instead. + * + * Returns the function that the trampoline should call for @ops. 
+ */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. + */ + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) + return ftrace_ops_recurs_func; + + return ops->func; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -4600,7 +5332,8 @@ static int ftrace_pid_add(int p) set_ftrace_pid_task(pid); ftrace_update_pid_func(); - ftrace_startup_enable(0); + + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); return 0; @@ -4629,7 +5362,7 @@ static void ftrace_pid_reset(void) } ftrace_update_pid_func(); - ftrace_startup_enable(0); + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); } @@ -4662,12 +5395,12 @@ static int fpid_show(struct seq_file *m, void *v) const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); if (v == (void *)1) { - seq_printf(m, "no pid\n"); + seq_puts(m, "no pid\n"); return 0; } if (fpid->pid == ftrace_swapper_pid) - seq_printf(m, "swapper tasks\n"); + seq_puts(m, "swapper tasks\n"); else seq_printf(m, "%u\n", pid_vnr(fpid->pid)); @@ -4746,24 +5479,24 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_tracefs(void) { struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; - ftrace_init_dyn_debugfs(d_tracer); + ftrace_init_dyn_tracefs(d_tracer); trace_create_file("set_ftrace_pid", 0644, d_tracer, NULL, &ftrace_pid_fops); - ftrace_profile_debugfs(d_tracer); + ftrace_profile_tracefs(d_tracer); return 0; } -fs_initcall(ftrace_init_debugfs); +fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace @@ -4853,12 +5586,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { - ftrace_startup_sysctl(); - /* we are starting ftrace again */ if 
(ftrace_ops_list != &ftrace_list_end) update_ftrace_function(); + ftrace_startup_sysctl(); + } else { /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; @@ -4873,7 +5606,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_graph_active; +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -5035,12 +5778,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) */ static void update_function_graph_func(void) { - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list == &global_ops && - global_ops.next == &ftrace_list_end)) - ftrace_graph_entry = __ftrace_graph_entry; - else + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. 
+ */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; } static struct notifier_block ftrace_suspend_notifier = { @@ -5081,11 +5840,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - /* Function graph doesn't use the .func field of global_ops */ - global_ops.flags |= FTRACE_OPS_FL_STUB; - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); - + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); return ret; @@ -5102,11 +5857,21 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - global_ops.flags &= ~FTRACE_OPS_FL_STUB; + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); +#ifdef CONFIG_DYNAMIC_FTRACE + /* + * Function graph does not allocate the trampoline, but + * other global_ops do. We need to reset the ALLOC_TRAMP flag + * if one was used. 
+ */ + global_ops.trampoline = save_global_trampoline; + if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) + global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; +#endif + out: mutex_unlock(&ftrace_lock); } @@ -5183,9 +5948,4 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } - -void ftrace_graph_stop(void) -{ - ftrace_stop(); -} #endif diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 1c71382b283d..eb4220a132ec 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ff7027199a9a..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,7 +9,6 @@ #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ @@ -23,7 +22,6 @@ #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> -#include <linux/fs.h> #include <asm/local.h> @@ -34,21 +32,19 @@ static void update_pages_handler(struct work_struct *work); */ int ring_buffer_print_entry_header(struct trace_seq *s) { - int ret; - - ret = trace_seq_puts(s, "# compressed entry header\n"); - ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_puts(s, "\tarray : 32 bits\n"); - ret = trace_seq_putc(s, '\n'); - ret = trace_seq_printf(s, "\tpadding : type == %d\n", - RINGBUF_TYPE_PADDING); - ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", - RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata max type_len == %d\n", - RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + trace_seq_puts(s, "# compressed entry header\n"); + trace_seq_puts(s, "\ttype_len : 5 
bits\n"); + trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + trace_seq_puts(s, "\tarray : 32 bits\n"); + trace_seq_putc(s, '\n'); + trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); - return ret; + return !trace_seq_has_overflowed(s); } /* @@ -419,38 +415,40 @@ static inline int test_time_stamp(u64 delta) int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; - int ret; - ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); - return ret; + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + 
trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); } struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; + wait_queue_head_t full_waiters; bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; }; /* @@ -532,31 +530,39 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); + if (rbwork->wakeup_full) { + rbwork->wakeup_full = false; + wake_up_all(&rbwork->full_waiters); + } } /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. 
*/ - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + /* Full only makes sense on per cpu reads */ + full = false; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; @@ -564,36 +570,70 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. 
+ * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } - finish_wait(&work->waiters, &wait); - return 0; + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); + + return ret; } /** @@ -626,8 +666,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. 
But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) @@ -1192,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1689,22 +1744,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - /* The update must run on the CPU that is being updated. */ - preempt_disable(); - if (cpu == smp_processor_id() || !cpu_online(cpu)) { + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. - */ - preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - preempt_disable(); } - preempt_enable(); } /* wait for all the updates to complete */ @@ -1742,22 +1789,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - preempt_disable(); - /* The update must run on the CPU that is being updated. */ - if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { - /* - * Can not disable preemption for schedule_work_on() - * on PREEMPT_RT. 
- */ - preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - preempt_disable(); } - preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); @@ -1984,7 +2023,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -2640,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); static __always_inline int trace_recursive_lock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); int bit; if (in_interrupt()) { @@ -2657,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) return 1; val |= (1 << bit); - this_cpu_write(current_context, val); + __this_cpu_write(current_context, val); return 0; } static __always_inline void trace_recursive_unlock(void) { - unsigned int val = this_cpu_read(current_context); - - val--; - val &= this_cpu_read(current_context); - this_cpu_write(current_context, val); + __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); } #else @@ -2779,6 +2814,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { + bool pagebusy; + if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ @@ -2790,6 +2827,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = 
true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /** @@ -3357,21 +3403,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = cpu_buffer->read; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** @@ -3764,18 +3805,20 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). 
*/ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; - if (iter->head >= local_read(&iter->head_page->page->commit)) { + if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..13d945c0d03f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -7,7 +7,7 @@ #include <linux/completion.h> #include <linux/kthread.h> #include <linux/module.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <asm/local.h> struct rb_page { @@ -17,7 +17,7 @@ struct rb_page { }; /* run time and sleep time in seconds */ -#define RUN_TIME 10 +#define RUN_TIME 10ULL #define SLEEP_TIME 10 /* number of events for writer to wake up the reader */ @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) break; schedule(); - __set_current_state(TASK_RUNNING); } reader_finish = 0; complete(&read_done); @@ -213,8 +212,7 @@ static void ring_buffer_consumer(void) static void ring_buffer_producer(void) { - struct timeval start_tv; - struct timeval end_tv; + ktime_t start_time, end_time, timeout; unsigned long long time; unsigned long long entries; unsigned long long overruns; @@ -228,7 +226,8 @@ static void ring_buffer_producer(void) * make the system stall) */ trace_printk("Starting ring buffer hammer\n"); - do_gettimeofday(&start_tv); + start_time = ktime_get(); + timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); do { struct ring_buffer_event *event; int *entry; @@ -245,7 +244,7 @@ static void ring_buffer_producer(void) ring_buffer_unlock_commit(buffer, event); } } - do_gettimeofday(&end_tv); + end_time = ktime_get(); cnt++; if (consumer && !(cnt % wakeup_interval)) @@ -265,7 +264,7 @@ static void ring_buffer_producer(void) cond_resched(); #endif - } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) 
&& !kill_test); + } while (ktime_before(end_time, timeout) && !kill_test); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -281,9 +280,7 @@ static void ring_buffer_producer(void) wait_for_completion(&read_done); } - time = end_tv.tv_sec - start_tv.tv_sec; - time *= USEC_PER_SEC; - time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + time = ktime_us_delta(end_time, start_time); entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); @@ -379,7 +376,6 @@ static int ring_buffer_consumer_thread(void *arg) break; schedule(); - __set_current_state(TASK_RUNNING); } __set_current_state(TASK_RUNNING); @@ -407,7 +403,6 @@ static int ring_buffer_producer_thread(void *arg) trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); } if (kill_test) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 291397e66669..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> +#include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> @@ -31,6 +32,7 @@ #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/mount.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/ctype.h> @@ -63,6 +65,10 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; +/* Pipe tracepoints to printk */ +struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -119,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +/* Map of enums to their values, 
for "enum_map" file */ +struct trace_enum_map_head { + struct module *mod; + unsigned long length; +}; + +union trace_enum_map_item; + +struct trace_enum_map_tail { + /* + * "end" is first and points to NULL as it must be different + * than "mod" or "enum_string" + */ + union trace_enum_map_item *next; + const char *end; /* points to NULL */ +}; + +static DEFINE_MUTEX(trace_enum_mutex); + +/* + * The trace_enum_maps are saved in an array with two extra elements, + * one at the beginning, and one at the end. The beginning item contains + * the count of the saved maps (head.length), and the module they + * belong to if not built in (head.mod). The ending item contains a + * pointer to the next array of saved enum_map items. + */ +union trace_enum_map_item { + struct trace_enum_map map; + struct trace_enum_map_head head; + struct trace_enum_map_tail tail; +}; + +static union trace_enum_map_item *trace_enum_maps; +#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ + static int tracing_set_tracer(struct trace_array *tr, const char *buf); #define MAX_TRACER_SIZE 100 @@ -155,10 +197,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { - __disable_trace_on_warning = 1; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; return 1; } -__setup("traceoff_on_warning=", stop_trace_on_warning); +__setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { @@ -192,6 +235,13 @@ static int __init set_trace_boot_clock(char *str) } __setup("trace_clock=", set_trace_boot_clock); +static int __init set_tracepoint_printk(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); unsigned long long ns2usecs(cycle_t nsec) { @@ -820,11 +870,12 @@ static struct { const char *name; int in_ns; /* is this clock in nanoseconds? 
*/ } trace_clocks[] = { - { trace_clock_local, "local", 1 }, - { trace_clock_global, "global", 1 }, - { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 0 }, - { trace_clock, "perf", 1 }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, ARCH_TRACE_CLOCKS }; @@ -937,43 +988,20 @@ out: return ret; } -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; -} - +/* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (s->len <= s->readpos) + if (trace_seq_used(s) <= s->seq.readpos) return -EBUSY; - len = s->len - s->readpos; + len = trace_seq_used(s) - s->seq.readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->readpos, cnt); + memcpy(buf, s->buffer + s->seq.readpos, cnt); - s->readpos += cnt; + s->seq.readpos += cnt; return cnt; } @@ -1099,13 +1127,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2045,13 +2074,14 @@ void trace_printk_init_buffers(void) /* trace_printk() is 
for debug use only. Don't use it in production. */ - pr_warning("\n**********************************************************\n"); + pr_warning("\n"); + pr_warning("**********************************************************\n"); pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warning("** **\n"); pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); pr_warning("** **\n"); pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** unsafe for production use. **\n"); pr_warning("** **\n"); pr_warning("** If you see this message and you are not debugging **\n"); pr_warning("** the kernel, report this immediately to your vendor! **\n"); @@ -2180,9 +2210,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, goto out; } - len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - if (len > TRACE_BUF_SIZE) - goto out; + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); local_save_flags(flags); size = sizeof(*entry) + len + 1; @@ -2193,8 +2221,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, tbuffer, len); - entry->buf[len] = '\0'; + memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); @@ -2531,14 +2558,14 @@ get_total_entries(struct trace_buffer *buf, static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> 
need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) @@ -2555,20 +2582,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" + "# | | | | |\n"); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-----=> irqs-off\n" + "# / _----=> need-resched\n" + "# | / _---=> hardirq/softirq\n" + "# || / _--=> preempt-depth\n" + "# ||| / delay\n" + "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" + "# | | | |||| | |\n"); } void @@ -2671,24 +2698,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - if (!trace_print_lat_context(iter)) - goto partial; - } else { - if (!trace_print_context(iter)) - goto partial; - } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + trace_print_context(iter); } + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + if (event) return event->funcs->trace(iter, sym_flags, event); - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; + trace_seq_printf(s, "Unknown type %d\n", 
entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) @@ -2699,22 +2723,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts)) - goto partial; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); - if (!trace_seq_printf(s, "%d ?\n", entry->type)) - goto partial; + trace_seq_printf(s, "%d ?\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) @@ -2727,9 +2749,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2739,9 +2763,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return ret; } - SEQ_PUT_FIELD_RET(s, newline); + SEQ_PUT_FIELD(s, newline); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) @@ -2753,9 +2777,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_FIELD_RET(s, entry->pid); - 
SEQ_PUT_FIELD_RET(s, iter->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2801,10 +2827,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events && - !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events)) - return TRACE_TYPE_PARTIAL_LINE; + if (iter->lost_events) { + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -2882,44 +2910,44 @@ static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; - seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); + seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" + "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to 
be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); #else - seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); - seq_printf(m, "# Must use main snapshot file to allocate.\n"); + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); #endif - seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) - seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else - seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - seq_printf(m, "# Snapshot commands:\n"); + seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else @@ -3273,7 +3301,7 @@ static int t_show(struct seq_file *m, void *v) if (!t) return 0; - seq_printf(m, "%s", t->name); + seq_puts(m, t->name); if (t->next) seq_putc(m, 
' '); else @@ -3363,12 +3391,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); - if (count - len < 2) { + len = snprintf(mask_str, count, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)); + if (len >= count) { count = -EINVAL; goto out_err; } - len += sprintf(mask_str + len, "\n"); count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); out_err: @@ -3699,6 +3727,7 @@ static const char readme_msg[] = #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT @@ -3917,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +static union trace_enum_map_item * +update_enum_map(union trace_enum_map_item *ptr) +{ + if (!ptr->map.enum_string) { + if (ptr->tail.next) { + ptr = ptr->tail.next; + /* Set ptr to the next real item (skip head) */ + ptr++; + } else + return NULL; + } + return ptr; +} + +static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +{ + union trace_enum_map_item *ptr = v; + + /* + * Paranoid! If ptr points to end, we don't want to increment past it. + * This really should never happen. 
+ */ + ptr = update_enum_map(ptr); + if (WARN_ON_ONCE(!ptr)) + return NULL; + + ptr++; + + (*pos)++; + + ptr = update_enum_map(ptr); + + return ptr; +} + +static void *enum_map_start(struct seq_file *m, loff_t *pos) +{ + union trace_enum_map_item *v; + loff_t l = 0; + + mutex_lock(&trace_enum_mutex); + + v = trace_enum_maps; + if (v) + v++; + + while (v && l < *pos) { + v = enum_map_next(m, v, &l); + } + + return v; +} + +static void enum_map_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&trace_enum_mutex); +} + +static int enum_map_show(struct seq_file *m, void *v) +{ + union trace_enum_map_item *ptr = v; + + seq_printf(m, "%s %ld (%s)\n", + ptr->map.enum_string, ptr->map.enum_value, + ptr->map.system); + + return 0; +} + +static const struct seq_operations tracing_enum_map_seq_ops = { + .start = enum_map_start, + .next = enum_map_next, + .stop = enum_map_stop, + .show = enum_map_show, +}; + +static int tracing_enum_map_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_enum_map_seq_ops); +} + +static const struct file_operations tracing_enum_map_fops = { + .open = tracing_enum_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static inline union trace_enum_map_item * +trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +{ + /* Return tail of array given the head */ + return ptr + ptr->head.length + 1; +} + +static void +trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, + int len) +{ + struct trace_enum_map **stop; + struct trace_enum_map **map; + union trace_enum_map_item *map_array; + union trace_enum_map_item *ptr; + + stop = start + len; + + /* + * The trace_enum_maps contains the map plus a head and tail item, + * where the head holds the module and length of array, and the + * tail holds a pointer to the next list. 
+ */ + map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); + if (!map_array) { + pr_warning("Unable to allocate trace enum mapping\n"); + return; + } + + mutex_lock(&trace_enum_mutex); + + if (!trace_enum_maps) + trace_enum_maps = map_array; + else { + ptr = trace_enum_maps; + for (;;) { + ptr = trace_enum_jmp_to_tail(ptr); + if (!ptr->tail.next) + break; + ptr = ptr->tail.next; + + } + ptr->tail.next = map_array; + } + map_array->head.mod = mod; + map_array->head.length = len; + map_array++; + + for (map = start; (unsigned long)map < (unsigned long)stop; map++) { + map_array->map = **map; + map_array++; + } + memset(map_array, 0, sizeof(*map_array)); + + mutex_unlock(&trace_enum_mutex); +} + +static void trace_create_enum_file(struct dentry *d_tracer) +{ + trace_create_file("enum_map", 0444, d_tracer, + NULL, &tracing_enum_map_fops); +} + +#else /* CONFIG_TRACE_ENUM_MAP_FILE */ +static inline void trace_create_enum_file(struct dentry *d_tracer) { } +static inline void trace_insert_enum_map_file(struct module *mod, + struct trace_enum_map **start, int len) { } +#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ + +static void trace_insert_enum_map(struct module *mod, + struct trace_enum_map **start, int len) +{ + struct trace_enum_map **map; + + if (len <= 0) + return; + + map = start; + + trace_event_enum_update(map, len); + + trace_insert_enum_map_file(mod, start, len); +} + static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4114,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static int tracing_set_tracer(struct trace_array *tr, const char *buf) +static void update_tracer_options(struct trace_array *tr, struct tracer *t) { static struct trace_option_dentry *topts; + + /* Only enable if the directory has been created already. 
*/ + if (!tr->dir) + return; + + /* Currently, only the top instance has options */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return; + + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -4150,6 +4370,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) goto out; } + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->current_trace->ref) { + ret = -EBUSY; + goto out; + } + trace_branch_disable(); tr->current_trace->enabled--; @@ -4175,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* Currently, only the top instance has options */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); - } + update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -4238,10 +4460,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; char buf[64]; int r; @@ -4253,10 +4474,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; unsigned long val; int ret; @@ -4269,6 +4489,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return 
tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + goto out; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + goto out; + } + + ret = cnt; +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -4291,16 +4557,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) goto out; } - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. 
- */ - iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - ret = -ENOMEM; - goto fail; - } - *iter->trace = *tr->current_trace; + trace_seq_init(&iter->seq); + iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -4327,6 +4585,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); + + tr->current_trace->ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -4346,6 +4606,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + tr->current_trace->ref--; + if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -4353,7 +4615,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) free_cpumask_var(iter->started); mutex_destroy(&iter->mutex); - kfree(iter->trace); kfree(iter); trace_array_put(tr); @@ -4386,7 +4647,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* Must be called with trace_types_lock mutex held. */ +/* Must be called with iter->mutex held. 
*/ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; @@ -4412,15 +4673,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if (signal_pending(current)) - return -EINTR; } return 1; @@ -4434,7 +4692,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -4444,12 +4701,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_init(&iter->seq); - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself @@ -4487,18 +4738,18 @@ waitagain: trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; - int len = iter->seq.len; + int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ - iter->seq.len = len; + iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); - if (iter->seq.len >= cnt) + if (trace_seq_used(&iter->seq) >= cnt) break; /* @@ -4514,7 +4765,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) + if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* @@ -4548,20 +4799,33 @@ static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; + int save_len; int ret; /* Seq buffer 
is page-sized, exactly what we need. */ for (;;) { - count = iter->seq.len; + save_len = iter->seq.seq.len; ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; break; } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. + */ if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; break; } @@ -4596,7 +4860,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -4604,12 +4867,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - mutex_lock(&iter->mutex); if (iter->trace->splice_read) { @@ -4642,13 +4899,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. 
*/ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - iter->seq.len); + trace_seq_used(&iter->seq)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = iter->seq.len; + spd.partial[i].len = trace_seq_used(&iter->seq); trace_seq_init(&iter->seq); } @@ -4896,7 +5153,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = 0; i < nr_pages; i++){ + for (i = nr_pages - 1; i >= 0; i--) { kunmap_atomic(map_page[i]); put_page(pages[i]); } @@ -5170,6 +5427,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) #endif /* CONFIG_TRACER_SNAPSHOT */ +static const struct file_operations tracing_thresh_fops = { + .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -5278,6 +5542,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; + tr->current_trace->ref++; + mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); @@ -5308,21 +5574,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - size = -EBUSY; - goto out_unlock; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif if (!info->spare) info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - size = -ENOMEM; if (!info->spare) - goto out_unlock; + return -ENOMEM; /* Do we have previous read data to read? 
*/ if (info->read < PAGE_SIZE) @@ -5338,25 +5599,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) { - size = -EAGAIN; - goto out_unlock; - } - mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); - mutex_lock(&trace_types_lock); - if (ret) { - size = ret; - goto out_unlock; - } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, false); + if (ret) + return ret; + goto again; } - size = 0; - goto out_unlock; + return 0; } info->read = 0; @@ -5366,18 +5618,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) { - size = -EFAULT; - goto out_unlock; - } + if (ret == size) + return -EFAULT; + size -= ret; *ppos += size; info->read += size; - out_unlock: - mutex_unlock(&trace_types_lock); - return size; } @@ -5388,6 +5636,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); + iter->tr->current_trace->ref--; + __trace_array_put(iter->tr); if (info->spare) @@ -5471,32 +5721,22 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - ssize_t ret; - - mutex_lock(&trace_types_lock); + ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) { - ret = -ENOMEM; - goto out; - } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) { - ret = -EINVAL; - goto out; - } + if (*ppos & (PAGE_SIZE - 1)) + return -EINVAL; if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) { - ret = -EINVAL; - goto out; - } + if (len < PAGE_SIZE) + return 
-EINVAL; len &= PAGE_MASK; } @@ -5509,13 +5749,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) + if (!ref) { + ret = -ENOMEM; break; + } ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { + ret = -ENOMEM; kfree(ref); break; } @@ -5553,26 +5796,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { - ret = -EAGAIN; - goto out; - } - mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); - mutex_lock(&trace_types_lock); if (ret) - goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + return ret; + + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, true); + if (ret) + return ret; + goto again; } ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); -out: - mutex_unlock(&trace_types_lock); return ret; } @@ -5642,7 +5880,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -5723,10 +5962,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%ps:", (void *)ip); - seq_printf(m, "snapshot"); + seq_puts(m, "snapshot"); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); @@ -5801,28 +6040,19 @@ static __init int register_snapshot_cmd(void) static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -struct dentry 
*tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (tr->dir) - return tr->dir; - - if (!debugfs_initialized()) - return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - tr->dir = debugfs_create_dir("tracing", NULL); - - if (!tr->dir) - pr_warn_once("Could not create debugfs directory 'tracing'\n"); + return NULL; + /* All sub buffers have a descriptor */ return tr->dir; } -struct dentry *tracing_init_dentry(void) -{ - return tracing_init_dentry_tr(&global_trace); -} - static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; @@ -5830,14 +6060,14 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (tr->percpu_dir) return tr->percpu_dir; - d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) return NULL; - tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); WARN_ONCE(!tr->percpu_dir, - "Could not create debugfs directory 'per_cpu/%d'\n", cpu); + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } @@ -5854,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, } static void -tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; @@ -5864,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + pr_warning("Could not create tracefs '%s' entry\n", 
cpu_dir); return; } @@ -6018,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, { struct dentry *ret; - ret = debugfs_create_file(name, mode, parent, data, fops); + ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); + pr_warning("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6033,13 +6263,13 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (tr->options) return tr->options; - d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) return NULL; - tr->options = debugfs_create_dir("options", d_tracer); + tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create debugfs directory 'options'\n"); + pr_warning("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6107,10 +6337,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts) if (!topts) return; - for (cnt = 0; topts[cnt].opt; cnt++) { - if (topts[cnt].entry) - debugfs_remove(topts[cnt].entry); - } + for (cnt = 0; topts[cnt].opt; cnt++) + tracefs_remove(topts[cnt].entry); kfree(topts); } @@ -6199,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { struct dentry *trace_instance_dir; static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) @@ -6276,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) #endif } -static int new_instance_create(const char *name) +static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; @@ -6315,17 +6543,17 @@ static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = debugfs_create_dir(name, 
trace_instance_dir); + tr->dir = tracefs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); goto out_free_tr; } - init_tracer_debugfs(tr, tr->dir); + init_tracer_tracefs(tr, tr->dir); list_add(&tr->list, &ftrace_trace_arrays); @@ -6346,7 +6574,7 @@ static int new_instance_create(const char *name) } -static int instance_delete(const char *name) +static int instance_rmdir(const char *name) { struct trace_array *tr; int found = 0; @@ -6365,7 +6593,7 @@ static int instance_delete(const char *name) goto out_unlock; ret = -EBUSY; - if (tr->ref) + if (tr->ref || (tr->current_trace && tr->current_trace->ref)) goto out_unlock; list_del(&tr->list); @@ -6387,82 +6615,17 @@ static int instance_delete(const char *name) return ret; } -static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* - * The inode mutex is locked, but debugfs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the new_instance_create() will determine the - * winner. 
- */ - mutex_unlock(&inode->i_mutex); - - ret = new_instance_create(dentry->d_iname); - - mutex_lock(&inode->i_mutex); - - return ret; -} - -static int instance_rmdir(struct inode *inode, struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* The caller did a dget() on dentry */ - mutex_unlock(&dentry->d_inode->i_mutex); - - /* - * The inode mutex is locked, but debugfs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the instance_delete() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); - - ret = instance_delete(dentry->d_iname); - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); - - return ret; -} - -static const struct inode_operations instance_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = instance_mkdir, - .rmdir = instance_rmdir, -}; - static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = debugfs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); if (WARN_ON(!trace_instance_dir)) return; - - /* Hijack the dir inode operations, to allow mkdir */ - trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; } static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; @@ -6516,24 +6679,162 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) #endif for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(tr, cpu); + 
tracing_init_tracefs_percpu(tr, cpu); + +} + +static struct vfsmount *trace_automount(void *ingore) +{ + struct vfsmount *mnt; + struct file_system_type *type; + + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; + mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + return NULL; + mntget(mnt); + + return mnt; +} + +/** + * tracing_init_dentry - initialize top level trace array + * + * This is called when creating files or directories in the tracing + * directory. It is called via fs_initcall() by any of the boot up code + * and expects to return the dentry of the top level tracing directory. + */ +struct dentry *tracing_init_dentry(void) +{ + struct trace_array *tr = &global_trace; + + /* The top level trace array uses NULL as parent */ + if (tr->dir) + return NULL; + + if (WARN_ON(!debugfs_initialized())) + return ERR_PTR(-ENODEV); + + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kernel. 
+ */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); + if (!tr->dir) { + pr_warn_once("Could not create debugfs directory 'tracing'\n"); + return ERR_PTR(-ENOMEM); + } + + return NULL; +} +extern struct trace_enum_map *__start_ftrace_enum_maps[]; +extern struct trace_enum_map *__stop_ftrace_enum_maps[]; + +static void __init trace_enum_init(void) +{ + int len; + + len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; + trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); +} + +#ifdef CONFIG_MODULES +static void trace_module_add_enums(struct module *mod) +{ + if (!mod->num_trace_enums) + return; + + /* + * Modules with bad taint do not have events created, do + * not bother with enums either. + */ + if (trace_module_has_bad_taint(mod)) + return; + + trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); +} + +#ifdef CONFIG_TRACE_ENUM_MAP_FILE +static void trace_module_remove_enums(struct module *mod) +{ + union trace_enum_map_item *map; + union trace_enum_map_item **last = &trace_enum_maps; + + if (!mod->num_trace_enums) + return; + + mutex_lock(&trace_enum_mutex); + + map = trace_enum_maps; + + while (map) { + if (map->head.mod == mod) + break; + map = trace_enum_jmp_to_tail(map); + last = &map->tail.next; + map = map->tail.next; + } + if (!map) + goto out; + + *last = trace_enum_jmp_to_tail(map)->tail.next; + kfree(map); + out: + mutex_unlock(&trace_enum_mutex); } +#else +static inline void trace_module_remove_enums(struct module *mod) { } +#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_enums(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_enums(mod); + break; + } + + return 0; +} + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; +#endif /* 
CONFIG_MODULES */ -static __init int tracer_init_debugfs(void) +static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; trace_access_lock_init(); d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; - init_tracer_debugfs(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); + &global_trace, &tracing_thresh_fops); trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); @@ -6544,6 +6845,14 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_enum_init(); + + trace_create_enum_file(d_tracer); + +#ifdef CONFIG_MODULES + register_module_notifier(&trace_module_nb); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6553,6 +6862,10 @@ static __init int tracer_init_debugfs(void) create_trace_options_dir(&global_trace); + /* If the tracer was started via cmdline, create options for it here */ + if (global_trace.current_trace != &nop_trace) + update_tracer_options(&global_trace, global_trace.current_trace); + return 0; } @@ -6607,11 +6920,19 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= TRACE_MAX_PRINT) - s->len = TRACE_MAX_PRINT; + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. 
*/ - s->buffer[s->len] = 0; + s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); @@ -6752,7 +7073,6 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; - if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -6850,6 +7170,18 @@ out: return ret; } +void __init trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (WARN_ON(!tracepoint_print_iter)) + tracepoint_printk = 0; + } + tracer_alloc_buffers(); + trace_event_init(); +} + __init static int clear_boot_tracer(void) { /* @@ -6869,6 +7201,5 @@ __init static int clear_boot_tracer(void) return 0; } -early_initcall(tracer_alloc_buffers); -fs_initcall(tracer_init_debugfs); +fs_initcall(tracer_init_tracefs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9258f5a815db..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -14,6 +14,7 @@ #include <linux/trace_seq.h> #include <linux/ftrace_event.h> #include <linux/compiler.h> +#include <linux/trace_seq.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -333,12 +334,13 @@ struct tracer_flags { /** - * struct tracer - a specific tracer and its callbacks to interact with debugfs + * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer * @start: called when tracing is unpaused (echo 1 > tracing_enabled) * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened * @close: called when the trace file is released @@ -357,6 +359,7 @@ struct tracer { void 
(*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); + int (*update_thresh)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); @@ -385,6 +388,7 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; + int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE @@ -538,7 +542,6 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; @@ -567,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc); - -void tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *cur, - unsigned long flags, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -595,9 +589,6 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); -void tracing_sched_switch_assign_trace(struct trace_array *tr); -void tracing_stop_sched_switch_record(void); -void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -717,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; +extern char trace_find_mark(unsigned long long duration); + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -735,7 +728,7 @@ extern unsigned long trace_flags; extern enum print_line_t 
print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); -extern enum print_line_t +extern void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); @@ -1308,4 +1301,20 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, #define perf_ftrace_event_register NULL #endif +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +#else +static inline void init_ftrace_syscalls(void) { } +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +void trace_event_enum_update(struct trace_enum_map **map, int len); +#else +static inline void __init trace_event_init(void) { } +static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +#endif + +extern struct trace_iterator *tracepoint_print_iter; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f0..57cbf1efdd44 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -7,7 +7,6 @@ #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/irqflags.h> -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> @@ -151,22 +150,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? 
" ok " : " MISS ", + field->func, + field->file, + field->line); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } static void branch_print_header(struct seq_file *s) { seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" - " FUNC:FILE:LINE\n"); - seq_puts(s, "# | | | | | " - " |\n"); + " FUNC:FILE:LINE\n" + "# | | | | | " + " |\n"); } static struct trace_event_functions trace_branch_funcs = { @@ -233,12 +231,12 @@ extern unsigned long __stop_annotated_branch_profile[]; static int annotated_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " correct incorrect % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } @@ -274,7 +272,7 @@ static int branch_stat_show(struct seq_file *m, void *v) seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) - seq_printf(m, " X "); + seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); @@ -362,12 +360,12 @@ extern unsigned long __stop_branch_profile[]; static int all_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " miss hit %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " miss hit % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->fmt), FILTER_OTHER @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - 
F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), FILTER_OTHER @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, __field( const char *, str ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->str), FILTER_OTHER diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 5d12bb407b44..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return ret; } + /* + * We checked and allowed to create parent, + * allow children without checking. + */ + if (p_event->parent) + return 0; + + /* + * It's ok to check current process (owner) permissions in here, + * because code below is called only via perf_event_open syscall. + */ + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) @@ -249,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -268,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2de53628689f..3ab69fb72b85 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,10 +8,12 @@ * */ +#define pr_fmt(fmt) fmt + #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/kthread.h> -#include <linux/debugfs.h> +#include <linux/tracefs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> @@ 
-210,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); + +static void output_printk(struct ftrace_event_buffer *fbuffer) +{ + struct ftrace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + if (!iter) + return; + + event_call = fbuffer->ftrace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->ftrace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) { + if (tracepoint_printk) + output_printk(fbuffer); + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); @@ -446,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) return; if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); + tracefs_remove_recursive(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -459,13 +493,13 @@ static void remove_event_file_dir(struct ftrace_event_file *file) if (dir) { spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &dir->d_subdirs, d_child) { if (child->d_inode) /* probably unneeded */ child->d_inode->i_private = NULL; } spin_unlock(&dir->d_lock); - debugfs_remove_recursive(dir); + tracefs_remove_recursive(dir); } list_del(&file->list); @@ -531,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, static int 
ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; + int ret; /* * The buf format can be <subsystem>:<event-name> @@ -556,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set); + + /* Put back the colon to allow this to be called again */ + if (buf) + *(buf - 1) = ':'; + + return ret; } /** @@ -916,7 +957,7 @@ static int f_show(struct seq_file *m, void *v) case FORMAT_HEADER: seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); - seq_printf(m, "format:\n"); + seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: @@ -1042,7 +1083,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_unlock(&event_mutex); if (file) - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1208,7 +1250,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); print_subsystem_event_filter(system, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1263,7 +1306,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); func(s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1489,9 +1533,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = debugfs_create_dir(name, parent); + dir->entry = tracefs_create_dir(name, parent); if (!dir->entry) { - pr_warning("Failed to create system directory %s\n", name); + pr_warn("Failed to 
create system directory %s\n", name); __put_system(system); goto out_free; } @@ -1502,12 +1546,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->subsystem = system; file->system = dir; - entry = debugfs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs '%s/filter' entry\n", name); + pr_warn("Could not create tracefs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1522,8 +1566,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) - pr_warning("No memory to create event subsystem %s\n", - name); + pr_warn("No memory to create event subsystem %s\n", name); return NULL; } @@ -1549,10 +1592,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) d_events = parent; name = ftrace_event_name(call); - file->dir = debugfs_create_dir(name, d_events); + file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { - pr_warning("Could not create debugfs '%s' directory\n", - name); + pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } @@ -1575,8 +1617,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) if (list_empty(head)) { ret = call->class->define_fields(call); if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", name); + pr_warn("Could not initialize trace point events/%s\n", + name); return -1; } } @@ -1621,7 +1663,6 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); - destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. 
After finding the call for this @@ -1649,8 +1690,7 @@ static int event_init(struct ftrace_event_call *call) if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) - pr_warn("Could not initialize trace events/%s\n", - name); + pr_warn("Could not initialize trace events/%s\n", name); } return ret; @@ -1671,6 +1711,131 @@ __register_event(struct ftrace_event_call *call, struct module *mod) return 0; } +static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +{ + int rlen; + int elen; + + /* Find the length of the enum value as a string */ + elen = snprintf(ptr, 0, "%ld", map->enum_value); + /* Make sure there's enough room to replace the string with the value */ + if (len < elen) + return NULL; + + snprintf(ptr, elen + 1, "%ld", map->enum_value); + + /* Get the rest of the string of ptr */ + rlen = strlen(ptr + len); + memmove(ptr + elen, ptr + len, rlen); + /* Make sure we end the new string */ + ptr[elen + rlen] = 0; + + return ptr + elen; +} + +static void update_event_printk(struct ftrace_event_call *call, + struct trace_enum_map *map) +{ + char *ptr; + int quote = 0; + int len = strlen(map->enum_string); + + for (ptr = call->print_fmt; *ptr; ptr++) { + if (*ptr == '\\') { + ptr++; + /* paranoid */ + if (!*ptr) + break; + continue; + } + if (*ptr == '"') { + quote ^= 1; + continue; + } + if (quote) + continue; + if (isdigit(*ptr)) { + /* skip numbers */ + do { + ptr++; + /* Check for alpha chars like ULL */ + } while (isalnum(*ptr)); + if (!*ptr) + break; + /* + * A number must have some kind of delimiter after + * it, and we can ignore that too. 
+ */ + continue; + } + if (isalpha(*ptr) || *ptr == '_') { + if (strncmp(map->enum_string, ptr, len) == 0 && + !isalnum(ptr[len]) && ptr[len] != '_') { + ptr = enum_replace(ptr, map, len); + /* Hmm, enum string smaller than value */ + if (WARN_ON_ONCE(!ptr)) + return; + /* + * No need to decrement here, as enum_replace() + * returns the pointer to the character past + * the enum, and two enums can not be placed + * back to back without something in between. + * We can skip that something in between. + */ + continue; + } + skip_more: + do { + ptr++; + } while (isalnum(*ptr) || *ptr == '_'); + if (!*ptr) + break; + /* + * If what comes after this variable is a '.' or + * '->' then we can continue to ignore that string. + */ + if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { + ptr += *ptr == '.' ? 1 : 2; + if (!*ptr) + break; + goto skip_more; + } + /* + * Once again, we can skip the delimiter that came + * after the string. + */ + continue; + } + } +} + +void trace_event_enum_update(struct trace_enum_map **map, int len) +{ + struct ftrace_event_call *call, *p; + const char *last_system = NULL; + int last_i; + int i; + + down_write(&trace_event_sem); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + /* events are usually grouped together with systems */ + if (!last_system || call->class->system != last_system) { + last_i = 0; + last_system = call->class->system; + } + + for (i = last_i; i < len; i++) { + if (call->class->system == map[i]->system) { + /* Save the first system if need be */ + if (!last_i) + last_i = i; + update_event_printk(call, map[i]); + } + } + } + up_write(&trace_event_sem); +} + static struct ftrace_event_file * trace_create_new_event(struct ftrace_event_call *call, struct trace_array *tr) @@ -1749,7 +1914,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_call_preds(call); + free_event_filter(call->filter); + call->filter = NULL; } static int 
probe_remove_event_call(struct ftrace_event_call *call) @@ -1881,7 +2047,7 @@ static int trace_module_notify(struct notifier_block *self, static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, - .priority = 0, + .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ @@ -1895,8 +2061,8 @@ __trace_add_event_dirs(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(call)); } } @@ -1989,7 +2155,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, ftrace_event_name(data->file->event_call)); if (data->count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", data->count); @@ -2194,7 +2360,7 @@ static inline int register_event_cmds(void) { return 0; } /* * The top level array has already had its ftrace_event_file * descriptors created in order to allow for early events to - * be recorded. This function is called after the debugfs has been + * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated * to the events. 
*/ @@ -2208,8 +2374,8 @@ __trace_early_add_event_dirs(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) - pr_warning("Could not create directory for event %s\n", - ftrace_event_name(file->event_call)); + pr_warn("Could not create directory for event %s\n", + ftrace_event_name(file->event_call)); } } @@ -2232,8 +2398,8 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) - pr_warning("Could not create early event %s\n", - ftrace_event_name(call)); + pr_warn("Could not create early event %s\n", + ftrace_event_name(call)); } } @@ -2277,16 +2443,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = debugfs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warning("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create tracefs 'set_event' entry\n"); return -ENOMEM; } - d_events = debugfs_create_dir("events", parent); + d_events = tracefs_create_dir("events", parent); if (!d_events) { - pr_warning("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } @@ -2378,7 +2544,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - debugfs_remove_recursive(tr->event_dir); + tracefs_remove_recursive(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; @@ -2395,12 +2561,39 @@ static __init int event_trace_memsetup(void) return 0; } +static __init void +early_enable_events(struct trace_array *tr, bool disable_first) +{ + char *buf = bootup_event_buf; + char *token; + int ret; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + /* Restarting syscalls 
requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); + + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; - char *buf = bootup_event_buf; - char *token; int ret; if (!tr) @@ -2422,18 +2615,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); - } + early_enable_events(tr, false); trace_printk_start_comm(); @@ -2444,6 +2626,31 @@ static __init int event_trace_enable(void) return 0; } +/* + * event_trace_enable() is called from trace_event_init() first to + * initialize events and perhaps start any events that are on the + * command line. Unfortunately, there are some events that will not + * start this early, like the system call tracepoints that need + * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() + * is called before pid 1 starts, and this flag is never set, making + * the syscall tracepoint never get reached, but the event is enabled + * regardless (and not doing anything). 
+ */ +static __init int event_trace_enable_again(void) +{ + struct trace_array *tr; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + early_enable_events(tr, true); + + return 0; +} + +early_initcall(event_trace_enable_again); + static __init int event_trace_init(void) { struct trace_array *tr; @@ -2456,17 +2663,16 @@ static __init int event_trace_init(void) return -ENODEV; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); + pr_warn("Could not create tracefs 'available_events' entry\n"); if (trace_define_common_fields()) - pr_warning("tracing: Failed to allocate common fields"); + pr_warn("tracing: Failed to allocate common fields"); ret = early_event_add_tracer(d_tracer, tr); if (ret) @@ -2475,12 +2681,18 @@ static __init int event_trace_init(void) #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) - pr_warning("Failed to register trace events module notifier\n"); + pr_warn("Failed to register trace events module notifier\n"); #endif return 0; } -early_initcall(event_trace_memsetup); -core_initcall(event_trace_enable); + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); +} + fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2515,8 +2727,11 @@ static __init int event_test_thread(void *unused) kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) + while (!kthread_should_stop()) { schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); return 0; } @@ -2579,7 +2794,7 @@ static __init void event_trace_self_tests(void) * it and the self test should not be on. 
*/ if (file->flags & FTRACE_EVENT_FL_ENABLED) { - pr_warning("Enabled event during self test!\n"); + pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } @@ -2607,8 +2822,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling system %s\n", - system->name); + pr_warn("error enabling system %s\n", + system->name); continue; } @@ -2616,8 +2831,8 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling system %s\n", - system->name); + pr_warn("error disabling system %s\n", + system->name); continue; } @@ -2631,7 +2846,7 @@ static __init void event_trace_self_tests(void) ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling all events\n"); + pr_warn("error enabling all events\n"); return; } @@ -2640,7 +2855,7 @@ static __init void event_trace_self_tests(void) /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling all events\n"); + pr_warn("error disabling all events\n"); return; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8631926a07..ced69da0ff55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -45,6 +45,7 @@ enum filter_op_ids OP_GT, OP_GE, OP_BAND, + OP_NOT, OP_NONE, OP_OPEN_PAREN, }; @@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = { { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, { OP_BAND, "&", 6 }, + { OP_NOT, "!", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -85,6 +87,7 @@ enum { FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, FILT_ERR_IP_FIELD_ONLY, + FILT_ERR_ILLEGAL_NOT_OP, }; static char *err_text[] = { @@ -101,6 +104,7 @@ static char *err_text[] = { "Missing field name 
and/or value", "Meaningless filter expression", "Only 'ip' field is supported for function trace", + "Illegal use of '!'", }; struct opstack_op { @@ -139,6 +143,7 @@ struct pred_stack { int index; }; +/* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ break; \ } \ \ - return match; \ + return !!match == !pred->not; \ } #define DEFINE_EQUALITY_PRED(size) \ @@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds, if (!WARN_ON_ONCE(!pred->fn)) match = pred->fn(pred, rec); if (!!match == type) - return match; + break; } - return match; + /* If not of not match is equal to not of not, then it is a match */ + return !!match == !op->not; } struct filter_match_preds_data { @@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter, * then this op can be folded. 
*/ if (left->index & FILTER_PRED_FOLD && - (left->op == dest->op || + ((left->op == dest->op && !left->not) || left->left == FILTER_PRED_INVALID) && right->index & FILTER_PRED_FOLD && - (right->op == dest->op || + ((right->op == dest->op && !right->not) || right->left == FILTER_PRED_INVALID)) dest->index |= FILTER_PRED_FOLD; @@ -774,17 +780,12 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void call_filter_disable(struct ftrace_event_call *call) -{ - call->flags &= ~TRACE_EVENT_FL_FILTERED; -} - static void filter_disable(struct ftrace_event_file *file) { struct ftrace_event_call *call = file->event_call; if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call_filter_disable(call); + call->flags &= ~TRACE_EVENT_FL_FILTERED; else file->flags &= ~FTRACE_EVENT_FL_FILTERED; } @@ -804,32 +805,6 @@ void free_event_filter(struct event_filter *filter) __free_filter(filter); } -void destroy_call_preds(struct ftrace_event_call *call) -{ - __free_filter(call->filter); - call->filter = NULL; -} - -static void destroy_file_preds(struct ftrace_event_file *file) -{ - __free_filter(file->filter); - file->filter = NULL; -} - -/* - * Called when destroying the ftrace_event_file. - * The file is being freed, so we do not need to worry about - * the file being currently used. This is for module code removing - * the tracepoints from within it. 
- */ -void destroy_preds(struct ftrace_event_file *file) -{ - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - destroy_call_preds(file->event_call); - else - destroy_file_preds(file); -} - static struct event_filter *__alloc_filter(void) { struct event_filter *filter; @@ -873,17 +848,14 @@ static inline void __remove_filter(struct ftrace_event_file *file) remove_filter_string(file->filter); } -static void filter_free_subsystem_preds(struct event_subsystem *system, +static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; - __remove_filter(file); } } @@ -901,15 +873,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) } } -static void filter_free_subsystem_filters(struct event_subsystem *system, +static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, struct trace_array *tr) { struct ftrace_event_file *file; - struct ftrace_event_call *call; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; __free_subsystem_filter(file); } @@ -1064,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps, } if (pred->op == OP_NE) - pred->not = 1; + pred->not ^= 1; pred->fn = fn; return 0; @@ -1582,7 +1552,6 @@ static int fold_pred_tree(struct event_filter *filter, static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, - char *filter_string, bool dry_run) { char *operand1 = NULL, *operand2 = NULL; @@ -1627,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call, continue; } + if (elt->op == OP_NOT) { + if (!n_preds || operand1 || operand2) { + parse_error(ps, 
FILT_ERR_ILLEGAL_NOT_OP, 0); + err = -EINVAL; + goto fail; + } + if (!dry_run) + filter->preds[n_preds - 1].not ^= 1; + continue; + } + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); err = -ENOSPC; @@ -1755,13 +1735,12 @@ struct filter_list { struct event_filter *filter; }; -static int replace_system_preds(struct event_subsystem *system, +static int replace_system_preds(struct ftrace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_file *file; - struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; LIST_HEAD(filter_list); @@ -1769,15 +1748,14 @@ static int replace_system_preds(struct event_subsystem *system, int err; list_for_each_entry(file, &tr->events, list) { - call = file->event_call; - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; /* * Try to see if the filter can be applied * (filter arg is ignored on dry_run) */ - err = replace_preds(call, NULL, ps, filter_string, true); + err = replace_preds(file->event_call, NULL, ps, true); if (err) event_set_no_set_filter_flag(file); else @@ -1787,9 +1765,7 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; - call = file->event_call; - - if (strcmp(call->class->system, system->name) != 0) + if (file->system != dir) continue; if (event_no_set_filter_flag(file)) @@ -1811,7 +1787,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) goto fail_mem; - err = replace_preds(call, filter, ps, filter_string, false); + err = replace_preds(file->event_call, filter, ps, false); if (err) { filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); @@ -1933,7 +1909,7 @@ static int create_filter(struct ftrace_event_call *call, err = create_filter_start(filter_str, set_str, &ps, &filter); if (!err) { - err = 
replace_preds(call, filter, ps, filter_str, false); + err = replace_preds(call, filter, ps, false); if (err && set_str) append_filter_err(ps, filter); } @@ -1959,7 +1935,7 @@ int create_event_filter(struct ftrace_event_call *call, * Identical to create_filter() except that it creates a subsystem filter * and always remembers @filter_str. */ -static int create_system_filter(struct event_subsystem *system, +static int create_system_filter(struct ftrace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { @@ -1969,7 +1945,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, tr, ps, filter_str); + err = replace_system_preds(dir, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -2053,18 +2029,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, tr); + filter_free_subsystem_preds(dir, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system, tr); + filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, tr, filter_string, &filter); + err = create_system_filter(dir, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4747b476a030..8712df9decb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m, { long count = (long)data; - seq_printf(m, "%s", name); + seq_puts(m, name); if (count == -1) seq_puts(m, 
":unlimited"); @@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m, if (filter_str) seq_printf(m, " if %s\n", filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, if (data->filter_str) seq_printf(m, " if %s\n", data->filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4ddde28a81a..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -6,12 +6,10 @@ #include <linux/stringify.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/fs.h> #include "trace_output.h" @@ -179,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ }, \ .event.type = etype, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 57f0ec962d2c..fcd41a166405 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static int update_count(void **data) +static void update_traceon_count(void **data, bool on) { - unsigned long *count = (long *)data; + long *count = (long *)data; + long old_count = *count; - if (!*count) - return 0; + /* + * Tracing gets disabled (or enabled) once per count. + * This function can be called at the same time on multiple CPUs. 
+ * It is fine if both disable (or enable) tracing, as disabling + * (or enabling) the second time doesn't do anything as the + * state of the tracer is already disabled (or enabled). + * What needs to be synchronized in this case is that the count + * only gets decremented once, even if the tracer is disabled + * (or enabled) twice, as the second one is really a nop. + * + * The memory barriers guarantee that we only decrement the + * counter once. First the count is read to a local variable + * and a read barrier is used to make sure that it is loaded + * before checking if the tracer is in the state we want. + * If the tracer is not in the state we want, then the count + * is guaranteed to be the old count. + * + * Next the tracer is set to the state we want (disabled or enabled) + * then a write memory barrier is used to make sure that + * the new state is visible before changing the counter by + * one minus the old counter. This guarantees that another CPU + * executing this code will see the new state before seeing + * the new counter value, and would not do anything if the new + * counter is seen. + * + * Note, there is no synchronization between this and a user + * setting the tracing_on file. But we currently don't care + * about that. + */ + if (!old_count) + return; - if (*count != -1) - (*count)--; + /* Make sure we see count before checking tracing state */ + smp_rmb(); - return 1; + if (on == !!tracing_is_on()) + return; + + if (on) + tracing_on(); + else + tracing_off(); + + /* unlimited? 
*/ + if (old_count == -1) + return; + + /* Make sure tracing state is visible before updating count */ + smp_wmb(); + + *count = old_count - 1; } static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (tracing_is_on()) - return; - - if (update_count(data)) - tracing_on(); + update_traceon_count(data, 1); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; - - if (update_count(data)) - tracing_off(); + update_traceon_count(data, 0); } static void @@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; + long *count = (long *)data; + long old_count; + long new_count; - if (update_count(data)) - trace_dump_stack(STACK_SKIP); + /* + * Stack traces should only execute the number of times the + * user specified in the counter. + */ + do { + + if (!tracing_is_on()) + return; + + old_count = *count; + + if (!old_count) + return; + + /* unlimited? 
*/ + if (old_count == -1) { + trace_dump_stack(STACK_SKIP); + return; + } + + new_count = old_count - 1; + new_count = cmpxchg(count, old_count, new_count); + if (new_count == old_count) + trace_dump_stack(STACK_SKIP); + + } while (new_count != old_count); +} + +static int update_count(void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return 0; + + if (*count != -1) + (*count)--; + + return 1; } static void @@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m, seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4de3e57f723c..a51e79688455 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -6,7 +6,6 @@ * is Copyright (c) Steven Rostedt <srostedt@redhat.com> * */ -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/slab.h> @@ -15,6 +14,33 @@ #include "trace.h" #include "trace_output.h" +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ + return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. 
+ */ +void ftrace_graph_stop(void) +{ + kill_ftrace_graph = true; +} + /* When set, irq functions will be ignored */ static int ftrace_graph_skip_irqs; @@ -80,7 +106,7 @@ enum { FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags); @@ -92,6 +118,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long long calltime; int index; + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + if (!current->ret_stack) return -EBUSY; @@ -121,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, * The curr_ret_stack is initialized to -1 and get increased * in this function. So it can be less than -1 only if it was * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in debugfs by user. + * set from set_graph_notrace file in tracefs by user. */ if (current->curr_ret_stack < -1) return -EBUSY; @@ -323,7 +352,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) { if (tracing_thresh) return 1; @@ -412,7 +441,7 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } -void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) @@ -445,35 +474,32 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static int graph_trace_update_thresh(struct trace_array *tr) +{ + graph_trace_reset(tr); + return graph_trace_init(tr); +} + static int max_bytes_for_cpu; -static enum print_line_t -print_graph_cpu(struct trace_seq *s, int cpu) +static void print_graph_cpu(struct trace_seq *s, int cpu) { - int ret; - /* * Start with a 
space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); } #define TRACE_GRAPH_PROCINFO_LENGTH 14 -static enum print_line_t -print_graph_proc(struct trace_seq *s, pid_t pid) +static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; int spaces = 0; - int ret; int len; int i; @@ -488,56 +514,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid) spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; /* First spaces to align center */ - for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < spaces / 2; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%s-%s", comm, pid_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s-%s", comm, pid_str); /* Last spaces to align center */ - for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = 0; i < spaces - (spaces / 2); i++) + trace_seq_putc(s, ' '); } -static enum print_line_t -print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - if (!trace_seq_putc(s, ' ')) - return 0; - - return trace_print_lat_fmt(s, entry); + trace_seq_putc(s, ' '); + trace_print_lat_fmt(s, entry); } /* If the pid changed since the last trace, output this event */ -static enum print_line_t +static void verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; - int ret; if (!data) - return TRACE_TYPE_HANDLED; + return; last_pid = &(per_cpu_ptr(data->cpu_data, 
cpu)->last_pid); if (*last_pid == pid) - return TRACE_TYPE_HANDLED; + return; prev_pid = *last_pid; *last_pid = pid; if (prev_pid == -1) - return TRACE_TYPE_HANDLED; + return; /* * Context-switch trace line: @@ -546,33 +559,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_puts(s, - " ------------------------------------------\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, prev_pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " => "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, - "\n ------------------------------------------\n\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " ------------------------------------------\n"); + print_graph_cpu(s, cpu); + print_graph_proc(s, prev_pid); + trace_seq_puts(s, " => "); + print_graph_proc(s, pid); + trace_seq_puts(s, "\n ------------------------------------------\n\n"); } static struct ftrace_graph_ret_entry * @@ -646,175 +638,122 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -static int print_graph_abs_time(u64 t, struct trace_seq *s) +static void print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; usecs_rem = do_div(t, NSEC_PER_SEC); usecs_rem /= 1000; - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); + trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } -static enum print_line_t +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { - int ret; struct trace_seq *s = &iter->seq; 
+ struct trace_entry *ent = iter->ent; if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) - return TRACE_TYPE_UNHANDLED; + return; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, pid); + trace_seq_puts(s, " | "); } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); } /* No overhead */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) - ret = trace_seq_puts(s, "==========>"); + trace_seq_puts(s, "==========>"); else - ret = trace_seq_puts(s, "<=========="); + trace_seq_puts(s, "<=========="); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_putc(s, '\n'); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + print_graph_duration(0, s, flags | FLAGS_FILL_END); + trace_seq_putc(s, '\n'); } -enum print_line_t +void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ - char 
msecs_str[21]; + char usecs_str[21]; char nsecs_str[5]; - int ret, len; + int len; int i; - sprintf(msecs_str, "%lu", (unsigned long) duration); + sprintf(usecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, "%s", msecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s", usecs_str); - len = strlen(msecs_str); + len = strlen(usecs_str); /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); snprintf(nsecs_str, slen, "%03lu", nsecs_rem); - ret = trace_seq_printf(s, ".%s", nsecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, ".%s", nsecs_str); len += strlen(nsecs_str); } - ret = trace_seq_puts(s, " us "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = len; i < 7; i++) + trace_seq_putc(s, ' '); } -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags) { - int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION) || !(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return TRACE_TYPE_HANDLED; + return; /* No real adata, just filling the column with spaces */ switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { case FLAGS_FILL_FULL: - ret = trace_seq_puts(s, " | "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " | "); + return; case FLAGS_FILL_START: - ret = trace_seq_puts(s, " "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " "); + return; case FLAGS_FILL_END: - ret = trace_seq_puts(s, " |"); - return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " |"); + return; } /* Signal a overhead of time execution to the output */ - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - ret = trace_seq_puts(s, "! "); - /* Duration exceeded 10 msecs */ - else if (duration > 10000ULL) - ret = trace_seq_puts(s, "+ "); - } - - /* - * The -1 means we either did not exceed the duration tresholds - * or we dont want to print out the overhead. Either way we need - * to fill out the space. - */ - if (ret == -1) - ret = trace_seq_puts(s, " "); - - /* Catching here any failure happenned above */ - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_print_graph_duration(duration, s); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_puts(s, "| "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) + trace_seq_printf(s, "%c ", trace_find_mark(duration)); + else + trace_seq_puts(s, " "); - return TRACE_TYPE_HANDLED; + trace_print_graph_duration(duration, s); + trace_seq_puts(s, "| "); } /* Case of a leaf function on its call entry */ @@ -828,7 +767,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - int ret; int i; graph_ret = &ret_entry->ret; @@ -854,22 +792,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + 
trace_seq_printf(s, "%ps();\n", (void *)call->func); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -879,7 +810,6 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; - int ret; int i; if (data) { @@ -895,19 +825,15 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); - if (!ret) + trace_seq_printf(s, "%ps() {\n", (void *)call->func); + + if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; /* @@ -917,62 +843,43 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_NO_CONSUME; } -static enum print_line_t +static void print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; int cpu = iter->cpu; - int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + verif_pid(s, ent->pid, cpu, data); - if (type) { + if (type) /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return 0; + return; /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return 
TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, ent->pid); + trace_seq_puts(s, " | "); } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - ret = print_graph_lat_fmt(s, ent); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); - return 0; + return; } /* @@ -1090,8 +997,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) @@ -1124,7 +1030,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; - int ret; int i; if (check_irq_return(iter, flags, trace->depth)) @@ -1150,20 +1055,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } } - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Closing 
brace */ - for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* * If the return function does not have a matching entry, @@ -1172,30 +1071,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. Always do * that if the funcgraph-tail option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { - ret = trace_seq_puts(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } else { - ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); /* Overrun */ - if (flags & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)\n", - trace->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_OVERRUN) + trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, - cpu, pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -1212,26 +1101,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 
0) - for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* The comment */ - ret = trace_seq_puts(s, "/* "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, "/* "); switch (iter->ent->type) { case TRACE_BPRINT: @@ -1254,17 +1135,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return ret; } + if (trace_seq_has_overflowed(s)) + goto out; + /* Strip ending newline */ - if (s->buffer[s->len - 1] == '\n') { - s->buffer[s->len - 1] = '\0'; - s->len--; + if (s->buffer[s->seq.len - 1] == '\n') { + s->buffer[s->seq.len - 1] = '\0'; + s->seq.len--; } - ret = trace_seq_puts(s, " */\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " */\n"); + out: + return trace_handle_return(s); } @@ -1371,35 +1253,35 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) print_lat_header(s, flags); /* 1st line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " TIME "); + seq_puts(s, " TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " CPU"); + seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_puts(s, " TASK/PID "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " DURATION "); - seq_printf(s, " FUNCTION CALLS\n"); + seq_puts(s, " DURATION "); + seq_puts(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_puts(s, " | | "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if 
(flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " | | "); - seq_printf(s, " | | | |\n"); + seq_puts(s, " | | "); + seq_puts(s, " | | | |\n"); } -void print_graph_headers(struct seq_file *s) +static void print_graph_headers(struct seq_file *s) { print_graph_headers_flags(s, tracer_flags.val); } @@ -1426,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ struct fgraph_data *data; + gfp_t gfpflags; int cpu; iter->private = NULL; - data = kzalloc(sizeof(*data), GFP_KERNEL); + /* We can be called in atomic context via ftrace_dump() */ + gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; + + data = kzalloc(sizeof(*data), gfpflags); if (!data) goto out_err; - data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags); if (!data->cpu_data) goto out_err_free; @@ -1495,6 +1381,7 @@ static struct trace_event graph_trace_ret_event = { static struct tracer graph_trace __tracer_data = { .name = "function_graph", + .update_thresh = graph_trace_update_thresh, .open = graph_trace_open, .pipe_open = graph_trace_open, .close = graph_trace_close, @@ -1548,12 +1435,12 @@ static const struct file_operations graph_depth_fops = { .llseek = generic_file_llseek, }; -static __init int init_graph_debugfs(void) +static __init int init_graph_tracefs(void) { struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("max_graph_depth", 0644, d_tracer, @@ -1561,7 +1448,7 @@ static __init int init_graph_debugfs(void) return 0; } -fs_initcall(init_graph_debugfs); +fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9bb104f748d0..8523ea345f2b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -10,11 +10,9 @@ * Copyright (C) 2004 Nadia Yvette Chambers 
*/ #include <linux/kallsyms.h> -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> -#include <linux/fs.h> #include "trace.h" diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index bd90e1b06088..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); @@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } - if (!trace_empty(&iter)) - trace_find_next_entry_inc(&iter); - while (!trace_empty(&iter)) { + + while (trace_find_next_entry_inc(&iter)) { if (!cnt) kdb_printf("---------------------------------\n"); cnt++; - if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + if (!skip_lines) { print_trace_line(&iter); - if (!skip_lines) trace_printk_seq(&iter.seq); - else + } else { skip_lines--; + } + if (KDB_FLAG(CMD_INTERRUPT)) goto out; } @@ -86,9 +88,12 @@ out: atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - for_each_tracing_cpu(cpu) - if (iter.buffer_iter[cpu]) + for_each_tracing_cpu(cpu) { + if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); + iter.buffer_iter[cpu] = NULL; + } + } } /* @@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_REPEAT_NONE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] 
[cpu]", + "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f6e4e5539..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) #define fetch_file_offset_string_size NULL /* Fetch type information table */ -const struct fetch_type kprobes_fetch_type_table[] = { +static const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true); + is_return, true, + kprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -826,7 +827,7 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_putc(m, trace_kprobe_is_return(tk) ? 
'r' : 'p'); seq_printf(m, ":%s/%s", tk->tp.call.class->system, ftrace_event_name(&tk->tp.call)); @@ -840,7 +841,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tk->tp.nr_args; i++) seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1024,27 +1025,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; - - if (!trace_seq_puts(s, "\n")) - goto partial; + goto out; - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); } static enum print_line_t @@ -1060,33 +1056,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, " <- ")) - goto partial; + trace_seq_puts(s, " <- "); if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if 
(!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; + goto out; - if (!trace_seq_puts(s, "\n")) - goto partial; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } @@ -1144,11 +1135,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1158,7 +1153,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1175,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1189,7 +1188,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1296,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = 
kprobe_register; call->data = tk; ret = trace_add_event_call(call); @@ -1320,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } -/* Make a debugfs interface for controlling probe points */ +/* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; @@ -1330,23 +1329,23 @@ static __init int init_kprobe_trace(void) return -EINVAL; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, d_tracer, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_events' entry\n"); /* Profile interface */ - entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_profile' entry\n"); return 0; } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b863474..7a9ba62e9fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr) mmio_reset_data(tr); } -static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) { - int ret = 0; int i; resource_size_t start, end; const struct pci_driver *drv = pci_dev_driver(dev); - /* XXX: incomplete checks for trace_seq_printf() return value */ - ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", - dev->bus->number, dev->devfn, - dev->vendor, dev->device, dev->irq); + trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, 
dev->irq); /* * XXX: is pci_resource_to_user() appropriate, since we are * supposed to interpret the __ioremap() phys_addr argument based on @@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) */ for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); } if (drv) - ret += trace_seq_printf(s, " %s\n", drv->name); + trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_puts(s, " \n"); - return ret; + trace_seq_puts(s, " \n"); } static void destroy_header_iter(struct header_iter *hiter) @@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret = 1; trace_assign_type(field, entry); rw = &field->rw; switch (rw->opcode) { case MMIO_READ: - ret = trace_seq_printf(s, + trace_seq_printf(s, "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, @@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; 
default: - ret = trace_seq_puts(s, "rw what?\n"); + trace_seq_puts(s, "rw what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_map(struct trace_iterator *iter) @@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; trace_assign_type(field, entry); m = &field->map; switch (m->opcode) { case MMIO_PROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_puts(s, "map what?\n"); + trace_seq_puts(s, "map what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_mark(struct trace_iterator *iter) @@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; /* The trailing newline must be in the message. 
*/ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t mmio_print_line(struct trace_iterator *iter) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index fcf0a9e48916..8bb2071474dd 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -6,8 +6,6 @@ */ #include <linux/module.h> -#include <linux/fs.h> -#include <linux/debugfs.h> #include <linux/ftrace.h> #include "trace.h" diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f3dad80c20b2..692bf7184c8c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,37 +20,17 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -int trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - int ret; - - ret = seq_write(m, s->buffer, len); - - /* - * Only reset this buffer if we successfully wrote to the - * seq_file buffer. 
- */ - if (!ret) - trace_seq_init(s); - - return ret; -} - enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bputs_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, field->str); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -58,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bprint_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) @@ -74,266 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct print_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. 
- */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (s->full || !len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_printf); - -/** - * trace_seq_bitmask - put a list of longs as a bitmask print output - * @s: trace sequence descriptor - * @maskp: points to an array of unsigned longs that represent a bitmask - * @nmaskbits: The number of bits that are valid in @maskp - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * Writes a ASCII representation of a bitmask string into @s. - */ -int -trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, - int nmaskbits) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_bitmask); - -/** - * trace_seq_vprintf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. 
- */ -int -trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = vsnprintf(s->buffer + s->len, len, fmt, args); + trace_seq_puts(s, field->buf); - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} -EXPORT_SYMBOL_GPL(trace_seq_vprintf); - -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -int trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -int trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - s->buffer[s->len++] = c; - - return 1; -} -EXPORT_SYMBOL(trace_seq_putc); - -int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) -{ - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - int i, j; - - if (s->full) - return 0; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -void *trace_seq_reserve(struct trace_seq *s, size_t len) -{ - void *ret; - - if (s->full) - return NULL; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return NULL; - } - - ret = s->buffer + s->len; - s->len += len; - - return ret; -} - -int trace_seq_path(struct trace_seq *s, const struct path *path) -{ - unsigned char *p; - - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - s->full = 1; - return 0; + return trace_handle_return(s); } const char * @@ -343,7 +66,7 @@ 
ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -379,7 +102,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -390,9 +113,9 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%lx", val); - + trace_seq_putc(p, 0); return ret; @@ -405,7 +128,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; symbol_array[i].name; i++) { @@ -416,7 +139,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (ret == (const char *)(p->buffer + p->len)) + if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); @@ -430,7 +153,7 @@ const char * ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size) { - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); trace_seq_putc(p, 0); @@ -443,7 +166,7 @@ const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; - const char *ret = p->buffer + p->len; + const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) trace_seq_printf(p, "%s%2.2x", i == 0 ? 
"" : " ", buf[i]); @@ -454,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { @@ -461,7 +228,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct trace_seq *p = &iter->tmp_seq; struct trace_entry *entry; - int ret; event = container_of(trace_event, struct ftrace_event_call, event); entry = iter->ent; @@ -472,11 +238,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - return 0; + return trace_handle_return(s); } EXPORT_SYMBOL(ftrace_raw_output_prep); @@ -484,18 +248,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; - int ret; - ret = trace_seq_printf(s, "%s: ", name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", 
name); + trace_seq_vprintf(s, fmt, ap); - ret = trace_seq_vprintf(s, fmt, ap); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) @@ -528,7 +285,7 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static int +static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS @@ -539,12 +296,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } -static int +static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -555,9 +311,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, sprint_symbol(str, address); name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } #ifndef CONFIG_64BIT @@ -588,14 +343,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (file) { ret = trace_seq_path(s, &file->f_path); if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); + trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); } up_read(&mm->mmap_sem); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + return !trace_seq_has_overflowed(s); } int @@ -603,7 +358,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags) { struct mm_struct *mm = NULL; - int ret = 1; unsigned int i; if (trace_flags & TRACE_ITER_SYM_USEROBJ) { @@ -622,48 +376,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = 
entry->caller[i]; - if (ip == ULONG_MAX || !ret) + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) break; - if (ret) - ret = trace_seq_puts(s, " => "); + + trace_seq_puts(s, " => "); + if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - if (ret) - ret = trace_seq_putc(s, '\n'); + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); continue; } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_putc(s, '\n'); + + seq_print_user_ip(s, mm, ip, sym_flags); + trace_seq_putc(s, '\n'); } if (mm) mmput(mm); - return ret; + + return !trace_seq_has_overflowed(s); } int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { - int ret; - - if (!ip) - return trace_seq_putc(s, '0'); + if (!ip) { + trace_seq_putc(s, '0'); + goto out; + } if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); + seq_print_sym_offset(s, "%s", ip); else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; + seq_print_sym_short(s, "%s", ip); if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + + out: + return !trace_seq_has_overflowed(s); } /** @@ -681,7 +432,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; - int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -713,16 +463,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 
's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) - return 0; + trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq); if (entry->preempt_count) - ret = trace_seq_printf(s, "%x", entry->preempt_count); + trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_putc(s, '.'); + trace_seq_putc(s, '.'); - return ret; + return !trace_seq_has_overflowed(s); } static int @@ -732,14 +481,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - if (!trace_seq_printf(s, "%8.8s-%-5d %3d", - comm, entry->pid, cpu)) - return 0; + trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh_us = 100; +#undef MARK +#define MARK(v, s) {.val = v, .sym = s} +/* trace overhead mark */ +static const struct trace_mark { + unsigned long long val; /* unit: nsec */ + char sym; +} mark[] = { + MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(1000000ULL , '#'), /* 1000 usecs */ + MARK(100000ULL , '!'), /* 100 usecs */ + MARK(10000ULL , '+'), /* 10 usecs */ +}; +#undef MARK + +char trace_find_mark(unsigned long long d) +{ + int i; + int size = ARRAY_SIZE(mark); + + for (i = 0; i < size; i++) { + if (d >= mark[i].val) + break; + } + + return (i == size) ? 
' ' : mark[i].sym; +} static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) @@ -761,24 +534,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); unsigned long rel_msec = (unsigned long)rel_ts; - return trace_seq_printf( - s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", - ns2usecs(iter->ts), - abs_msec, abs_usec, - rel_msec, rel_usec); + trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { - return trace_seq_printf( - s, "[%016llx] %lld (+%lld): ", - iter->ts, abs_ts, rel_ts); + trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { - return trace_seq_printf( - s, " %4lldus%c: ", - abs_ts, - rel_ts > preempt_mark_thresh_us ? '!' : - rel_ts > 1 ? '+' : ' '); + trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + trace_find_mark(rel_ts * NSEC_PER_USEC)); + } else { /* !verbose && !in_ns */ - return trace_seq_printf(s, " %4lld: ", abs_ts); + trace_seq_printf(s, " %4lld: ", abs_ts); } + + return !trace_seq_has_overflowed(s); } int trace_print_context(struct trace_iterator *iter) @@ -788,34 +565,29 @@ int trace_print_context(struct trace_iterator *iter) unsigned long long t; unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; - int ret; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (!ret) - return 0; - if (trace_flags & TRACE_ITER_IRQ_INFO) { - ret = trace_print_lat_fmt(s, entry); - if (!ret) - return 0; - } + if (trace_flags & TRACE_ITER_IRQ_INFO) + trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { t = ns2usecs(iter->ts); usec_rem = do_div(t, USEC_PER_SEC); secs = (unsigned long)t; - return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + trace_seq_printf(s, " 
%5lu.%06lu: ", secs, usec_rem); } else - return trace_seq_printf(s, " %12llu: ", iter->ts); + trace_seq_printf(s, " %12llu: ", iter->ts); + + return !trace_seq_has_overflowed(s); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; - int ret; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; @@ -835,18 +607,17 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", - comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { - ret = lat_print_generic(s, entry, iter->cpu); + lat_print_generic(s, entry, iter->cpu); } - if (ret) - ret = lat_print_timestamp(iter, next_ts); + lat_print_timestamp(iter, next_ts); - return ret; + return !trace_seq_has_overflowed(s); } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; @@ -960,7 +731,7 @@ int register_ftrace_event(struct trace_event *event) goto out; } else { - + event->type = next_event_type++; list = &ftrace_event_list; } @@ -1032,10 +803,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { - if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } /* TRACE_FN */ @@ -1047,24 +817,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_puts(s, " <-")) 
- goto partial; - if (!seq_print_ip_sym(s, - field->parent_ip, - flags)) - goto partial; + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); } - if (!trace_seq_putc(s, '\n')) - goto partial; - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, @@ -1074,12 +836,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "%lx %lx\n", - field->ip, - field->parent_ip)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, @@ -1090,10 +851,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + SEQ_PUT_HEX_FIELD(s, field->ip); + SEQ_PUT_HEX_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, @@ -1104,10 +865,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); + SEQ_PUT_FIELD(s, field->ip); + SEQ_PUT_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_fn_funcs = { @@ -1136,18 +897,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, T = task_state_char(field->next_state); S = task_state_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); - if 
(!trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, @@ -1172,17 +932,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) if (!S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, @@ -1210,15 +969,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); + SEQ_PUT_HEX_FIELD(s, field->prev_pid); + SEQ_PUT_HEX_FIELD(s, field->prev_prio); + SEQ_PUT_HEX_FIELD(s, S); + SEQ_PUT_HEX_FIELD(s, field->next_cpu); + SEQ_PUT_HEX_FIELD(s, field->next_pid); + SEQ_PUT_HEX_FIELD(s, field->next_prio); + SEQ_PUT_HEX_FIELD(s, 
T); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, @@ -1241,14 +1000,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); + SEQ_PUT_FIELD(s, field->prev_pid); + SEQ_PUT_FIELD(s, field->prev_prio); + SEQ_PUT_FIELD(s, field->prev_state); + SEQ_PUT_FIELD(s, field->next_cpu); + SEQ_PUT_FIELD(s, field->next_pid); + SEQ_PUT_FIELD(s, field->next_prio); + SEQ_PUT_FIELD(s, field->next_state); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_ctx_funcs = { @@ -1288,23 +1048,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); - if (!trace_seq_puts(s, "<stack trace>\n")) - goto partial; + trace_seq_puts(s, "<stack trace>\n"); for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { - if (!trace_seq_puts(s, " => ")) - goto partial; - if (!seq_print_ip_sym(s, *p, flags)) - goto partial; - if (!trace_seq_putc(s, '\n')) - goto partial; - } + if (trace_seq_has_overflowed(s)) + break; - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " => "); + seq_print_ip_sym(s, *p, flags); + trace_seq_putc(s, '\n'); + } - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_stack_funcs = { @@ -1325,16 +1081,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "<user stack trace>\n")) - goto partial; - - if (!seq_print_userip_objs(field, s, flags)) - goto 
partial; + trace_seq_puts(s, "<user stack trace>\n"); + seq_print_userip_objs(field, s, flags); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_user_stack_funcs = { @@ -1357,19 +1107,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_puts(s, field->str)) - goto partial; - - return TRACE_TYPE_HANDLED; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_puts(s, field->str); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1382,16 +1124,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_puts(s, field->str); - if (!trace_seq_puts(s, field->str)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bputs_funcs = { @@ -1415,19 +1151,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_puts(s, ": ")) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_bprintf(s, field->fmt, field->buf); - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1440,16 +1168,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_bprintf(s, 
field->fmt, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_bprintf(s, field->fmt, field->buf); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bprint_funcs = { @@ -1471,16 +1193,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_printf(s, ": %s", field->buf); - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, @@ -1490,13 +1206,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(&iter->seq); } static struct trace_event_functions trace_print_funcs = { diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 127a9d8c8357..8ef2c40efb3c 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,21 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); extern int __unregister_ftrace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - BUILD_BUG_ON(sizeof(x) > 
MAX_MEMHEX_BYTES); \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) +#define SEQ_PUT_FIELD(s, x) \ + trace_seq_putmem(s, &(x), sizeof(x)) + +#define SEQ_PUT_HEX_FIELD(s, x) \ + trace_seq_putmem_hex(s, &(x), sizeof(x)) #endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65c..36c1455b7567 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -5,7 +5,6 @@ * */ #include <linux/seq_file.h> -#include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/ftrace.h> @@ -15,7 +14,6 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/fs.h> #include "trace.h" @@ -305,7 +303,7 @@ static int t_show(struct seq_file *m, void *v) seq_puts(m, "\\t"); break; case '\\': - seq_puts(m, "\\"); + seq_putc(m, '\\'); break; case '"': seq_puts(m, "\\\""); @@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("printk_formats", 0444, d_tracer, diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d4b9fc22cd27..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -40,7 +40,8 @@ const char *reserved_field_names[] = { int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + return !trace_seq_has_overflowed(s); \ } \ const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); @@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, int len = *(u32 *)data >> 16; if (!len) - return trace_seq_printf(s, " %s=(fault)", name); + trace_seq_printf(s, " %s=(fault)", name); else - 
return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); + trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); + return !trace_seq_has_overflowed(s); } NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); @@ -354,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe) + struct fetch_param *f, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; int ret = 0; - ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - switch (arg[0]) { case '$': ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); @@ -445,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, dprm->fetch_size = get_fetch_size_function(t, dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe); + is_kprobe, ftbl); if (ret) kfree(dprm); else { @@ -503,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe) + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; const char *t; int ret; - ftbl = is_kprobe ? 
kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -533,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, } parg->offset = *size; *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, + is_kprobe, ftbl); if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -25,7 +25,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/debugfs.h> +#include <linux/tracefs.h> #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -/* - * Fetch type information table. - * It's declared as a weak symbol due to conditional compilation. 
- */ -extern __weak const struct fetch_type kprobes_fetch_type_table[]; -extern __weak const struct fetch_type uprobes_fetch_type_table[]; - #ifdef CONFIG_KPROBE_EVENT struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) } extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe); + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9b40f3..419ca37e72c9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -5,8 +5,6 @@ * */ #include <linux/module.h> -#include <linux/fs.h> -#include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> @@ -14,122 +12,26 @@ #include "trace.h" -static struct trace_array *ctx_trace; -static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); -static int sched_stopped; - - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if 
(!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); -} static void probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); - - local_irq_restore(flags); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - - event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu, pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(current); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); - - if 
(likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, wakee, current, - flags, pc); - - local_irq_restore(flags); } static int tracing_sched_register(void) @@ -197,51 +99,3 @@ void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(); } - -/** - * tracing_start_sched_switch_record - start tracing context switches - * - * Turns on context switch tracing for a tracer. - */ -void tracing_start_sched_switch_record(void) -{ - if (unlikely(!ctx_trace)) { - WARN_ON(1); - return; - } - - tracing_start_sched_switch(); - - mutex_lock(&sched_register_mutex); - tracer_enabled++; - mutex_unlock(&sched_register_mutex); -} - -/** - * tracing_stop_sched_switch_record - start tracing context switches - * - * Turns off context switch tracing for a tracer. - */ -void tracing_stop_sched_switch_record(void) -{ - mutex_lock(&sched_register_mutex); - tracer_enabled--; - WARN_ON(tracer_enabled < 0); - mutex_unlock(&sched_register_mutex); - - tracing_stop_sched_switch(); -} - -/** - * tracing_sched_switch_assign_trace - assign a trace array for ctx switch - * @tr: trace array pointer to assign - * - * Some tracers might want to record the context switches in their - * trace. This function lets those tracers assign the trace array - * to use. 
- */ -void tracing_sched_switch_assign_trace(struct trace_array *tr) -{ - ctx_trace = tr; -} - diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 19bd8928ce94..d6e1003724e9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -10,8 +10,6 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/module.h> -#include <linux/fs.h> -#include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> @@ -365,6 +363,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) wakeup_current_cpu = cpu; } +static void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = 
curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + static void notrace probe_wakeup_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..b0f86ea77881 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; tracing_start(); /* we should only have one item */ @@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; trace->reset(tr); tracing_start(); @@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #endif #ifdef CONFIG_SCHED_TRACER + +struct wakeup_test_data { + struct completion is_ready; + int go; +}; + static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ @@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data) .sched_deadline = 10000000ULL, .sched_period = 10000000ULL }; - struct completion *x = data; + struct wakeup_test_data *x = data; sched_setattr(current, &attr); /* Make it know we have a new prio */ - complete(x); + complete(&x->is_ready); /* now go to sleep and let the test wake us up */ set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!x->go) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } - complete(x); + complete(&x->is_ready); + + set_current_state(TASK_INTERRUPTIBLE); /* 
we are awake, now wait to disappear */ while (!kthread_should_stop()) { - /* - * This will likely be the system top priority - * task, do short sleeps to let others run. - */ - msleep(100); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); + return 0; } - int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tr->max_latency; struct task_struct *p; - struct completion is_ready; + struct wakeup_test_data data; unsigned long count; int ret; - init_completion(&is_ready); + memset(&data, 0, sizeof(data)); + + init_completion(&data.is_ready); /* create a -deadline thread */ - p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); + p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } /* make sure the thread is running at -deadline policy */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); } - init_completion(&is_ready); + init_completion(&data.is_ready); + + data.go = 1; + /* memory barrier is in the wake_up_process() */ wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* stop the tracing. 
*/ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(&tr->trace_buffer, NULL); - printk("ret = %d\n", ret); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 000000000000..e694c9f9efa4 --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,377 @@ +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + * + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) + +/* How much buffer is written? */ +#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) + +/* + * trace_seq should work with being initialized with 0s. 
+ */ +static inline void __trace_seq_init(struct trace_seq *s) +{ + if (unlikely(!s->seq.size)) + trace_seq_init(s); +} + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int ret; + + __trace_seq_init(s); + + ret = seq_buf_print_seq(m, &s->seq); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. This lets the caller try again or + * do something else with the contents. + */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+{ + unsigned int save_len = s->seq.len; + va_list ap; + + if (s->full) + return; + + __trace_seq_init(s); + + va_start(ap, fmt); + seq_buf_vprintf(&s->seq, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + */ +void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. 
+ */ +void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_vprintf(&s->seq, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. + * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. + */ +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_bprintf(&s->seq, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. 
+ */ +void trace_seq_puts(struct trace_seq *s, const char *str) +{ + unsigned int len = strlen(str); + + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, str, len); +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple charater + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +void trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return; + } + + seq_buf_putc(&s->seq, c); +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + */ +void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, mem, len); +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. 
+ */ +void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + /* Each byte is represented by two chars */ + if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + /* The added spaces can still cause an overflow */ + seq_buf_putmem_hex(&s->seq, mem, len); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. + * + * Returns 1 if we successfully wrote all the contents to + * the buffer. + * Returns 0 if the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_path(&s->seq, path, "\n"); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the sequence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), whichever comes first. + * + * On success, it returns a positive number of the number of bytes + * it copied. 
+ * + * On failure it returns -EBUSY if all of the content in the + * sequence has been already read, which includes nothing in the + * sequence (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. + */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ + __trace_seq_init(s); + return seq_buf_to_user(&s->seq, ubuf, cnt); +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -7,13 +7,10 @@ #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/uaccess.h> -#include <linux/debugfs.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/init.h> -#include <linux/fs.h> -#include <linux/magic.h> #include <asm/setup.h> @@ -171,8 +168,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } @@ -331,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) local_irq_enable(); } -static int trace_lookup_stack(struct seq_file *m, long i) +static void trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pS\n", (void *)addr); + seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) @@ -464,7 +460,7 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("stack_max_size", 0644, d_tracer, diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 7af67360b330..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -12,7 +12,7 @@ #include <linux/list.h> #include <linux/slab.h> #include 
<linux/rbtree.h> -#include <linux/debugfs.h> +#include <linux/tracefs.h> #include "trace_stat.h" #include "trace.h" @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) static void destroy_session(struct stat_session *session) { - debugfs_remove(session->file); + tracefs_remove(session->file); __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); @@ -276,12 +276,12 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); - if (!d_tracing) + if (IS_ERR(d_tracing)) return 0; - stat_dir = debugfs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'trace_stat' entry\n"); return 0; } @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && tracing_stat_init()) return -ENODEV; - session->file = debugfs_create_file(session->ts->name, 0644, + session->file = tracefs_create_file(session->ts->name, 0644, stat_dir, session, &tracing_stat_fops); if (!session->file) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 759d5e004517..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, ret, syscall; + int i, syscall; trace = (typeof(trace))ent; syscall = trace->nr; @@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } - ret = trace_seq_printf(s, "%s(", entry->name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) { - ret = 
trace_seq_printf(s, "%s ", entry->types[i]); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", entry->types[i]); + /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: %lx%s", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? "" : ", "); } - ret = trace_seq_putc(s, ')'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - + trace_seq_putc(s, ')'); end: - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, struct syscall_trace_exit *trace; int syscall; struct syscall_metadata *entry; - int ret; trace = (typeof(trace))ent; syscall = trace->nr; @@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, if (!entry) { trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + goto out; } if (entry->exit_event->event.type != ent->type) { @@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return TRACE_TYPE_UNHANDLED; } - ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + out: + return trace_handle_return(s); } extern char *__bad_type_size(void); @@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long 
ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - rcu_assign_pointer(tr->enter_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - rcu_assign_pointer(tr->exit_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); @@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -static int __init init_ftrace_syscalls(void) +void __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; @@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void) GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return -ENOMEM; + return; } for (i = 0; i < NR_syscalls; i++) { @@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void) meta->syscall_nr = i; syscalls_metadata[i] = meta; } - - return 0; } -early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS @@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -586,7 +574,7 @@ static void perf_syscall_enter(void *ignore, 
struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; @@ -659,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3c9b97e6b1f4..d60fe62ec4fa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) DEFINE_FETCH_file_offset(string_size) /* Fetch type information table */ -const struct fetch_type uprobes_fetch_type_table[] = { +static const struct fetch_type uprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -536,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false); + is_return, false, + uprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); goto error; @@ -553,8 +553,7 @@ error: return ret; fail_address_parse: - if (inode) - iput(inode); + iput(inode); pr_info("Failed to parse address or file.\n"); @@ -607,7 +606,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -853,16 +852,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[1], entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -870,14 +867,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct probe_arg *parg = &tu->tp.args[i]; if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto partial; + goto out; } - if (trace_seq_puts(s, "\n")) - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); -partial: - return TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } typedef bool (*filter_func_t)(struct uprobe_consumer *self, @@ -1010,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1020,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) 
static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1028,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1051,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). 
current->mm must be probed and we can rely @@ -1116,7 +1112,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; @@ -1292,7 +1288,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); @@ -1326,7 +1322,7 @@ static __init int init_uprobe_trace(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); - if (!d_tracer) + if (IS_ERR(d_tracer)) return 0; trace_create_file("uprobe_events", 0644, d_tracer, diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1b1327..975cb49e32bf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - struct timespec uptime, ts; cputime_t utime, stime, utimescaled, stimescaled; - u64 ac_etime; + u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime = delta; + /* Convert to seconds for btime */ + do_div(delta, USEC_PER_SEC); + stats->ac_btime = get_seconds() - delta; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) diff --git a/kernel/uid16.c b/kernel/uid16.c 
index 602e5bbbceff..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); void user_return_notifier_register(struct user_return_notifier *urn) { set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); - hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); + hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); } EXPORT_SYMBOL_GPL(user_return_notifier_register); @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); void user_return_notifier_unregister(struct user_return_notifier *urn) { hlist_del(&urn->link); - if (hlist_empty(&__get_cpu_var(return_notifier_list))) + if (hlist_empty(this_cpu_ptr(&return_notifier_list))) clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); } EXPORT_SYMBOL_GPL(user_return_notifier_unregister); diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e44..b069ccbfb0b0 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .proc_inum = PROC_USER_INIT_INO, + .ns.inum = PROC_USER_INIT_INO, +#ifdef CONFIG_USER_NS + .ns.ops = &userns_operations, +#endif + .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..4109f8320684 100644 --- 
a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -24,6 +24,7 @@ #include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - ret = proc_alloc_inum(&ns->proc_inum); + ret = ns_alloc_inum(&ns->ns); if (ret) { kmem_cache_free(user_ns_cachep, ns); return ret; } + ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ + mutex_lock(&userns_state_mutex); + ns->flags = parent_ns->flags; + mutex_unlock(&userns_state_mutex); + set_cred_user_ns(new, ns); #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); ns = parent; } while (atomic_dec_and_test(&parent->count)); @@ -526,21 +533,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } - -static 
DEFINE_MUTEX(id_map_mutex); - static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; /* - * The id_map_mutex serializes all writes to any given map. + * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); ret = -EPERM; /* Only allow one successful write to the map */ @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!page) goto out; - /* Only allow <= page size writes at the beginning of the file */ + /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, *ppos = count; ret = count; out: - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); if (page) free_page(page); return ret; @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + const struct cred *cred = file->f_cred; + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. 
+ */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && + uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->fsuid)) + if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, file->f_cred->fsgid)) + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && + gid_eq(gid, cred->egid)) return true; } } @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, return false; } -static void *userns_get(struct task_struct *task) +int proc_setgroups_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + unsigned long userns_flags = ACCESS_ONCE(ns->flags); + + seq_printf(seq, "%s\n", + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? + "allow" : "deny"); + return 0; +} + +ssize_t proc_setgroups_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + char kbuf[8], *pos; + bool setgroups_allowed; + ssize_t ret; + + /* Only allow a very narrow range of strings to be written */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= sizeof(kbuf))) + goto out; + + /* What was written? */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + ret = -EINVAL; + if (strncmp(pos, "allow", 5) == 0) { + pos += 5; + setgroups_allowed = true; + } + else if (strncmp(pos, "deny", 4) == 0) { + pos += 4; + setgroups_allowed = false; + } + else + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + ret = -EPERM; + mutex_lock(&userns_state_mutex); + if (setgroups_allowed) { + /* Enabling setgroups after setgroups has been disabled + * is not allowed. 
+ */ + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) + goto out_unlock; + } else { + /* Permanently disabling setgroups after setgroups has + * been enabled by writing the gid_map is not allowed. + */ + if (ns->gid_map.nr_extents != 0) + goto out_unlock; + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; + } + mutex_unlock(&userns_state_mutex); + + /* Report a successful write */ + *ppos = count; + ret = count; +out: + return ret; +out_unlock: + mutex_unlock(&userns_state_mutex); + goto out; +} + +bool userns_may_setgroups(const struct user_namespace *ns) +{ + bool allowed; + + mutex_lock(&userns_state_mutex); + /* It is not safe to use setgroups until a gid mapping in + * the user namespace has been established. + */ + allowed = ns->gid_map.nr_extents != 0; + /* Is setgroups allowed? */ + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); + mutex_unlock(&userns_state_mutex); + + return allowed; +} + +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + +static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); - return user_ns; + return user_ns ? 
&user_ns->ns : NULL; } -static void userns_put(void *ns) +static void userns_put(struct ns_common *ns) { - put_user_ns(ns); + put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, void *ns) +static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - struct user_namespace *user_ns = ns; + struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } -static unsigned int userns_inum(void *ns) -{ - struct user_namespace *user_ns = ns; - return user_ns->proc_inum; -} - const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, - .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..831ea7108232 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) { kfree(ns); return ERR_PTR(err); } + ns->ns.ops = &utsns_operations; + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -84,34 +86,39 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kfree(ns); } -static void *utsns_get(struct task_struct *task) +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); 
+ task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); - return ns; + return ns ? &ns->ns : NULL; } -static void utsns_put(void *ns) +static void utsns_put(struct ns_common *ns) { - put_uts_ns(ns); + put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, void *new) +static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) { - struct uts_namespace *ns = new; + struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } -static unsigned int utsns_inum(void *vp) -{ - struct uts_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, - .inum = utsns_inum, }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -15,11 +15,6 @@ #include <linux/cpu.h> #include <linux/nmi.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/smpboot.h> @@ -29,8 +24,33 @@ #include <linux/kvm_para.h> #include <linux/perf_event.h> -int watchdog_user_enabled = 1; +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. 
+ * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#else +static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +#endif +int __read_mostly nmi_watchdog_enabled; +int __read_mostly soft_watchdog_enabled; +int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #else @@ -47,6 +67,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); +static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); @@ -62,6 +83,18 @@ static unsigned long soft_lockup_nmi_warn; #ifdef CONFIG_HARDLOCKUP_DETECTOR static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. 
This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} static int __init hardlockup_panic_setup(char *str) { @@ -70,7 +103,9 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_user_enabled = 0; + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -89,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); -/* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_user_enabled = 0; + watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; return 1; } __setup("nosoftlockup", nosoftlockup_setup); -/* */ + #ifdef CONFIG_SMP static int __init softlockup_all_cpu_backtrace_setup(char *str) { @@ -131,7 +165,7 @@ static int get_softlockup_thresh(void) */ static unsigned long get_timestamp(void) { - return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ + return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) @@ -185,7 +219,7 @@ void touch_nmi_watchdog(void) * case we shouldn't have to worry about the watchdog * going off. 
*/ - __raw_get_cpu_var(watchdog_nmi_touch) = true; + raw_cpu_write(watchdog_nmi_touch, true); touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -194,8 +228,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); void touch_softlockup_watchdog_sync(void) { - __raw_get_cpu_var(softlockup_touch_sync) = true; - __raw_get_cpu_var(watchdog_touch_ts) = 0; + __this_cpu_write(softlockup_touch_sync, true); + __this_cpu_write(watchdog_touch_ts, 0); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -216,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + get_softlockup_thresh())) - return now - touch_ts; - + if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + /* Warn about unreasonable delays. */ + if (time_after(now, touch_ts + get_softlockup_thresh())) + return now - touch_ts; + } return 0; } @@ -260,9 +295,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -331,8 +368,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) + if (__this_cpu_read(soft_watchdog_warn) == true) { + /* + * When multiple processes are causing softlockups the + * softlockup detector only warns on the first one + * because the code relies on a full quiet cycle to + * re-arm. The second process prevents the quiet cycle + * and never gets reported. Use task pointers to detect + * this. 
+ */ + if (__this_cpu_read(softlockup_task_ptr_saved) != + current) { + __this_cpu_write(soft_watchdog_warn, false); + __touch_watchdog(); + } return HRTIMER_RESTART; + } if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already @@ -345,9 +396,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); if (regs) @@ -366,6 +418,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -384,7 +437,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -404,7 +457,7 @@ static void watchdog_enable(unsigned int cpu) static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); @@ -436,6 +489,21 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); + + /* + * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the + * failure path. 
Check for failures that can occur asynchronously - + * for example, when CPUs are on-lined - and shut down the hardware + * perf event on each CPU accordingly. + * + * The only non-obvious place this bit can be cleared is through + * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a + * pr_info here would be too noisy as it would result in a message + * every few seconds if the hardlockup was disabled but the softlockup + * enabled. + */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + watchdog_nmi_disable(cpu); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -451,6 +519,10 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + /* is it already setup and enabled? */ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -476,6 +548,18 @@ static int watchdog_nmi_enable(unsigned int cpu) goto out_save; } + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. 
+ */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + /* skip displaying the same error again */ if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) return PTR_ERR(event); @@ -484,11 +568,14 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + return PTR_ERR(event); /* success path */ @@ -511,11 +598,42 @@ static void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); } - return; + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. 
*/ + cpu0_err = 0; + } +} + +void watchdog_nmi_enable_all(void) +{ + int cpu; + + if (!watchdog_user_enabled) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_enable(cpu); + put_online_cpus(); +} + +void watchdog_nmi_disable_all(void) +{ + int cpu; + + if (!watchdog_running) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_disable(cpu); + put_online_cpus(); } #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } +void watchdog_nmi_enable_all(void) {} +void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -531,7 +649,7 @@ static struct smp_hotplug_thread watchdog_threads = { static void restart_watchdog_hrtimer(void *info) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); int ret; /* @@ -546,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) HRTIMER_MODE_REL_PINNED); } -static void update_timers(int cpu) +static void update_watchdog(int cpu) { /* * Make sure that perf event counter will adopt to a new @@ -561,17 +679,17 @@ static void update_timers(int cpu) watchdog_nmi_enable(cpu); } -static void update_timers_all_cpus(void) +static void update_watchdog_all_cpus(void) { int cpu; get_online_cpus(); for_each_online_cpu(cpu) - update_timers(cpu); + update_watchdog(cpu); put_online_cpus(); } -static int watchdog_enable_all_cpus(bool sample_period_changed) +static int watchdog_enable_all_cpus(void) { int err = 0; @@ -581,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; - } else if (sample_period_changed) { - update_timers_all_cpus(); + } else { + /* + * Enable/disable the lockup detectors or + * change the sample period 'on the fly'. 
+ */ + update_watchdog_all_cpus(); } return err; @@ -600,39 +722,149 @@ static void watchdog_disable_all_cpus(void) } /* - * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh + * Update the run state of the lockup detectors. + */ +static int proc_watchdog_update(void) +{ + int err = 0; + + /* + * Watchdog threads won't be started if they are already active. + * The 'watchdog_running' variable in watchdog_*_all_cpus() takes + * care of this. If those threads are already active, the sample + * period will be updated and the lockup detectors will be enabled + * or disabled 'on the fly'. + */ + if (watchdog_enabled && watchdog_thresh) + err = watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + + return err; + +} + +static DEFINE_MUTEX(watchdog_proc_mutex); + +/* + * common function for watchdog, nmi_watchdog and soft_watchdog parameter + * + * caller | table->data points to | 'which' contains the flag(s) + * -------------------|-----------------------|----------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed + * | | with SOFT_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED + * -------------------|-----------------------|----------------------------- + * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + */ +static int proc_watchdog_common(int which, struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err, old, new; + int *watchdog_param = (int *)table->data; + + mutex_lock(&watchdog_proc_mutex); + + /* + * If the parameter is being read return the state of the corresponding + * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the + * run state of the lockup detectors. 
+ */ + if (!write) { + *watchdog_param = (watchdog_enabled & which) != 0; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } else { + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err) + goto out; + + /* + * There is a race window between fetching the current value + * from 'watchdog_enabled' and storing the new value. During + * this race window, watchdog_nmi_enable() can sneak in and + * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. + * The 'cmpxchg' detects this race and the loop retries. + */ + do { + old = watchdog_enabled; + /* + * If the parameter value is not zero set the + * corresponding bit(s), else clear it(them). + */ + if (*watchdog_param) + new = old | which; + else + new = old & ~which; + } while (cmpxchg(&watchdog_enabled, old, new) != old); + + /* + * Update the run state of the lockup detectors. + * Restore 'watchdog_enabled' on failure. + */ + err = proc_watchdog_update(); + if (err) + watchdog_enabled = old; + } +out: + mutex_unlock(&watchdog_proc_mutex); + return err; +} + +/* + * /proc/sys/kernel/watchdog + */ +int proc_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} + +/* + * /proc/sys/kernel/soft_watchdog */ +int proc_soft_watchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + table, write, buffer, lenp, ppos); +} -int proc_dowatchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +/* + * /proc/sys/kernel/watchdog_thresh + */ +int 
proc_watchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old_thresh, old_enabled; - static DEFINE_MUTEX(watchdog_proc_mutex); + int err, old; mutex_lock(&watchdog_proc_mutex); - old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_user_enabled); + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err || !write) goto out; - set_sample_period(); /* - * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_running' variable check in - * watchdog_*_all_cpus() function takes care of this. + * Update the sample period. + * Restore 'watchdog_thresh' on failure. */ - if (watchdog_user_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - else - watchdog_disable_all_cpus(); - - /* Restore old values on failure */ - if (err) { - watchdog_thresh = old_thresh; - watchdog_user_enabled = old_enabled; - } + set_sample_period(); + err = proc_watchdog_update(); + if (err) + watchdog_thresh = old; out: mutex_unlock(&watchdog_proc_mutex); return err; @@ -643,6 +875,6 @@ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_user_enabled) - watchdog_enable_all_cpus(false); + if (watchdog_enabled) + watchdog_enable_all_cpus(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 35974ac69600..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -159,6 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ + struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -230,7 +231,7 @@ struct wq_device; */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of 
this wq */ - struct list_head list; /* PL: list of all workqueues */ + struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ @@ -257,6 +258,13 @@ struct workqueue_struct { #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + /* + * Destruction of workqueue_struct is sched-RCU protected to allow + * walking the workqueues list without grabbing wq_pool_mutex. + * This is used to dump all workqueues from sysrq. + */ + struct rcu_head rcu; + /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ @@ -265,7 +273,6 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ @@ -289,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? 
*/ /* the per-cpu worker pools */ @@ -325,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); +static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -758,13 +766,6 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; - /* - * nr_idle and idle_list may disagree if idle rebinding is in - * progress. Never return %true if idle_list is empty. - */ - if (list_empty(&pool->idle_list)) - return false; - return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -850,7 +851,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) return NULL; /* @@ -874,35 +875,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set - * @wakeup: wakeup an idle worker if necessary * - * Set @flags in @worker->flags and adjust nr_running accordingly. If - * nr_running becomes zero and @wakeup is %true, an idle worker is - * woken up. + * Set @flags in @worker->flags and adjust nr_running accordingly. 
* * CONTEXT: * spin_lock_irq(pool->lock) */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags, - bool wakeup) +static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); - /* - * If transitioning into NOT_RUNNING, adjust nr_running and - * wake up an idle worker as necessary if requested by - * @wakeup. - */ + /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - if (wakeup) { - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - wake_up_worker(pool); - } else - atomic_dec(&pool->nr_running); + atomic_dec(&pool->nr_running); } worker->flags |= flags; @@ -1232,7 +1220,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_delayed_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); + pwq_dec_nr_in_flight(pwq, get_work_color(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -1560,7 +1548,7 @@ static void worker_enter_idle(struct worker *worker) (worker->hentry.next || worker->hentry.pprev))) return; - /* can't use worker_set_flags(), also called from start_worker() */ + /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; @@ -1602,11 +1590,11 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -static struct worker *alloc_worker(void) +static struct worker *alloc_worker(int node) { struct worker *worker; - worker = kzalloc(sizeof(*worker), GFP_KERNEL); + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); @@ -1670,6 +1658,9 @@ static void 
worker_detach_from_pool(struct worker *worker, detach_completion = pool->detach_completion; mutex_unlock(&pool->attach_mutex); + /* clear leftover flags without pool->lock after it is detached */ + worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); + if (detach_completion) complete(detach_completion); } @@ -1678,8 +1669,7 @@ static void worker_detach_from_pool(struct worker *worker, * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is attached to @pool. The new worker must be - * started by start_worker(). + * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1698,7 +1688,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (id < 0) goto fail; - worker = alloc_worker(); + worker = alloc_worker(pool->node); if (!worker) goto fail; @@ -1724,6 +1714,13 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); + /* start the newly created worker */ + spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); + spin_unlock_irq(&pool->lock); + return worker; fail: @@ -1734,44 +1731,6 @@ fail: } /** - * start_worker - start a newly created worker - * @worker: worker to start - * - * Make the pool aware of @worker and start it. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void start_worker(struct worker *worker) -{ - worker->pool->nr_workers++; - worker_enter_idle(worker); - wake_up_process(worker->task); -} - -/** - * create_and_start_worker - create and start a worker for a pool - * @pool: the target pool - * - * Grab the managership of @pool and create and start a new worker for it. - * - * Return: 0 on success. A negative error code otherwise. 
- */ -static int create_and_start_worker(struct worker_pool *pool) -{ - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - } - - return worker ? 0 : -ENOMEM; -} - -/** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * @@ -1854,8 +1813,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); + spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -1868,8 +1827,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock(&pool->lock); - spin_unlock_irq(&wq_mayday_lock); + spin_unlock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -1891,17 +1850,11 @@ static void pool_mayday_timeout(unsigned long __pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. 
*/ -static bool maybe_create_worker(struct worker_pool *pool) +static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { - if (!need_to_create_worker(pool)) - return false; restart: spin_unlock_irq(&pool->lock); @@ -1909,23 +1862,10 @@ restart: mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { - struct worker *worker; - - worker = create_worker(pool); - if (worker) { - del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&pool->lock); - start_worker(worker); - if (WARN_ON_ONCE(need_to_create_worker(pool))) - goto restart; - return true; - } - - if (!need_to_create_worker(pool)) + if (create_worker(pool) || !need_to_create_worker(pool)) break; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CREATE_COOLDOWN); + schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; @@ -1933,9 +1873,13 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); + /* + * This is necessary even after a new worker was just successfully + * created as @pool->lock was dropped and the new worker might have + * already become busy. + */ if (need_to_create_worker(pool)) goto restart; - return true; } /** @@ -1955,16 +1899,14 @@ restart: * multiple times. Does GFP_KERNEL allocations. * * Return: - * %false if the pool don't need management and the caller can safely start - * processing works, %true indicates that the function released pool->lock - * and reacquired it to perform some management function and that the - * conditions that the caller verified while holding the lock before - * calling the function might no longer be true. + * %false if the pool doesn't need management and the caller can safely + * start processing works, %true if management function was performed and + * the conditions that the caller verified before calling the function may + * no longer be true. 
*/ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - bool ret = false; /* * Anyone who successfully grabs manager_arb wins the arbitration @@ -1977,12 +1919,14 @@ static bool manage_workers(struct worker *worker) * actual management, the pool may stall indefinitely. */ if (!mutex_trylock(&pool->manager_arb)) - return ret; + return false; + pool->manager = worker; - ret |= maybe_create_worker(pool); + maybe_create_worker(pool); + pool->manager = NULL; mutex_unlock(&pool->manager_arb); - return ret; + return true; } /** @@ -2020,13 +1964,8 @@ __acquires(&pool->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif - /* - * Ensure we're on the correct CPU. DISASSOCIATED test is - * necessary to avoid spurious warnings from rescuers servicing the - * unbound or a disassociated pool. - */ - WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && - !(pool->flags & POOL_DISASSOCIATED) && + /* ensure we're on the correct CPU */ + WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* @@ -2052,17 +1991,22 @@ __acquires(&pool->lock) list_del_init(&work->entry); /* - * CPU intensive works don't participate in concurrency - * management. They're the scheduler's responsibility. + * CPU intensive works don't participate in concurrency management. + * They're the scheduler's responsibility. This takes @worker out + * of concurrency management and the next code block will chain + * execution of the pending work items. */ if (unlikely(cpu_intensive)) - worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Unbound pool isn't concurrency managed and work items should be - * executed ASAP. Wake up another worker if necessary. + * Wake up another worker if necessary. The condition is always + * false for normal per-cpu workers since nr_running would always + * be >= 1 at this point. 
This is used to chain execution of the + * pending work items for WORKER_NOT_RUNNING workers such as the + * UNBOUND and CPU_INTENSIVE ones. */ - if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); /* @@ -2101,9 +2045,10 @@ __acquires(&pool->lock) * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in - * stop_machine. + * stop_machine. At the same time, report a quiescent RCU state so + * the same condition doesn't freeze RCU. */ - cond_resched(); + cond_resched_rcu_qs(); spin_lock_irq(&pool->lock); @@ -2218,7 +2163,7 @@ recheck: } } while (keep_working(pool)); - worker_set_flags(worker, WORKER_PREP, false); + worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to @@ -2305,35 +2250,51 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); - process_scheduled_works(rescuer); - spin_unlock_irq(&pool->lock); - - worker_detach_from_pool(rescuer, pool); + if (!list_empty(scheduled)) { + process_scheduled_works(rescuer); - spin_lock_irq(&pool->lock); + /* + * The above execution of rescued work items could + * have created more to rescue through + * pwq_activate_first_delayed() or chained + * queueing. Let's put @pwq back on mayday list so + * that such back-to-back work items, which may be + * being used to relieve memory pressure, don't + * incur MAYDAY_INTERVAL delay inbetween. 
+ */ + if (need_to_create_worker(pool)) { + spin_lock(&wq_mayday_lock); + get_pwq(pwq); + list_move_tail(&pwq->mayday_node, &wq->maydays); + spin_unlock(&wq_mayday_lock); + } + } /* * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're holding its lock. + * go away while we're still attached to it. */ put_pwq(pwq); /* - * Leave this pool. If keep_working() is %true, notify a + * Leave this pool. If need_more_worker() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(pool)) + if (need_more_worker(pool)) wake_up_worker(pool); rescuer->pool = NULL; - spin_unlock(&pool->lock); - spin_lock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&wq_mayday_lock); } spin_unlock_irq(&wq_mayday_lock); @@ -2353,6 +2314,7 @@ repeat: struct wq_barrier { struct work_struct work; struct completion done; + struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) @@ -2401,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + barr->task = current; /* * If @target is currently being executed, schedule the @@ -2778,19 +2741,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, 
is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2799,6 +2800,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } @@ -2991,324 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -#ifdef CONFIG_SYSFS -/* - * Workqueues with WQ_SYSFS flag set is visible to userland via - * /sys/bus/workqueue/devices/WQ_NAME. 
All visible workqueues have the - * following attributes. - * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items - * - * Unbound workqueues have the following extra attributes. - * - * id RO int : the associated pool ID - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers - */ -struct wq_device { - struct workqueue_struct *wq; - struct device dev; -}; - -static struct workqueue_struct *dev_to_wq(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - return wq_dev->wq; -} - -static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); -} -static DEVICE_ATTR_RO(per_cpu); - -static ssize_t max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); -} - -static ssize_t max_active_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int val; - - if (sscanf(buf, "%d", &val) != 1 || val <= 0) - return -EINVAL; - - workqueue_set_max_active(wq, val); - return count; -} -static DEVICE_ATTR_RW(max_active); - -static struct attribute *wq_sysfs_attrs[] = { - &dev_attr_per_cpu.attr, - &dev_attr_max_active.attr, - NULL, -}; -ATTRIBUTE_GROUPS(wq_sysfs); - -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - rcu_read_lock_sched(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, 
node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); - - return written; -} - -static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); - mutex_unlock(&wq->mutex); - - return written; -} - -/* prepare workqueue_attrs for sysfs store operations */ -static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) -{ - struct workqueue_attrs *attrs; - - attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!attrs) - return NULL; - - mutex_lock(&wq->mutex); - copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); - return attrs; -} - -static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; - - if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); - else - ret = -EINVAL; - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static ssize_t wq_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); - mutex_unlock(&wq->mutex); - - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - return written; -} - -static ssize_t wq_cpumask_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if 
(!attrs) - return -ENOMEM; - - ret = cpumask_parse(buf, attrs->cpumask); - if (!ret) - ret = apply_workqueue_attrs(wq, attrs); - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); - } - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), - __ATTR(nice, 0644, wq_nice_show, wq_nice_store), - __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), - __ATTR_NULL, -}; - -static struct bus_type wq_subsys = { - .name = "workqueue", - .dev_groups = wq_sysfs_groups, -}; - -static int __init wq_sysfs_init(void) -{ - return subsys_virtual_register(&wq_subsys, NULL); -} -core_initcall(wq_sysfs_init); - -static void wq_device_release(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - kfree(wq_dev); -} - -/** - * workqueue_sysfs_register - make a workqueue visible in sysfs - * @wq: the workqueue to register - * - * Expose @wq in sysfs under /sys/bus/workqueue/devices. - * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set - * which is the preferred method. 
- * - * Workqueue user should use this function directly iff it wants to apply - * workqueue_attrs before making the workqueue visible in sysfs; otherwise, - * apply_workqueue_attrs() may race against userland updating the - * attributes. - * - * Return: 0 on success, -errno on failure. - */ -int workqueue_sysfs_register(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev; - int ret; - - /* - * Adjusting max_active or creating new pwqs by applyting - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; - - wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); - if (!wq_dev) - return -ENOMEM; - - wq_dev->wq = wq; - wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; - wq_dev->dev.release = wq_device_release; - - /* - * unbound_attrs are created separately. Suppress uevent until - * everything is ready. - */ - dev_set_uevent_suppress(&wq_dev->dev, true); - - ret = device_register(&wq_dev->dev); - if (ret) { - kfree(wq_dev); - wq->wq_dev = NULL; - return ret; - } - - if (wq->flags & WQ_UNBOUND) { - struct device_attribute *attr; - - for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { - ret = device_create_file(&wq_dev->dev, attr); - if (ret) { - device_unregister(&wq_dev->dev); - wq->wq_dev = NULL; - return ret; - } - } - } - - dev_set_uevent_suppress(&wq_dev->dev, false); - kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); - return 0; -} - -/** - * workqueue_sysfs_unregister - undo workqueue_sysfs_register() - * @wq: the workqueue to unregister - * - * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
- */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev = wq->wq_dev; - - if (!wq->wq_dev) - return; - - wq->wq_dev = NULL; - device_unregister(&wq_dev->dev); -} -#else /* CONFIG_SYSFS */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } -#endif /* CONFIG_SYSFS */ - /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3427,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) return 0; } +static void rcu_free_wq(struct rcu_head *rcu) +{ + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); + + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); + else + free_workqueue_attrs(wq->unbound_attrs); + + kfree(wq->rescuer); + kfree(wq); +} + static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); @@ -3458,7 +3165,7 @@ static void put_unbound_pool(struct worker_pool *pool) return; /* sanity checks */ - if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + if (WARN_ON(!(pool->cpu < 0)) || WARN_ON(!list_empty(&pool->worklist))) return; @@ -3524,7 +3231,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; - goto out_unlock; + return pool; } } @@ -3557,12 +3264,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); -out_unlock: + return pool; fail: if (pool) @@ -3591,11 +3298,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work) if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; - /* - * Unlink @pwq. 
Synchronization against wq->mutex isn't strictly - * necessary on release but do it anyway. It's easier to verify - * and consistent with the linking path. - */ mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); @@ -3609,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) /* * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Free it. + * is gonna access it anymore. Schedule RCU free. */ - if (is_last) { - free_workqueue_attrs(wq->unbound_attrs); - kfree(wq); - } + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** @@ -3692,10 +3392,7 @@ static void link_pwq(struct pool_workqueue *pwq) if (!list_empty(&pwq->pwqs_node)) return; - /* - * Set the matching work_color. This is synchronized with - * wq->mutex to avoid confusing flush_workqueue(). - */ + /* set the matching work_color */ pwq->work_color = wq->work_color; /* sync max_active to the current setting */ @@ -3832,7 +3529,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; - pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); + pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pwq_tbl || !new_attrs || !tmp_attrs) @@ -4080,7 +3777,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* allocate wq and format name */ if (flags & WQ_UNBOUND) - tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); + tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) @@ -4122,7 +3819,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - rescuer = alloc_worker(); + rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) goto err_destroy; @@ -4154,7 
+3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); - list_add(&wq->list, &workqueues); + list_add_tail_rcu(&wq->list, &workqueues); mutex_unlock(&wq_pool_mutex); @@ -4210,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) * flushing is complete in case freeze races us. */ mutex_lock(&wq_pool_mutex); - list_del_init(&wq->list); + list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); - if (wq->rescuer) { + if (wq->rescuer) kthread_stop(wq->rescuer->task); - kfree(wq->rescuer); - wq->rescuer = NULL; - } if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly - * free the pwqs and wq. + * schedule RCU free. */ - free_percpu(wq->cpu_pwqs); - kfree(wq); + call_rcu_sched(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4448,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) } } +static void pr_cont_pool_info(struct worker_pool *pool) +{ + pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); + if (pool->node != NUMA_NO_NODE) + pr_cont(" node=%d", pool->node); + pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); +} + +static void pr_cont_work(bool comma, struct work_struct *work) +{ + if (work->func == wq_barrier_func) { + struct wq_barrier *barr; + + barr = container_of(work, struct wq_barrier, work); + + pr_cont("%s BAR(%d)", comma ? "," : "", + task_pid_nr(barr->task)); + } else { + pr_cont("%s %pf", comma ? "," : "", work->func); + } +} + +static void show_pwq(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work; + struct worker *worker; + bool has_in_flight = false, has_pending = false; + int bkt; + + pr_info(" pwq %d:", pool->id); + pr_cont_pool_info(pool); + + pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + !list_empty(&pwq->mayday_node) ? 
" MAYDAY" : ""); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq == pwq) { + has_in_flight = true; + break; + } + } + if (has_in_flight) { + bool comma = false; + + pr_info(" in-flight:"); + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq != pwq) + continue; + + pr_cont("%s %d%s:%pf", comma ? "," : "", + task_pid_nr(worker->task), + worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->current_func); + list_for_each_entry(work, &worker->scheduled, entry) + pr_cont_work(false, work); + comma = true; + } + pr_cont("\n"); + } + + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + has_pending = true; + break; + } + } + if (has_pending) { + bool comma = false; + + pr_info(" pending:"); + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) != pwq) + continue; + + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } + + if (!list_empty(&pwq->delayed_works)) { + bool comma = false; + + pr_info(" delayed:"); + list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } +} + +/** + * show_workqueue_state - dump workqueue state + * + * Called from a sysrq handler and prints out all busy workqueues and + * pools. 
+ */ +void show_workqueue_state(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + unsigned long flags; + int pi; + + rcu_read_lock_sched(); + + pr_info("Showing busy workqueues and worker pools:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + struct pool_workqueue *pwq; + bool idle = true; + + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + idle = false; + break; + } + } + if (idle) + continue; + + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + + for_each_pwq(pwq, wq) { + spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + show_pwq(pwq); + spin_unlock_irqrestore(&pwq->pool->lock, flags); + } + } + + for_each_pool(pool, pi) { + struct worker *worker; + bool first = true; + + spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" workers=%d", pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + next_pool: + spin_unlock_irqrestore(&pool->lock, flags); + } + + rcu_read_unlock_sched(); +} + /* * CPU hotplug. 
* @@ -4470,8 +4323,6 @@ static void wq_unbind_fn(struct work_struct *work) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); @@ -4543,6 +4394,7 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; @@ -4632,7 +4484,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; - if (create_and_start_worker(pool) < 0) + if (!create_worker(pool)) return NOTIFY_BAD; } break; @@ -4644,15 +4496,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, for_each_pool(pool, pi) { mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) { - spin_lock_irq(&pool->lock); - pool->flags &= ~POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); - + if (pool->cpu == cpu) rebind_workers(pool); - } else if (pool->cpu < 0) { + else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - } mutex_unlock(&pool->attach_mutex); } @@ -4851,15 +4698,328 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. 
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} +static DEVICE_ATTR_RO(per_cpu); + +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} +static DEVICE_ATTR_RW(max_active); + +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, +}; +ATTRIBUTE_GROUPS(wq_sysfs); + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct 
workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq->unbound_attrs->cpumask)); + mutex_unlock(&wq->mutex); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq 
= dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_groups = wq_sysfs_groups, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Return: 0 on success, -errno on failure. 
+ */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + dev_set_uevent_suppress(&wq_dev->dev, false); + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
+ */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; int node, cpu; - /* determine NUMA pwq table len - highest node id + 1 */ - for_each_node(node) - wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); - if (num_possible_nodes() <= 1) return; @@ -4876,7 +5036,7 @@ static void __init wq_numa_init(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); BUG_ON(!tbl); for_each_node(node) @@ -4936,7 +5096,7 @@ static int __init init_workqueues(void) for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(create_and_start_worker(pool) < 0); + BUG_ON(!create_worker(pool)); } } |