aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c36
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/auditfilter.c9
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/compat.c33
-rw-r--r--kernel/cpu.c24
-rw-r--r--kernel/cpuset.c115
-rw-r--r--kernel/dma.c10
-rw-r--r--kernel/exit.c52
-rw-r--r--kernel/fork.c101
-rw-r--r--kernel/futex.c27
-rw-r--r--kernel/futex_compat.c12
-rw-r--r--kernel/hrtimer.c20
-rw-r--r--kernel/irq/chip.c146
-rw-r--r--kernel/irq/handle.c19
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/migration.c34
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/irq/spurious.c10
-rw-r--r--kernel/kallsyms.c124
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kfifo.c28
-rw-r--r--kernel/kmod.c74
-rw-r--r--kernel/kprobes.c53
-rw-r--r--kernel/latency.c279
-rw-r--r--kernel/lockdep.c40
-rw-r--r--kernel/module.c92
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/nsproxy.c137
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c15
-rw-r--r--kernel/pid.c123
-rw-r--r--kernel/posix-cpu-timers.c128
-rw-r--r--kernel/posix-timers.c23
-rw-r--r--kernel/power/disk.c8
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/power/swap.c3
-rw-r--r--kernel/power/user.c10
-rw-r--r--kernel/printk.c11
-rw-r--r--kernel/profile.c7
-rw-r--r--kernel/ptrace.c55
-rw-r--r--kernel/rcupdate.c11
-rw-r--r--kernel/rcutorture.c327
-rw-r--r--kernel/relay.c79
-rw-r--r--kernel/resource.c115
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c1
-rw-r--r--kernel/rtmutex.c51
-rw-r--r--kernel/sched.c424
-rw-r--r--kernel/signal.c76
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c3
-rw-r--r--kernel/spinlock.c15
-rw-r--r--kernel/srcu.c258
-rw-r--r--kernel/stop_machine.c3
-rw-r--r--kernel/sys.c236
-rw-r--r--kernel/sys_ni.c6
-rw-r--r--kernel/sysctl.c482
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/time.c173
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c350
-rw-r--r--kernel/timer.c283
-rw-r--r--kernel/tsacct.c124
-rw-r--r--kernel/unwind.c322
-rw-r--r--kernel/utsname.c95
-rw-r--r--kernel/workqueue.c9
72 files changed, 3875 insertions, 1520 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66c1af2..5e3f3b75563a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
@@ -48,8 +48,9 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
-obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c7..0aad5ca36a81 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
ac.ac_ppid = current->parent->tgid;
#endif
- read_lock(&tasklist_lock); /* pin current->signal */
+ mutex_lock(&tty_mutex);
+ /* FIXME: Whoever is responsible for current->signal locking needs
+ to use the same locking all over the kernel and document it */
+ read_lock(&tasklist_lock);
ac.ac_tty = current->signal->tty ?
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
+ mutex_unlock(&tty_mutex);
spin_lock_irq(&current->sighand->siglock);
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
@@ -598,33 +602,3 @@ void acct_process(void)
do_acct_process(file);
fput(file);
}
-
-
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
-{
- if (likely(tsk->mm)) {
- long delta =
- cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
-
- if (delta == 0)
- return;
- tsk->acct_stimexpd = tsk->stime;
- tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
- }
-}
-
-/**
- * acct_clear_integrals - clear the mm integral fields in task_struct
- * @tsk: task_struct whose accounting fields are cleared
- */
-void acct_clear_integrals(struct task_struct *tsk)
-{
- tsk->acct_stimexpd = 0;
- tsk->acct_rss_mem1 = 0;
- tsk->acct_vm_mem1 = 0;
-}
diff --git a/kernel/audit.c b/kernel/audit.c
index f9889ee77825..98106f6078b0 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -340,7 +340,7 @@ static int kauditd_thread(void *dummy)
{
struct sk_buff *skb;
- while (1) {
+ while (!kthread_should_stop()) {
skb = skb_dequeue(&audit_skb_queue);
wake_up(&audit_backlog_wait);
if (skb) {
@@ -369,6 +369,7 @@ static int kauditd_thread(void *dummy)
remove_wait_queue(&kauditd_wait, &wait);
}
}
+ return 0;
}
int audit_send_list(void *_dest)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 1a58a81fb09d..4f40d923af8e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -411,7 +411,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_FSGID:
case AUDIT_LOGINUID:
case AUDIT_PERS:
- case AUDIT_ARCH:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
@@ -423,6 +422,14 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_ARG2:
case AUDIT_ARG3:
break;
+ /* arch is only allowed to be = or != */
+ case AUDIT_ARCH:
+ if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL)
+ && (f->op != AUDIT_NEGATE) && (f->op)) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+ break;
case AUDIT_PERM:
if (f->val & ~15)
goto exit_free;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fb83c5cb8c32..42f2f1179711 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -278,8 +278,11 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(tsk->pid, f->op, f->val);
break;
case AUDIT_PPID:
- if (ctx)
+ if (ctx) {
+ if (!ctx->ppid)
+ ctx->ppid = sys_getppid();
result = audit_comparator(ctx->ppid, f->op, f->val);
+ }
break;
case AUDIT_UID:
result = audit_comparator(tsk->uid, f->op, f->val);
@@ -795,7 +798,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
/* tsk == current */
context->pid = tsk->pid;
- context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
+ if (!context->ppid)
+ context->ppid = sys_getppid();
context->uid = tsk->uid;
context->gid = tsk->gid;
context->euid = tsk->euid;
@@ -817,6 +821,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
+
+ mutex_lock(&tty_mutex);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
@@ -838,6 +844,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->gid,
context->euid, context->suid, context->fsuid,
context->egid, context->sgid, context->fsgid, tty);
+
+ mutex_unlock(&tty_mutex);
+
audit_log_task_info(ab, tsk);
if (context->filterkey) {
audit_log_format(ab, " key=");
@@ -1132,6 +1141,7 @@ void audit_syscall_entry(int arch, int major,
context->ctime = CURRENT_TIME;
context->in_syscall = 1;
context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
+ context->ppid = 0;
}
/**
@@ -1347,7 +1357,13 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
}
update_context:
- idx = context->name_count++;
+ idx = context->name_count;
+ if (context->name_count == AUDIT_NAMES) {
+ printk(KERN_DEBUG "name_count maxed and losing %s\n",
+ found_name ?: "(null)");
+ return;
+ }
+ context->name_count++;
#if AUDIT_DEBUG
context->ino_count++;
#endif
@@ -1365,7 +1381,16 @@ update_context:
/* A parent was not found in audit_names, so copy the inode data for the
* provided parent. */
if (!found_name) {
- idx = context->name_count++;
+ idx = context->name_count;
+ if (context->name_count == AUDIT_NAMES) {
+ printk(KERN_DEBUG
+ "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu",
+ MAJOR(parent->i_sb->s_dev),
+ MINOR(parent->i_sb->s_dev),
+ parent->i_ino);
+ return;
+ }
+ context->name_count++;
#if AUDIT_DEBUG
context->ino_count++;
#endif
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
int found = 0;
do_each_thread(g, target) {
- if (target == current || target->pid == 1)
+ if (target == current || is_init(target))
continue;
found = 1;
if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530aa..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/timex.h>
#include <linux/migrate.h>
+#include <linux/posix-timers.h>
#include <asm/uaccess.h>
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
return err;
}
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec tu;
+ struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+ restart->arg1 = (unsigned long) &tu;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = clock_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&tu, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
+ return err;
+}
+
long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
long err;
mm_segment_t oldfs;
struct timespec in, out;
+ struct restart_block *restart;
if (get_compat_timespec(&in, rqtp))
return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
(struct timespec __user *) &in,
(struct timespec __user *) &out);
set_fs(oldfs);
+
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
put_compat_timespec(&out, rmtp))
return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
return err;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 32c96628463e..27dd3ee47099 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,7 +19,7 @@
static DEFINE_MUTEX(cpu_add_remove_lock);
static DEFINE_MUTEX(cpu_bitmask_lock);
-static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
@@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
/* Need to know about CPUs going up/down? */
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_register(&cpu_chain, nb);
+ int ret;
+ mutex_lock(&cpu_add_remove_lock);
+ ret = raw_notifier_chain_register(&cpu_chain, nb);
+ mutex_unlock(&cpu_add_remove_lock);
+ return ret;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
- blocking_notifier_chain_unregister(&cpu_chain, nb);
+ mutex_lock(&cpu_add_remove_lock);
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+ mutex_unlock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(unregister_cpu_notifier);
@@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu)
if (!cpu_online(cpu))
return -EINVAL;
- err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
@@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu)
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
@@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu)
put_cpu();
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
@@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu)
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
- ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu)
BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
out_notify:
if (ret != 0)
- blocking_notifier_call_chain(&cpu_chain,
+ raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED, hcpu);
return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cff41511269f..6313c38c930e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
* A cpuset can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cpusets is empty. Since all
* tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
* always has either children cpusets and/or using tasks. So we don't
* need a special hack to ensure that top_cpuset cannot be deleted.
*
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
- inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -378,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directories start off with i_nlink == 2 (for "." entry) */
- inode->i_nlink++;
+ inc_nlink(inode);
} else {
return -ENOMEM;
}
@@ -913,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
int fudge;
int retval;
+ /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
@@ -1222,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
task_lock(tsk);
oldcs = tsk->cpuset;
- if (!oldcs) {
+ /*
+ * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+ * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+ * then fail this attach_task(), to avoid breaking top_cpuset.count.
+ */
+ if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
mutex_unlock(&callback_mutex);
put_task_struct(tsk);
@@ -1557,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
inode->i_fop = &simple_dir_operations;
/* start off with i_nlink == 2 (for "." entry) */
- inode->i_nlink++;
+ inc_nlink(inode);
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cpuset_file_operations;
@@ -1590,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
error = cpuset_create_file(dentry, S_IFDIR | mode);
if (!error) {
dentry->d_fsdata = cs;
- parent->d_inode->i_nlink++;
+ inc_nlink(parent->d_inode);
cs->dentry = dentry;
}
dput(dentry);
@@ -2025,7 +2033,7 @@ int __init cpuset_init(void)
}
root = cpuset_mount->mnt_sb->s_root;
root->d_fsdata = &top_cpuset;
- root->d_inode->i_nlink++;
+ inc_nlink(root->d_inode);
top_cpuset.dentry = root;
root->d_inode->i_op = &cpuset_dir_inode_operations;
number_of_cpusets = 1;
@@ -2037,33 +2045,104 @@ out:
return err;
}
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This handles CPU hotplug (cpuhp) events. If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
*/
-#ifdef CONFIG_HOTPLUG_CPU
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
- unsigned long phase, void *cpu)
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+ struct cpuset *c;
+
+ /* Each of our child cpusets mems must be online */
+ list_for_each_entry(c, &cur->children, sibling) {
+ guarantee_online_cpus_mems_in_subtree(c);
+ if (!cpus_empty(c->cpus_allowed))
+ guarantee_online_cpus(c, &c->cpus_allowed);
+ if (!nodes_empty(c->mems_allowed))
+ guarantee_online_mems(c, &c->mems_allowed);
+ }
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
{
mutex_lock(&manage_mutex);
mutex_lock(&callback_mutex);
+ guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_online_map;
mutex_unlock(&callback_mutex);
mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+ unsigned long phase, void *cpu)
+{
+ common_cpu_mem_hotplug_unplug();
return 0;
}
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes(void)
+{
+ common_cpu_mem_hotplug_unplug();
+}
+#endif
+
/**
* cpuset_init_smp - initialize cpus_allowed
*
diff --git a/kernel/dma.c b/kernel/dma.c
index aef0a45b7893..2020644c938a 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
};
+/**
+ * request_dma - request and reserve a system DMA channel
+ * @dmanr: DMA channel number
+ * @device_id: reserving device ID string, used in /proc/dma
+ */
int request_dma(unsigned int dmanr, const char * device_id)
{
if (dmanr >= MAX_DMA_CHANNELS)
@@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id)
return 0;
} /* request_dma */
-
+/**
+ * free_dma - free a reserved system DMA channel
+ * @dmanr: DMA channel number
+ */
void free_dma(unsigned int dmanr)
{
if (dmanr >= MAX_DMA_CHANNELS) {
diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f7..f250a5e3e281 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -18,8 +18,10 @@
#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/binfmts.h>
+#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
@@ -38,6 +40,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
+#include <linux/blkdev.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -219,7 +222,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p == ignored_task
|| p->exit_state
- || p->real_parent->pid == 1)
+ || is_init(p->real_parent))
continue;
if (process_group(p->real_parent) != pgrp
&& p->real_parent->signal->session == p->signal->session) {
@@ -249,17 +252,6 @@ static int has_stopped_jobs(int pgrp)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p->state != TASK_STOPPED)
continue;
-
- /* If p is stopped by a debugger on a signal that won't
- stop it, then don't count p as stopped. This isn't
- perfect but it's a good approximation. */
- if (unlikely (p->ptrace)
- && p->exit_code != SIGSTOP
- && p->exit_code != SIGTSTP
- && p->exit_code != SIGTTOU
- && p->exit_code != SIGTTIN)
- continue;
-
retval = 1;
break;
} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -292,9 +284,7 @@ static void reparent_to_init(void)
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
- if ((current->policy == SCHED_NORMAL ||
- current->policy == SCHED_BATCH)
- && (task_nice(current) < 0))
+ if (!has_rt_policy(current) && (task_nice(current) < 0))
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
@@ -408,9 +398,11 @@ void daemonize(const char *name, ...)
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
- exit_namespace(current);
- current->namespace = init_task.namespace;
- get_namespace(current->namespace);
+
+ exit_task_namespaces(current);
+ current->nsproxy = init_task.nsproxy;
+ get_task_namespaces(current);
+
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -487,6 +479,18 @@ void fastcall put_files_struct(struct files_struct *files)
EXPORT_SYMBOL(put_files_struct);
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+ struct files_struct *old;
+
+ old = tsk->files;
+ task_lock(tsk);
+ tsk->files = files;
+ task_unlock(tsk);
+ put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
static inline void __exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -916,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code)
exit_sem(tsk);
__exit_files(tsk);
__exit_fs(tsk);
- exit_namespace(tsk);
exit_thread();
cpuset_exit(tsk);
exit_keys(tsk);
@@ -931,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code)
tsk->exit_code = code;
proc_exit_connector(tsk);
exit_notify(tsk);
+ exit_task_namespaces(tsk);
#ifdef CONFIG_NUMA
mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL;
@@ -954,15 +958,15 @@ fastcall NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
- /* PF_DEAD causes final put_task_struct after we schedule. */
preempt_disable();
- BUG_ON(tsk->flags & PF_DEAD);
- tsk->flags |= PF_DEAD;
+ /* causes final put_task_struct in finish_task_switch(). */
+ tsk->state = TASK_DEAD;
schedule();
BUG();
/* Avoid "noreturn function does return". */
- for (;;) ;
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -971,7 +975,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
-
+
do_exit(code);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index a0dad84567c9..29ebb30850ed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/fs.h>
+#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -42,6 +43,7 @@
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
@@ -183,7 +185,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
+#endif
tsk->splice_pipe = NULL;
return tsk;
}
@@ -980,6 +984,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!p)
goto fork_out;
+ rt_mutex_init_task(p);
+
#ifdef CONFIG_TRACE_IRQFLAGS
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
@@ -1061,7 +1067,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ p->hardirqs_enabled = 1;
+#else
p->hardirqs_enabled = 0;
+#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
@@ -1080,8 +1090,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->lockdep_recursion = 0;
#endif
- rt_mutex_init_task(p);
-
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
@@ -1109,11 +1117,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_signal;
if ((retval = copy_keys(clone_flags, p)))
goto bad_fork_cleanup_mm;
- if ((retval = copy_namespace(clone_flags, p)))
+ if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
- goto bad_fork_cleanup_namespace;
+ goto bad_fork_cleanup_namespaces;
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
@@ -1144,7 +1152,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
-
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
@@ -1167,6 +1174,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+ p->ioprio = current->ioprio;
+
/*
* The task hasn't been attached yet, so its cpus_allowed mask will
* not be changed, nor will its assigned CPU.
@@ -1203,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_cleanup_namespace;
+ goto bad_fork_cleanup_namespaces;
}
if (clone_flags & CLONE_THREAD) {
@@ -1226,11 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
}
- /*
- * inherit ioprio
- */
- p->ioprio = current->ioprio;
-
if (likely(p->pid)) {
add_parent(p);
if (unlikely(p->ptrace & PT_PTRACED))
@@ -1256,8 +1261,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
proc_fork_connector(p);
return p;
-bad_fork_cleanup_namespace:
- exit_namespace(p);
+bad_fork_cleanup_namespaces:
+ exit_task_namespaces(p);
bad_fork_cleanup_keys:
exit_keys(p);
bad_fork_cleanup_mm:
@@ -1510,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
*/
static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
{
- struct namespace *ns = current->namespace;
+ struct namespace *ns = current->nsproxy->namespace;
- if ((unshare_flags & CLONE_NEWNS) &&
- (ns && atomic_read(&ns->count) > 1)) {
+ if ((unshare_flags & CLONE_NEWNS) && ns) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1585,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
return 0;
}
+#ifndef CONFIG_IPC_NS
+static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
+{
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+
+ return 0;
+}
+#endif
+
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
@@ -1602,13 +1616,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
+ struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
+ struct uts_namespace *uts, *new_uts = NULL;
+ struct ipc_namespace *ipc, *new_ipc = NULL;
check_unshare_flags(&unshare_flags);
/* Return -EINVAL for all unsupported flags */
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC))
goto bad_unshare_out;
if ((err = unshare_thread(unshare_flags)))
@@ -1625,11 +1643,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
+ if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ goto bad_unshare_cleanup_semundo;
+ if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
+ goto bad_unshare_cleanup_uts;
+
+ if (new_ns || new_uts || new_ipc) {
+ old_nsproxy = current->nsproxy;
+ new_nsproxy = dup_namespaces(old_nsproxy);
+ if (!new_nsproxy) {
+ err = -ENOMEM;
+ goto bad_unshare_cleanup_ipc;
+ }
+ }
- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+ if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ new_uts || new_ipc) {
task_lock(current);
+ if (new_nsproxy) {
+ current->nsproxy = new_nsproxy;
+ new_nsproxy = old_nsproxy;
+ }
+
if (new_fs) {
fs = current->fs;
current->fs = new_fs;
@@ -1637,8 +1674,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
}
if (new_ns) {
- ns = current->namespace;
- current->namespace = new_ns;
+ ns = current->nsproxy->namespace;
+ current->nsproxy->namespace = new_ns;
new_ns = ns;
}
@@ -1663,9 +1700,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fd = fd;
}
+ if (new_uts) {
+ uts = current->nsproxy->uts_ns;
+ current->nsproxy->uts_ns = new_uts;
+ new_uts = uts;
+ }
+
+ if (new_ipc) {
+ ipc = current->nsproxy->ipc_ns;
+ current->nsproxy->ipc_ns = new_ipc;
+ new_ipc = ipc;
+ }
+
task_unlock(current);
}
+ if (new_nsproxy)
+ put_nsproxy(new_nsproxy);
+
+bad_unshare_cleanup_ipc:
+ if (new_ipc)
+ put_ipc_ns(new_ipc);
+
+bad_unshare_cleanup_uts:
+ if (new_uts)
+ put_uts_ns(new_uts);
+
+bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cff..b364e0026191 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
}
get_task_struct(p);
out_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return p;
}
@@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
if (signal) {
- err = f_setown(filp, current->pid, 1);
+ err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
if (err < 0) {
goto error;
}
@@ -1612,10 +1612,10 @@ sys_set_robust_list(struct robust_list_head __user *head,
* @len_ptr: pointer to a length field, the kernel fills in the header size
*/
asmlinkage long
-sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
size_t __user *len_ptr)
{
- struct robust_list_head *head;
+ struct robust_list_head __user *head;
unsigned long ret;
if (!pid)
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
struct task_struct *p;
ret = -ESRCH;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
!capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->robust_list;
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}
if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
return put_user(head, head_ptr);
err_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1694,14 +1694,15 @@ retry:
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
- struct robust_list __user **head, int *pi)
+ struct robust_list __user * __user *head,
+ int *pi)
{
unsigned long uentry;
- if (get_user(uentry, (unsigned long *)head))
+ if (get_user(uentry, (unsigned long __user *)head))
return -EFAULT;
- *entry = (void *)(uentry & ~1UL);
+ *entry = (void __user *)(uentry & ~1UL);
*pi = uentry & 1;
return 0;
@@ -1739,7 +1740,7 @@ void exit_robust_list(struct task_struct *curr)
return;
if (pending)
- handle_futex_death((void *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset, curr, pip);
while (entry != &head->list) {
/*
@@ -1747,7 +1748,7 @@ void exit_robust_list(struct task_struct *curr)
* don't process it twice:
*/
if (entry != pending)
- if (handle_futex_death((void *)entry + futex_offset,
+ if (handle_futex_death((void __user *)entry + futex_offset,
curr, pi))
return;
/*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c5cca3f65cb7..50f24eea6cd0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -18,7 +18,7 @@
*/
static inline int
fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t *head, int *pi)
+ compat_uptr_t __user *head, int *pi)
{
if (get_user(*uentry, head))
return -EFAULT;
@@ -62,7 +62,7 @@ void compat_exit_robust_list(struct task_struct *curr)
&head->list_op_pending, &pip))
return;
if (upending)
- handle_futex_death((void *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset, curr, pip);
while (compat_ptr(uentry) != &head->list) {
/*
@@ -70,7 +70,7 @@ void compat_exit_robust_list(struct task_struct *curr)
* dont process it twice:
*/
if (entry != pending)
- if (handle_futex_death((void *)entry + futex_offset,
+ if (handle_futex_death((void __user *)entry + futex_offset,
curr, pi))
return;
@@ -78,7 +78,7 @@ void compat_exit_robust_list(struct task_struct *curr)
* Fetch the next entry in the list:
*/
if (fetch_robust_entry(&uentry, &entry,
- (compat_uptr_t *)&entry->next, &pi))
+ (compat_uptr_t __user *)&entry->next, &pi))
return;
/*
* Avoid excessively long or circular lists:
@@ -103,10 +103,10 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
}
asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
compat_size_t __user *len_ptr)
{
- struct compat_robust_list_head *head;
+ struct compat_robust_list_head __user *head;
unsigned long ret;
if (!pid)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666b..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
return t->task == NULL;
}
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
struct hrtimer_sleeper t;
struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
- hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
- t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+ hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+ t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
if (do_nanosleep(&t, HRTIMER_ABS))
return 0;
- rmtp = (struct timespec __user *) restart->arg2;
+ rmtp = (struct timespec __user *) restart->arg1;
if (rmtp) {
time = ktime_sub(t.timer.expires, t.timer.base->get_time());
if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
return -EFAULT;
}
- restart->fn = nanosleep_restart;
+ restart->fn = hrtimer_nanosleep_restart;
/* The other values in restart are already filled in */
return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
}
restart = &current_thread_info()->restart_block;
- restart->fn = nanosleep_restart;
- restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg1 = t.timer.expires.tv64 >> 32;
- restart->arg2 = (unsigned long) rmtp;
- restart->arg3 = (unsigned long) t.timer.base->index;
+ restart->fn = hrtimer_nanosleep_restart;
+ restart->arg0 = (unsigned long) t.timer.base->index;
+ restart->arg1 = (unsigned long) rmtp;
+ restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+ restart->arg3 = t.timer.expires.tv64 >> 32;
return -ERESTART_RESTARTBLOCK;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d4937..2d0dc3efe813 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,6 +18,69 @@
#include "internals.h"
/**
+ * dynamic_irq_init - initialize a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_init(unsigned int irq)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
+ WARN_ON(1);
+ return;
+ }
+
+ /* Ensure we don't have left over values from a previous use of this irq */
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status = IRQ_DISABLED;
+ desc->chip = &no_irq_chip;
+ desc->handle_irq = handle_bad_irq;
+ desc->depth = 1;
+ desc->handler_data = NULL;
+ desc->chip_data = NULL;
+ desc->action = NULL;
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+#ifdef CONFIG_SMP
+ desc->affinity = CPU_MASK_ALL;
+#endif
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
+ WARN_ON(1);
+ return;
+ }
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock, flags);
+ if (desc->action) {
+ spin_unlock_irqrestore(&desc->lock, flags);
+ printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+ irq);
+ WARN_ON(1);
+ return;
+ }
+ desc->handle_irq = handle_bad_irq;
+ desc->chip = &no_irq_chip;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+
+/**
* set_irq_chip - set the irq chip for an irq
* @irq: irq number
* @chip: pointer to irq chip description structure
@@ -40,10 +103,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
spin_lock_irqsave(&desc->lock, flags);
irq_chip_set_defaults(chip);
desc->chip = chip;
- /*
- * For compatibility only:
- */
- desc->chip = chip;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -146,7 +205,7 @@ static void default_disable(unsigned int irq)
struct irq_desc *desc = irq_desc + irq;
if (!(desc->status & IRQ_DELAYED_DISABLE))
- irq_desc[irq].chip->mask(irq);
+ desc->chip->mask(irq);
}
/*
@@ -190,7 +249,6 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
- * @regs: pointer to a register structure
*
* Simple interrupts are either sent from a demultiplexing interrupt
* handler or come from hardware, where no interrupt hardware control
@@ -200,7 +258,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
* unmask issues if necessary.
*/
void fastcall
-handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
struct irqaction *action;
irqreturn_t action_ret;
@@ -220,9 +278,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
desc->status |= IRQ_INPROGRESS;
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, regs, action);
+ action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
@@ -234,7 +292,6 @@ out_unlock:
* handle_level_irq - Level type irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
- * @regs: pointer to a register structure
*
* Level type interrupts are active as long as the hardware line has
* the active level. This may require to mask the interrupt and unmask
@@ -242,7 +299,7 @@ out_unlock:
* interrupt line is back to inactive.
*/
void fastcall
-handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
unsigned int cpu = smp_processor_id();
struct irqaction *action;
@@ -270,9 +327,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, regs, action);
+ action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
@@ -286,7 +343,6 @@ out_unlock:
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
- * @regs: pointer to a register structure
*
* Only a single callback will be issued to the chip: an ->eoi()
* call when the interrupt has been serviced. This enables support
@@ -294,8 +350,7 @@ out_unlock:
* details in hardware, transparently.
*/
void fastcall
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
- struct pt_regs *regs)
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
unsigned int cpu = smp_processor_id();
struct irqaction *action;
@@ -323,9 +378,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, regs, action);
+ action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
@@ -339,7 +394,6 @@ out:
* handle_edge_irq - edge type IRQ handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
- * @regs: pointer to a register structure
*
* Interrupt occures on the falling and/or rising edge of a hardware
* signal. The occurence is latched into the irq controller hardware
@@ -353,7 +407,7 @@ out:
* loop is left.
*/
void fastcall
-handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
const unsigned int cpu = smp_processor_id();
@@ -404,9 +458,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, regs, action);
+ action_ret = handle_IRQ_event(irq, action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
spin_lock(&desc->lock);
} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
@@ -421,12 +475,11 @@ out_unlock:
* handle_percpu_IRQ - Per CPU local irq handler
* @irq: the interrupt number
* @desc: the interrupt description structure for this irq
- * @regs: pointer to a register structure
*
* Per CPU interrupts on SMP machines without locking requirements
*/
void fastcall
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
irqreturn_t action_ret;
@@ -435,9 +488,9 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
if (desc->chip->ack)
desc->chip->ack(irq);
- action_ret = handle_IRQ_event(irq, regs, desc->action);
+ action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
if (desc->chip->eoi)
desc->chip->eoi(irq);
@@ -446,10 +499,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
#endif /* CONFIG_SMP */
void
-__set_irq_handler(unsigned int irq,
- void fastcall (*handle)(unsigned int, irq_desc_t *,
- struct pt_regs *),
- int is_chained)
+__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
{
struct irq_desc *desc;
unsigned long flags;
@@ -490,6 +541,7 @@ __set_irq_handler(unsigned int irq,
desc->depth = 1;
}
desc->handle_irq = handle;
+ desc->name = name;
if (handle != handle_bad_irq && is_chained) {
desc->status &= ~IRQ_DISABLED;
@@ -502,36 +554,16 @@ __set_irq_handler(unsigned int irq,
void
set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
- void fastcall (*handle)(unsigned int,
- struct irq_desc *,
- struct pt_regs *))
+ irq_flow_handler_t handle)
{
set_irq_chip(irq, chip);
- __set_irq_handler(irq, handle, 0);
+ __set_irq_handler(irq, handle, 0, NULL);
}
-/*
- * Get a descriptive string for the highlevel handler, for
- * /proc/interrupts output:
- */
-const char *
-handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
- struct pt_regs *))
+void
+set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+ irq_flow_handler_t handle, const char *name)
{
- if (handle == handle_level_irq)
- return "level ";
- if (handle == handle_fasteoi_irq)
- return "fasteoi";
- if (handle == handle_edge_irq)
- return "edge ";
- if (handle == handle_simple_irq)
- return "simple ";
-#ifdef CONFIG_SMP
- if (handle == handle_percpu_irq)
- return "percpu ";
-#endif
- if (handle == handle_bad_irq)
- return "bad ";
-
- return NULL;
+ set_irq_chip(irq, chip);
+ __set_irq_handler(irq, handle, 0, name);
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 4c6cdbaed661..42aa6f1a3f0f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -27,7 +27,7 @@
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
void fastcall
-handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
kstat_this_cpu.irqs[irq]++;
@@ -115,7 +115,7 @@ struct irq_chip dummy_irq_chip = {
/*
* Special, empty irq handler:
*/
-irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+irqreturn_t no_action(int cpl, void *dev_id)
{
return IRQ_NONE;
}
@@ -123,13 +123,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
- * @regs: pointer to a register structure
* @action: the interrupt action chain for this irq
*
* Handles the action chain of an irq event
*/
-irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
- struct irqaction *action)
+irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
{
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
@@ -140,7 +138,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
local_irq_enable_in_hardirq();
do {
- ret = action->handler(irq, action->dev_id, regs);
+ ret = action->handler(irq, action->dev_id);
if (ret == IRQ_HANDLED)
status |= action->flags;
retval |= ret;
@@ -158,7 +156,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
/**
* __do_IRQ - original all in one highlevel IRQ handler
* @irq: the interrupt number
- * @regs: pointer to a register structure
*
* __do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
@@ -167,7 +164,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
* This is the original x86 implementation which is used for every
* interrupt type.
*/
-fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+fastcall unsigned int __do_IRQ(unsigned int irq)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
@@ -182,7 +179,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
*/
if (desc->chip->ack)
desc->chip->ack(irq);
- action_ret = handle_IRQ_event(irq, regs, desc->action);
+ action_ret = handle_IRQ_event(irq, desc->action);
desc->chip->end(irq);
return 1;
}
@@ -233,11 +230,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, regs, action);
+ action_ret = handle_IRQ_event(irq, action);
spin_lock(&desc->lock);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret, regs);
+ note_interrupt(irq, desc, action_ret);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 92be519eff26..6879202afe9a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -427,8 +427,7 @@ EXPORT_SYMBOL(free_irq);
* IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
*
*/
-int request_irq(unsigned int irq,
- irqreturn_t (*handler)(int, void *, struct pt_regs *),
+int request_irq(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
{
struct irqaction *action;
@@ -475,4 +474,3 @@ int request_irq(unsigned int irq,
return retval;
}
EXPORT_SYMBOL(request_irq);
-
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a57ebe9fa6f6..4baa3bbcd25a 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,17 +7,17 @@ void set_pending_irq(unsigned int irq, cpumask_t mask)
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
- desc->move_irq = 1;
+ desc->status |= IRQ_MOVE_PENDING;
irq_desc[irq].pending_mask = mask;
spin_unlock_irqrestore(&desc->lock, flags);
}
-void move_native_irq(int irq)
+void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_desc + irq;
cpumask_t tmp;
- if (likely(!desc->move_irq))
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
/*
@@ -28,7 +28,7 @@ void move_native_irq(int irq)
return;
}
- desc->move_irq = 0;
+ desc->status &= ~IRQ_MOVE_PENDING;
if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
return;
@@ -48,15 +48,29 @@ void move_native_irq(int irq)
* when an active trigger is comming in. This could
* cause some ioapics to mal-function.
* Being paranoid i guess!
+ *
+ * For correct operation this depends on the caller
+ * masking the irqs.
*/
if (likely(!cpus_empty(tmp))) {
- if (likely(!(desc->status & IRQ_DISABLED)))
- desc->chip->disable(irq);
-
desc->chip->set_affinity(irq,tmp);
-
- if (likely(!(desc->status & IRQ_DISABLED)))
- desc->chip->enable(irq);
}
cpus_clear(irq_desc[irq].pending_mask);
}
+
+void move_native_irq(int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ return;
+
+ if (likely(!(desc->status & IRQ_DISABLED)))
+ desc->chip->disable(irq);
+
+ move_masked_irq(irq);
+
+ if (likely(!(desc->status & IRQ_DISABLED)))
+ desc->chip->enable(irq);
+}
+
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 607c7809ad01..9a352667007c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -57,7 +57,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
return -EIO;
- err = cpumask_parse(buffer, count, new_value);
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
return err;
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 35f10f7ff94a..5bfeaed7e487 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)
clear_bit(irq, irqs_resend);
desc = irq_desc + irq;
local_irq_disable();
- desc->handle_irq(irq, desc, NULL);
+ desc->handle_irq(irq, desc);
local_irq_enable();
}
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 417e98092cf2..543ea2e5ad93 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -16,7 +16,7 @@ static int irqfixup __read_mostly;
/*
* Recovery handler for misrouted interrupts.
*/
-static int misrouted_irq(int irq, struct pt_regs *regs)
+static int misrouted_irq(int irq)
{
int i;
int ok = 0;
@@ -49,7 +49,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
while (action) {
/* Only shared IRQ handlers are safe to call */
if (action->flags & IRQF_SHARED) {
- if (action->handler(i, action->dev_id, regs) ==
+ if (action->handler(i, action->dev_id) ==
IRQ_HANDLED)
ok = 1;
}
@@ -70,7 +70,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
*/
work = 1;
spin_unlock(&desc->lock);
- handle_IRQ_event(i, regs, action);
+ handle_IRQ_event(i, action);
spin_lock(&desc->lock);
desc->status &= ~IRQ_PENDING;
}
@@ -136,7 +136,7 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
}
void note_interrupt(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret, struct pt_regs *regs)
+ irqreturn_t action_ret)
{
if (unlikely(action_ret != IRQ_HANDLED)) {
desc->irqs_unhandled++;
@@ -147,7 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
if (unlikely(irqfixup)) {
/* Don't punish working computers */
if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
- int ok = misrouted_irq(irq, regs);
+ int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab16a5a4cfe9..eeac3e313b2b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr)
return in_gate_area_no_task(addr);
}
+static int is_ksym_addr(unsigned long addr)
+{
+ if (all_var)
+ return is_kernel(addr);
+
+ return is_kernel_text(addr) || is_kernel_inittext(addr) ||
+ is_kernel_extratext(addr);
+}
+
/* expand a compressed symbol data into the resulting uncompressed string,
given the offset to where the symbol is in the compressed stream */
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
@@ -154,7 +163,73 @@ unsigned long kallsyms_lookup_name(const char *name)
}
return module_kallsyms_lookup_name(name);
}
-EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
+
+static unsigned long get_symbol_pos(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ unsigned long symbol_start = 0, symbol_end = 0;
+ unsigned long i, low, high, mid;
+
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
+ /* do a binary search on the sorted kallsyms_addresses array */
+ low = 0;
+ high = kallsyms_num_syms;
+
+ while (high - low > 1) {
+ mid = (low + high) / 2;
+ if (kallsyms_addresses[mid] <= addr)
+ low = mid;
+ else
+ high = mid;
+ }
+
+ /*
+ * search for the first aliased symbol. Aliased
+ * symbols are symbols with the same address
+ */
+ while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ --low;
+
+ symbol_start = kallsyms_addresses[low];
+
+ /* Search for next non-aliased symbol */
+ for (i = low + 1; i < kallsyms_num_syms; i++) {
+ if (kallsyms_addresses[i] > symbol_start) {
+ symbol_end = kallsyms_addresses[i];
+ break;
+ }
+ }
+
+ /* if we found no next symbol, we use the end of the section */
+ if (!symbol_end) {
+ if (is_kernel_inittext(addr))
+ symbol_end = (unsigned long)_einittext;
+ else if (all_var)
+ symbol_end = (unsigned long)_end;
+ else
+ symbol_end = (unsigned long)_etext;
+ }
+
+ *symbolsize = symbol_end - symbol_start;
+ *offset = addr - symbol_start;
+
+ return low;
+}
+
+/*
+ * Lookup an address but don't bother to find any names.
+ */
+int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ if (is_ksym_addr(addr))
+ return !!get_symbol_pos(addr, symbolsize, offset);
+
+ return !!module_address_lookup(addr, symbolsize, offset, NULL);
+}
/*
* Lookup an address
@@ -168,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr,
unsigned long *offset,
char **modname, char *namebuf)
{
- unsigned long i, low, high, mid;
const char *msym;
- /* This kernel should never had been booted. */
- BUG_ON(!kallsyms_addresses);
-
namebuf[KSYM_NAME_LEN] = 0;
namebuf[0] = 0;
- if ((all_var && is_kernel(addr)) ||
- (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) ||
- is_kernel_extratext(addr)))) {
- unsigned long symbol_end = 0;
-
- /* do a binary search on the sorted kallsyms_addresses array */
- low = 0;
- high = kallsyms_num_syms;
-
- while (high-low > 1) {
- mid = (low + high) / 2;
- if (kallsyms_addresses[mid] <= addr) low = mid;
- else high = mid;
- }
-
- /* search for the first aliased symbol. Aliased symbols are
- symbols with the same address */
- while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
- --low;
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+ pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
-
- /* Search for next non-aliased symbol */
- for (i = low + 1; i < kallsyms_num_syms; i++) {
- if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
- symbol_end = kallsyms_addresses[i];
- break;
- }
- }
-
- /* if we found no next symbol, we use the end of the section */
- if (!symbol_end) {
- if (is_kernel_inittext(addr))
- symbol_end = (unsigned long)_einittext;
- else
- symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
- }
-
- *symbolsize = symbol_end - kallsyms_addresses[low];
+ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
*modname = NULL;
- *offset = addr - kallsyms_addresses[low];
return namebuf;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
int kexec_should_crash(struct task_struct *p)
{
- if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+ if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
return 1;
return 0;
}
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
- xchg(&kexec_lock, 0); /* Release the mutex */
+ locked = xchg(&kexec_lock, 0); /* Release the mutex */
+ BUG_ON(!locked);
kimage_free(image);
return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- xchg(&kexec_lock, 0);
+ locked = xchg(&kexec_lock, 0);
+ BUG_ON(!locked);
}
}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
len = min(len, fifo->size - fifo->in + fifo->out);
+ /*
+ * Ensure that we sample the fifo->out index -before- we
+ * start putting bytes into the kfifo.
+ */
+
+ smp_mb();
+
/* first put the data starting from fifo->in to buffer end */
l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
/* then put the rest (if any) at the beginning of the buffer */
memcpy(fifo->buffer, buffer + l, len - l);
+ /*
+ * Ensure that we add the bytes to the kfifo -before-
+ * we update the fifo->in index.
+ */
+
+ smp_wmb();
+
fifo->in += len;
return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
len = min(len, fifo->in - fifo->out);
+ /*
+ * Ensure that we sample the fifo->in index -before- we
+ * start removing bytes from the kfifo.
+ */
+
+ smp_rmb();
+
/* first get the data from fifo->out until the end of the buffer */
l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
/* then get the rest (if any) from the beginning of the buffer */
memcpy(buffer + l, fifo->buffer, len - l);
+ /*
+ * Ensure that we remove the bytes from the kfifo -before-
+ * we update the fifo->out index.
+ */
+
+ smp_mb();
+
fifo->out += len;
return len;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb57..bb4e29d924e4 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -18,8 +18,6 @@
call_usermodehelper wait flag, and remove exec_usermodehelper.
Rusty Russell <rusty@rustcorp.com.au> Jan 2003
*/
-#define __KERNEL_SYSCALLS__
-
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
@@ -35,6 +33,7 @@
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/resource.h>
#include <asm/uaccess.h>
extern int max_threads;
@@ -122,6 +121,7 @@ struct subprocess_info {
struct key *ring;
int wait;
int retval;
+ struct file *stdin;
};
/*
@@ -145,12 +145,30 @@ static int ____call_usermodehelper(void *data)
key_put(old_session);
+ /* Install input pipe when needed */
+ if (sub_info->stdin) {
+ struct files_struct *f = current->files;
+ struct fdtable *fdt;
+ /* no races because files should be private here */
+ sys_close(0);
+ fd_install(0, sub_info->stdin);
+ spin_lock(&f->file_lock);
+ fdt = files_fdtable(f);
+ FD_SET(0, fdt->open_fds);
+ FD_CLR(0, fdt->close_on_exec);
+ spin_unlock(&f->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
+ }
+
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
retval = -EPERM;
if (current->fs->root)
- retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+ retval = kernel_execve(sub_info->path,
+ sub_info->argv, sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
@@ -176,6 +194,8 @@ static int wait_for_helper(void *data)
if (pid < 0) {
sub_info->retval = pid;
} else {
+ int ret;
+
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
@@ -185,7 +205,15 @@ static int wait_for_helper(void *data)
*
* Thus the __user pointer cast is valid here.
*/
- sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either ____call_usermodehelper failed and the
+ * real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
}
complete(sub_info->complete);
@@ -258,6 +286,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
}
EXPORT_SYMBOL(call_usermodehelper_keys);
+int call_usermodehelper_pipe(char *path, char **argv, char **envp,
+ struct file **filp)
+{
+ DECLARE_COMPLETION(done);
+ struct subprocess_info sub_info = {
+ .complete = &done,
+ .path = path,
+ .argv = argv,
+ .envp = envp,
+ .retval = 0,
+ };
+ struct file *f;
+ DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+ if (!khelper_wq)
+ return -EBUSY;
+
+ if (path[0] == '\0')
+ return 0;
+
+ f = create_write_pipe();
+ if (!f)
+ return -ENOMEM;
+ *filp = f;
+
+ f = create_read_pipe(f);
+ if (!f) {
+ free_write_pipe(*filp);
+ return -ENOMEM;
+ }
+ sub_info.stdin = f;
+
+ queue_work(khelper_wq, &work);
+ wait_for_completion(&done);
+ return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_pipe);
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3f57dfdc8f92..610c837ad9e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
+#include <linux/kallsyms.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
@@ -45,6 +46,16 @@
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+/*
+ * Some oddball architectures like 64bit powerpc have function descriptors
+ * so this must be overridable.
+ */
+#ifndef kprobe_lookup_name
+#define kprobe_lookup_name(name, addr) \
+ addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
+#endif
+
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
static atomic_t kprobe_count;
@@ -308,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
}
/* Called with kretprobe_lock held */
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
{
/* remove rp inst off the rprobe_inst_table */
hlist_del(&ri->hlist);
@@ -320,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
hlist_add_head(&ri->uflist, &ri->rp->free_instances);
} else
/* Unregistering */
- kfree(ri);
+ hlist_add_head(&ri->hlist, head);
}
struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
@@ -336,18 +348,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
*/
void __kprobes kprobe_flush_task(struct task_struct *tk)
{
- struct kretprobe_instance *ri;
- struct hlist_head *head;
+ struct kretprobe_instance *ri;
+ struct hlist_head *head, empty_rp;
struct hlist_node *node, *tmp;
unsigned long flags = 0;
+ INIT_HLIST_HEAD(&empty_rp);
spin_lock_irqsave(&kretprobe_lock, flags);
- head = kretprobe_inst_table_head(tk);
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
- if (ri->task == tk)
- recycle_rp_inst(ri);
- }
+ head = kretprobe_inst_table_head(tk);
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task == tk)
+ recycle_rp_inst(ri, &empty_rp);
+ }
spin_unlock_irqrestore(&kretprobe_lock, flags);
+
+ hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
}
static inline void free_rp_inst(struct kretprobe *rp)
@@ -447,6 +465,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
struct kprobe *old_p;
struct module *probed_mod;
+ /*
+ * If we have a symbol_name argument look it up,
+ * and add it to the address. That way the addr
+ * field can either be global or relative to a symbol.
+ */
+ if (p->symbol_name) {
+ if (p->addr)
+ return -EINVAL;
+ kprobe_lookup_name(p->symbol_name, p->addr);
+ }
+
+ if (!p->addr)
+ return -EINVAL;
+ p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
+
if ((!kernel_text_address((unsigned long) p->addr)) ||
in_kprobes_functions((unsigned long) p->addr))
return -EINVAL;
@@ -488,7 +521,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
(ARCH_INACTIVE_KPROBE_COUNT + 1))
register_page_fault_notifier(&kprobe_page_fault_nb);
- arch_arm_kprobe(p);
+ arch_arm_kprobe(p);
out:
mutex_unlock(&kprobe_mutex);
diff --git a/kernel/latency.c b/kernel/latency.c
new file mode 100644
index 000000000000..258f2555abbc
--- /dev/null
+++ b/kernel/latency.c
@@ -0,0 +1,279 @@
+/*
+ * latency.c: Explicit system-wide latency-expectation infrastructure
+ *
+ * The purpose of this infrastructure is to allow device drivers to set
+ * latency constraint they have and to collect and summarize these
+ * expectations globally. The cummulated result can then be used by
+ * power management and similar users to make decisions that have
+ * tradoffs with a latency component.
+ *
+ * An example user of this are the x86 C-states; each higher C state saves
+ * more power, but has a higher exit latency. For the idle loop power
+ * code to make a good decision which C-state to use, information about
+ * acceptable latencies is required.
+ *
+ * An example announcer of latency is an audio driver that knowns it
+ * will get an interrupt when the hardware has 200 usec of samples
+ * left in the DMA buffer; in that case the driver can set a latency
+ * constraint of, say, 150 usec.
+ *
+ * Multiple drivers can each announce their maximum accepted latency,
+ * to keep these appart, a string based identifier is used.
+ *
+ *
+ * (C) Copyright 2006 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/latency.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <asm/atomic.h>
+
+struct latency_info {
+ struct list_head list;
+ int usecs;
+ char *identifier;
+};
+
+/*
+ * locking rule: all modifications to current_max_latency and
+ * latency_list need to be done while holding the latency_lock.
+ * latency_lock needs to be taken _irqsave.
+ */
+static atomic_t current_max_latency;
+static DEFINE_SPINLOCK(latency_lock);
+
+static LIST_HEAD(latency_list);
+static BLOCKING_NOTIFIER_HEAD(latency_notifier);
+
+/*
+ * This function returns the maximum latency allowed, which
+ * happens to be the minimum of all maximum latencies on the
+ * list.
+ */
+static int __find_max_latency(void)
+{
+ int min = INFINITE_LATENCY;
+ struct latency_info *info;
+
+ list_for_each_entry(info, &latency_list, list) {
+ if (info->usecs < min)
+ min = info->usecs;
+ }
+ return min;
+}
+
+/**
+ * set_acceptable_latency - sets the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function sleeps and can only be called from process
+ * context.
+ * Calling this function with an existing identifier is valid
+ * and will cause the existing latency setting to be changed.
+ */
+void set_acceptable_latency(char *identifier, int usecs)
+{
+ struct latency_info *info, *iter;
+ unsigned long flags;
+ int found_old = 0;
+
+ info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
+ if (!info)
+ return;
+ info->usecs = usecs;
+ info->identifier = kstrdup(identifier, GFP_KERNEL);
+ if (!info->identifier)
+ goto free_info;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ list_for_each_entry(iter, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier)==0) {
+ found_old = 1;
+ iter->usecs = usecs;
+ break;
+ }
+ }
+ if (!found_old)
+ list_add(&info->list, &latency_list);
+
+ if (usecs < atomic_read(&current_max_latency))
+ atomic_set(&current_max_latency, usecs);
+
+ spin_unlock_irqrestore(&latency_lock, flags);
+
+ blocking_notifier_call_chain(&latency_notifier,
+ atomic_read(&current_max_latency), NULL);
+
+ /*
+ * if we inserted the new one, we're done; otherwise there was
+ * an existing one so we need to free the redundant data
+ */
+ if (!found_old)
+ return;
+
+ kfree(info->identifier);
+free_info:
+ kfree(info);
+}
+EXPORT_SYMBOL_GPL(set_acceptable_latency);
+
+/**
+ * modify_acceptable_latency - changes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ *
+ * Due to the atomic nature of this function, the modified latency
+ * value will only be used for future decisions; past decisions
+ * can still lead to longer latencies in the near future.
+ */
+void modify_acceptable_latency(char *identifier, int usecs)
+{
+ struct latency_info *iter;
+ unsigned long flags;
+
+ spin_lock_irqsave(&latency_lock, flags);
+ list_for_each_entry(iter, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier) == 0) {
+ iter->usecs = usecs;
+ break;
+ }
+ }
+ if (usecs < atomic_read(&current_max_latency))
+ atomic_set(&current_max_latency, usecs);
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(modify_acceptable_latency);
+
+/**
+ * remove_acceptable_latency - removes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ *
+ * This function removes a previously set maximum latency setting
+ * for the driver and frees up any resources associated with the
+ * bookkeeping needed for this.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ */
+void remove_acceptable_latency(char *identifier)
+{
+ unsigned long flags;
+ int newmax = 0;
+ struct latency_info *iter, *temp;
+
+ spin_lock_irqsave(&latency_lock, flags);
+
+ list_for_each_entry_safe(iter, temp, &latency_list, list) {
+ if (strcmp(iter->identifier, identifier) == 0) {
+ list_del(&iter->list);
+ newmax = iter->usecs;
+ kfree(iter->identifier);
+ kfree(iter);
+ break;
+ }
+ }
+
+ /* If we just deleted the system wide value, we need to
+ * recalculate with a full search
+ */
+ if (newmax == atomic_read(&current_max_latency)) {
+ newmax = __find_max_latency();
+ atomic_set(&current_max_latency, newmax);
+ }
+ spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(remove_acceptable_latency);
+
+/**
+ * system_latency_constraint - queries the system wide latency maximum
+ *
+ * This function returns the system wide maximum latency in
+ * microseconds.
+ *
+ * This function does not sleep and can be called in any context.
+ */
+int system_latency_constraint(void)
+{
+ return atomic_read(&current_max_latency);
+}
+EXPORT_SYMBOL_GPL(system_latency_constraint);
+
+/**
+ * synchronize_acceptable_latency - recalculates all latency decisions
+ *
+ * This function will cause a callback to various kernel pieces that
+ * will make those pieces rethink their latency decisions. This implies
+ * that if there are overlong latencies in hardware state already, those
+ * latencies get taken right now. When this call completes no overlong
+ * latency decisions should be active anymore.
+ *
+ * Typical usecase of this is after a modify_acceptable_latency() call,
+ * which in itself is non-blocking and non-synchronizing.
+ *
+ * This function blocks and should not be called with locks held.
+ */
+
+void synchronize_acceptable_latency(void)
+{
+ blocking_notifier_call_chain(&latency_notifier,
+ atomic_read(&current_max_latency), NULL);
+}
+EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
+
+/*
+ * Latency notifier: this notifier gets called when a non-atomic new
+ * latency value gets set. The expectation nof the caller of the
+ * non-atomic set is that when the call returns, future latencies
+ * are within bounds, so the functions on the notifier list are
+ * expected to take the overlong latencies immediately, inside the
+ * callback, and not make a overlong latency decision anymore.
+ *
+ * The callback gets called when the new latency value is made
+ * active so system_latency_constraint() returns the new latency.
+ */
+int register_latency_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_register(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(register_latency_notifier);
+
+int unregister_latency_notifier(struct notifier_block * nb)
+{
+ return blocking_notifier_chain_unregister(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_latency_notifier);
+
+static __init int latency_init(void)
+{
+ atomic_set(&current_max_latency, INFINITE_LATENCY);
+ /*
+ * we don't want by default to have longer latencies than 2 ticks,
+ * since that would cause lost ticks
+ */
+ set_acceptable_latency("kernel", 2*1000000/HZ);
+ return 0;
+}
+
+module_init(latency_init);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c088e5542e84..b739be2a6dc9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
#include <linux/stacktrace.h>
#include <linux/debug_locks.h>
#include <linux/irqflags.h>
+#include <linux/utsname.h>
#include <asm/sections.h>
@@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
* unique.
*/
#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
(key2))
void lockdep_off(void)
@@ -515,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
return 0;
}
+static void print_kernel_version(void)
+{
+ printk("%s %.*s\n", init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+}
+
/*
* When a circular dependency is detected, print the
* header first:
@@ -531,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
printk("\n=======================================================\n");
printk( "[ INFO: possible circular locking dependency detected ]\n");
+ print_kernel_version();
printk( "-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, curr->pid);
@@ -566,6 +575,8 @@ static noinline int print_circular_bug_tail(void)
return 0;
}
+#define RECURSION_LIMIT 40
+
static int noinline print_infinite_recursion_bug(void)
{
__raw_spin_unlock(&hash_lock);
@@ -586,7 +597,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
debug_atomic_inc(&nr_cyclic_check_recursions);
if (depth > max_recursion_depth)
max_recursion_depth = depth;
- if (depth >= 20)
+ if (depth >= RECURSION_LIMIT)
return print_infinite_recursion_bug();
/*
* Check this lock's dependency list:
@@ -636,7 +647,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
if (depth > max_recursion_depth)
max_recursion_depth = depth;
- if (depth >= 20)
+ if (depth >= RECURSION_LIMIT)
return print_infinite_recursion_bug();
debug_atomic_inc(&nr_find_usage_forwards_checks);
@@ -675,7 +686,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
if (depth > max_recursion_depth)
max_recursion_depth = depth;
- if (depth >= 20)
+ if (depth >= RECURSION_LIMIT)
return print_infinite_recursion_bug();
debug_atomic_inc(&nr_find_usage_backwards_checks);
@@ -712,6 +723,7 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("\n======================================================\n");
printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
irqclass, irqclass);
+ print_kernel_version();
printk( "------------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, curr->pid,
@@ -793,6 +805,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
printk("\n=============================================\n");
printk( "[ INFO: possible recursive locking detected ]\n");
+ print_kernel_version();
printk( "---------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, curr->pid);
@@ -1103,8 +1116,6 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
-extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
-
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -1142,8 +1153,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
* (or spin_lock_init()) call - which acts as the key. For static
* locks we use the lock object itself as the key.
*/
- if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
- __error_too_big_MAX_LOCKDEP_SUBCLASSES();
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
key = lock->key->subkeys + subclass;
@@ -1166,7 +1176,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
* itself, so actual lookup of the hash should be once per lock object.
*/
static inline struct lock_class *
-register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
struct list_head *hash_head;
@@ -1238,7 +1248,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
out_unlock_set:
__raw_spin_unlock(&hash_lock);
- if (!subclass)
+ if (!subclass || force)
lock->class_cache = class;
DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
@@ -1375,6 +1385,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
printk("\n=========================================================\n");
printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ print_kernel_version();
printk( "---------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, curr->pid);
@@ -1469,6 +1480,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
printk("\n=================================\n");
printk( "[ INFO: inconsistent lock state ]\n");
+ print_kernel_version();
printk( "---------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
@@ -1924,7 +1936,7 @@ void trace_softirqs_off(unsigned long ip)
* Initialize a lock instance's lock-class mapping info:
*/
void lockdep_init_map(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key)
+ struct lock_class_key *key, int subclass)
{
if (unlikely(!debug_locks))
return;
@@ -1944,6 +1956,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->name = name;
lock->key = key;
lock->class_cache = NULL;
+ if (subclass)
+ register_lock_class(lock, subclass, 1);
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1982,7 +1996,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* Not cached yet or subclass?
*/
if (unlikely(!class)) {
- class = register_lock_class(lock, subclass);
+ class = register_lock_class(lock, subclass, 0);
if (!class)
return 0;
}
diff --git a/kernel/module.c b/kernel/module.c
index b7fe6e840963..67009bd56c52 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod)
return try_module_get(mod);
}
+static inline void add_taint_module(struct module *mod, unsigned flag)
+{
+ add_taint(flag);
+ mod->taints |= flag;
+}
+
/* A thread that wants to hold a reference to a module only while it
* is running can call ths to safely exit.
* nfsd and lockd use this.
@@ -847,11 +853,10 @@ static int check_version(Elf_Shdr *sechdrs,
return 0;
}
/* Not in module's version table. OK, but that taints the kernel. */
- if (!(tainted & TAINT_FORCED_MODULE)) {
+ if (!(tainted & TAINT_FORCED_MODULE))
printk("%s: no version for \"%s\" found: kernel tainted.\n",
mod->name, symname);
- add_taint(TAINT_FORCED_MODULE);
- }
+ add_taint_module(mod, TAINT_FORCED_MODULE);
return 1;
}
@@ -909,7 +914,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
unsigned long ret;
const unsigned long *crc;
- ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
+ ret = __find_symbol(name, &owner, &crc,
+ !(mod->taints & TAINT_PROPRIETARY_MODULE));
if (ret) {
/* use_module can fail due to OOM, or module unloading */
if (!check_version(sechdrs, versindex, name, mod, crc) ||
@@ -933,6 +939,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
return sprintf(buf, "0x%lx\n", sattr->address);
}
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs);
+}
+
static void add_sect_attrs(struct module *mod, unsigned int nsect,
char *secstrings, Elf_Shdr *sechdrs)
{
@@ -949,21 +964,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
+ nloaded * sizeof(sect_attrs->attrs[0]),
sizeof(sect_attrs->grp.attrs[0]));
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
- if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (sect_attrs == NULL)
return;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+ sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
gattr = &sect_attrs->grp.attrs[0];
for (i = 0; i < nsect; i++) {
if (! (sechdrs[i].sh_flags & SHF_ALLOC))
continue;
sattr->address = sechdrs[i].sh_addr;
- strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
- MODULE_SECT_NAME_LEN);
+ sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ GFP_KERNEL);
+ if (sattr->name == NULL)
+ goto out;
+ sect_attrs->nsections++;
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
@@ -979,7 +999,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
mod->sect_attrs = sect_attrs;
return;
out:
- kfree(sect_attrs);
+ free_sect_attrs(sect_attrs);
}
static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1009,13 @@ static void remove_sect_attrs(struct module *mod)
&mod->sect_attrs->grp);
/* We are positive that no one is using any sect attrs
* at this point. Deallocate immediately. */
- kfree(mod->sect_attrs);
+ free_sect_attrs(mod->sect_attrs);
mod->sect_attrs = NULL;
}
}
-
#else
+
static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
char *sectstrings, Elf_Shdr *sechdrs)
{
@@ -1320,11 +1340,11 @@ static void set_license(struct module *mod, const char *license)
if (!license)
license = "unspecified";
- mod->license_gplok = license_is_gpl_compatible(license);
- if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
- printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
- mod->name, license);
- add_taint(TAINT_PROPRIETARY_MODULE);
+ if (!license_is_gpl_compatible(license)) {
+ if (!(tainted & TAINT_PROPRIETARY_MODULE))
+ printk(KERN_WARNING "%s: module license '%s' taints"
+ "kernel.\n", mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
}
}
@@ -1603,7 +1623,7 @@ static struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
- add_taint(TAINT_FORCED_MODULE);
+ add_taint_module(mod, TAINT_FORCED_MODULE);
printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
mod->name);
} else if (!same_magic(modmagic, vermagic)) {
@@ -1698,9 +1718,9 @@ static struct module *load_module(void __user *umod,
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
if (strcmp(mod->name, "driverloader") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, sechdrs, infoindex);
@@ -1745,7 +1765,7 @@ static struct module *load_module(void __user *umod,
(mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
printk(KERN_WARNING "%s: No versions for exported symbols."
" Tainting kernel.\n", mod->name);
- add_taint(TAINT_FORCED_MODULE);
+ add_taint_module(mod, TAINT_FORCED_MODULE);
}
#endif
@@ -2018,7 +2038,8 @@ const char *module_address_lookup(unsigned long addr,
list_for_each_entry(mod, &modules, list) {
if (within(addr, mod->module_init, mod->init_size)
|| within(addr, mod->module_core, mod->core_size)) {
- *modname = mod->name;
+ if (modname)
+ *modname = mod->name;
return get_ksymbol(mod, addr, size, offset);
}
}
@@ -2109,9 +2130,33 @@ static void m_stop(struct seq_file *m, void *p)
mutex_unlock(&module_mutex);
}
+static char *taint_flags(unsigned int taints, char *buf)
+{
+ int bx = 0;
+
+ if (taints) {
+ buf[bx++] = '(';
+ if (taints & TAINT_PROPRIETARY_MODULE)
+ buf[bx++] = 'P';
+ if (taints & TAINT_FORCED_MODULE)
+ buf[bx++] = 'F';
+ /*
+ * TAINT_FORCED_RMMOD: could be added.
+ * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * apply to modules.
+ */
+ buf[bx++] = ')';
+ }
+ buf[bx] = '\0';
+
+ return buf;
+}
+
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
+ char buf[8];
+
seq_printf(m, "%s %lu",
mod->name, mod->init_size + mod->core_size);
print_unload_info(m, mod);
@@ -2124,6 +2169,10 @@ static int m_show(struct seq_file *m, void *p)
/* Used by oprofile and other similar tools. */
seq_printf(m, " 0x%p", mod->module_core);
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", taint_flags(mod->taints, buf));
+
seq_printf(m, "\n");
return 0;
}
@@ -2216,10 +2265,11 @@ struct module *module_text_address(unsigned long addr)
void print_modules(void)
{
struct module *mod;
+ char buf[8];
printk("Modules linked in:");
list_for_each_entry(mod, &modules, list)
- printk(" %s", mod->name);
+ printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
printk("\n");
}
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index e3203c654dda..18651641a7b5 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -91,7 +91,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key);
+ lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
lock->owner = NULL;
lock->magic = lock;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 000000000000..674aceb7335a
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * Jun 2006 - namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+#include <linux/init_task.h>
+#include <linux/namespace.h>
+#include <linux/utsname.h>
+
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+
+static inline void get_nsproxy(struct nsproxy *ns)
+{
+ atomic_inc(&ns->count);
+}
+
+void get_task_namespaces(struct task_struct *tsk)
+{
+ struct nsproxy *ns = tsk->nsproxy;
+ if (ns) {
+ get_nsproxy(ns);
+ }
+}
+
+/*
+ * creates a copy of "orig" with refcount 1.
+ * This does not grab references to the contained namespaces,
+ * so that needs to be done by dup_namespaces.
+ */
+static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+{
+ struct nsproxy *ns;
+
+ ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
+ if (ns)
+ atomic_set(&ns->count, 1);
+ return ns;
+}
+
+/*
+ * copies the nsproxy, setting refcount to 1, and grabbing a
+ * reference to all contained namespaces. Called from
+ * sys_unshare()
+ */
+struct nsproxy *dup_namespaces(struct nsproxy *orig)
+{
+ struct nsproxy *ns = clone_namespaces(orig);
+
+ if (ns) {
+ if (ns->namespace)
+ get_namespace(ns->namespace);
+ if (ns->uts_ns)
+ get_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ get_ipc_ns(ns->ipc_ns);
+ }
+
+ return ns;
+}
+
+/*
+ * called from clone. This now handles copy for nsproxy and all
+ * namespaces therein.
+ */
+int copy_namespaces(int flags, struct task_struct *tsk)
+{
+ struct nsproxy *old_ns = tsk->nsproxy;
+ struct nsproxy *new_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_nsproxy(old_ns);
+
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ return 0;
+
+ new_ns = clone_namespaces(old_ns);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ tsk->nsproxy = new_ns;
+
+ err = copy_namespace(flags, tsk);
+ if (err)
+ goto out_ns;
+
+ err = copy_utsname(flags, tsk);
+ if (err)
+ goto out_uts;
+
+ err = copy_ipcs(flags, tsk);
+ if (err)
+ goto out_ipc;
+
+out:
+ put_nsproxy(old_ns);
+ return err;
+
+out_ipc:
+ if (new_ns->uts_ns)
+ put_uts_ns(new_ns->uts_ns);
+out_uts:
+ if (new_ns->namespace)
+ put_namespace(new_ns->namespace);
+out_ns:
+ tsk->nsproxy = old_ns;
+ kfree(new_ns);
+ goto out;
+}
+
+void free_nsproxy(struct nsproxy *ns)
+{
+ if (ns->namespace)
+ put_namespace(ns->namespace);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ put_ipc_ns(ns->ipc_ns);
+ kfree(ns);
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index 6ceb664fb52a..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,7 +21,6 @@
#include <linux/debug_locks.h>
int panic_on_oops;
-int panic_on_unrecovered_nmi;
int tainted;
static int pause_on_oops;
static int pause_on_oops_flag;
diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532e..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
unsigned int name_skip)
{
struct module_kobject *mk;
+ int ret;
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
kobject_set_name(&mk->kobj, name);
- kobject_register(&mk->kobj);
+ ret = kobject_register(&mk->kobj);
+ BUG_ON(ret < 0);
/* no need to keep the kobject if no parameter is exported */
if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
*/
static int __init param_sysfs_init(void)
{
- subsystem_register(&module_subsys);
+ int ret;
+
+ ret = subsystem_register(&module_subsys);
+ if (ret < 0) {
+ printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+ __FILE__, __LINE__, ret);
+ return ret;
+ }
param_sysfs_builtin();
return 0;
}
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/pid.c b/kernel/pid.c
index 93e212f20671..b914392085f9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
+#include <linux/pspace.h>
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash;
@@ -33,17 +34,20 @@ static int pidhash_shift;
static kmem_cache_t *pid_cachep;
int pid_max = PID_MAX_DEFAULT;
-int last_pid;
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
+
+static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
+{
+ return (map - pspace->pidmap)*BITS_PER_PAGE + off;
+}
+
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
@@ -53,13 +57,12 @@ int pid_max_max = PID_MAX_LIMIT;
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
-typedef struct pidmap {
- atomic_t nr_free;
- void *page;
-} pidmap_t;
-
-static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
- { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+struct pspace init_pspace = {
+ .pidmap = {
+ [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
+ },
+ .last_pid = 0
+};
/*
* Note: disable interrupts while the pidmap_lock is held as an
@@ -74,40 +77,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
* irq handlers that take it we can leave the interrupts enabled.
* For now it is easier to be safe than to prove it can't happen.
*/
+
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static fastcall void free_pidmap(int pid)
+static fastcall void free_pidmap(struct pspace *pspace, int pid)
{
- pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+ struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
-static int alloc_pidmap(void)
+static int alloc_pidmap(struct pspace *pspace)
{
- int i, offset, max_scan, pid, last = last_pid;
- pidmap_t *map;
+ int i, offset, max_scan, pid, last = pspace->last_pid;
+ struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
- map = &pidmap_array[pid/BITS_PER_PAGE];
+ map = &pspace->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/*
* Free the page if someone raced with us
* installing it:
*/
spin_lock_irq(&pidmap_lock);
if (map->page)
- free_page(page);
+ kfree(page);
else
- map->page = (void *)page;
+ map->page = page;
spin_unlock_irq(&pidmap_lock);
if (unlikely(!map->page))
break;
@@ -116,11 +120,11 @@ static int alloc_pidmap(void)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- last_pid = pid;
+ pspace->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
- pid = mk_pid(map, offset);
+ pid = mk_pid(pspace, map, offset);
/*
* find_next_offset() found a bit, the pid from it
* is in-bounds, and if we fell back to the last
@@ -131,16 +135,34 @@ static int alloc_pidmap(void)
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
- if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
+ if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
- map = &pidmap_array[0];
+ map = &pspace->pidmap[0];
offset = RESERVED_PIDS;
if (unlikely(last == offset))
break;
}
- pid = mk_pid(map, offset);
+ pid = mk_pid(pspace, map, offset);
+ }
+ return -1;
+}
+
+static int next_pidmap(struct pspace *pspace, int last)
+{
+ int offset;
+ struct pidmap *map, *end;
+
+ offset = (last + 1) & BITS_PER_PAGE_MASK;
+ map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
+ end = &pspace->pidmap[PIDMAP_ENTRIES];
+ for (; map < end; map++, offset = 0) {
+ if (unlikely(!map->page))
+ continue;
+ offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+ if (offset < BITS_PER_PAGE)
+ return mk_pid(pspace, map, offset);
}
return -1;
}
@@ -153,6 +175,7 @@ fastcall void put_pid(struct pid *pid)
atomic_dec_and_test(&pid->count))
kmem_cache_free(pid_cachep, pid);
}
+EXPORT_SYMBOL_GPL(put_pid);
static void delayed_put_pid(struct rcu_head *rhp)
{
@@ -169,7 +192,7 @@ fastcall void free_pid(struct pid *pid)
hlist_del_rcu(&pid->pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);
- free_pidmap(pid->nr);
+ free_pidmap(&init_pspace, pid->nr);
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -183,7 +206,7 @@ struct pid *alloc_pid(void)
if (!pid)
goto out;
- nr = alloc_pidmap();
+ nr = alloc_pidmap(&init_pspace);
if (nr < 0)
goto out_free;
@@ -217,15 +240,13 @@ struct pid * fastcall find_pid(int nr)
}
return NULL;
}
+EXPORT_SYMBOL_GPL(find_pid);
int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
struct pid_link *link;
struct pid *pid;
- WARN_ON(!task->pid); /* to be removed soon */
- WARN_ON(!nr); /* to be removed soon */
-
link = &task->pids[type];
link->pid = pid = find_pid(nr);
hlist_add_head_rcu(&link->node, &pid->tasks[type]);
@@ -252,6 +273,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
free_pid(pid);
}
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
+ enum pid_type type)
+{
+ new->pids[type].pid = old->pids[type].pid;
+ hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+ old->pids[type].pid = NULL;
+}
+
struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
@@ -274,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr)
EXPORT_SYMBOL(find_task_by_pid_type);
+struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid;
+ rcu_read_lock();
+ pid = get_pid(task->pids[type].pid);
+ rcu_read_unlock();
+ return pid;
+}
+
struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result;
@@ -297,6 +336,26 @@ struct pid *find_get_pid(pid_t nr)
}
/*
+ * Used by proc to find the first pid that is greater then or equal to nr.
+ *
+ * If there is a pid at nr this function is exactly the same as find_pid.
+ */
+struct pid *find_ge_pid(int nr)
+{
+ struct pid *pid;
+
+ do {
+ pid = find_pid(nr);
+ if (pid)
+ break;
+ nr = next_pidmap(&init_pspace, nr);
+ } while (nr > 0);
+
+ return pid;
+}
+EXPORT_SYMBOL_GPL(find_get_pid);
+
+/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
* more.
@@ -323,10 +382,10 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+ init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, pidmap_array->page);
- atomic_dec(&pidmap_array->nr_free);
+ set_bit(0, init_pspace.pidmap[0].page);
+ atomic_dec(&init_pspace.pidmap[0].nr_free);
pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
__alignof__(struct pid),
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..7c3e1e6dfb5b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
}
/*
+ * Divide and limit the result to res >= 1
+ *
+ * This is necessary to prevent signal delivery starvation, when the result of
+ * the division would be rounded down to 0.
+ */
+static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
+{
+ cputime_t res = cputime_div(time, div);
+
+ return max_t(cputime_t, res, 1);
+}
+
+/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
@@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p,
BUG();
break;
case CPUCLOCK_PROF:
- left = cputime_div(cputime_sub(expires.cpu, val.cpu),
- nthreads);
+ left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+ nthreads);
do {
if (likely(!(t->flags & PF_EXITING))) {
ticks = cputime_add(prof_ticks(t), left);
@@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p,
} while (t != p);
break;
case CPUCLOCK_VIRT:
- left = cputime_div(cputime_sub(expires.cpu, val.cpu),
- nthreads);
+ left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+ nthreads);
do {
if (likely(!(t->flags & PF_EXITING))) {
ticks = cputime_add(virt_ticks(t), left);
@@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p,
case CPUCLOCK_SCHED:
nsleft = expires.sched - val.sched;
do_div(nsleft, nthreads);
+ nsleft = max_t(unsigned long long, nsleft, 1);
do {
if (likely(!(t->flags & PF_EXITING))) {
ns = t->sched_time + nsleft;
@@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk,
prof_left = cputime_sub(prof_expires, utime);
prof_left = cputime_sub(prof_left, stime);
- prof_left = cputime_div(prof_left, nthreads);
+ prof_left = cputime_div_non_zero(prof_left, nthreads);
virt_left = cputime_sub(virt_expires, utime);
- virt_left = cputime_div(virt_left, nthreads);
+ virt_left = cputime_div_non_zero(virt_left, nthreads);
if (sched_expires) {
sched_left = sched_expires - sched_time;
do_div(sched_left, nthreads);
+ sched_left = max_t(unsigned long long, sched_left, 1);
} else {
sched_left = 0;
}
@@ -1393,25 +1408,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
}
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct itimerspec *it)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
struct k_itimer timer;
int error;
/*
- * Diagnose required errors first.
- */
- if (CPUCLOCK_PERTHREAD(which_clock) &&
- (CPUCLOCK_PID(which_clock) == 0 ||
- CPUCLOCK_PID(which_clock) == current->pid))
- return -EINVAL;
-
- /*
* Set up a temporary timer and then wait for it to go off.
*/
memset(&timer, 0, sizeof timer);
@@ -1422,11 +1425,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
timer.it_process = current;
if (!error) {
static struct itimerspec zero_it;
- struct itimerspec it = { .it_value = *rqtp,
- .it_interval = {} };
+
+ memset(it, 0, sizeof *it);
+ it->it_value = *rqtp;
spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+ error = posix_cpu_timer_set(&timer, flags, it, NULL);
if (error) {
spin_unlock_irq(&timer.it_lock);
return error;
@@ -1454,49 +1458,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+ posix_cpu_timer_set(&timer, 0, &zero_it, it);
spin_unlock_irq(&timer.it_lock);
- if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+ if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
*/
return 0;
}
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+ struct itimerspec it;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+
+ if (flags & TIMER_ABSTIME)
+ return -ERESTARTNOHAND;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
- copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
- restart_block->fn = posix_cpu_clock_nanosleep_restart;
- /* Caller already set restart_block->arg1 */
+ restart_block->fn = posix_cpu_nsleep_restart;
restart_block->arg0 = which_clock;
restart_block->arg1 = (unsigned long) rmtp;
restart_block->arg2 = rqtp->tv_sec;
restart_block->arg3 = rqtp->tv_nsec;
-
- error = -ERESTART_RESTARTBLOCK;
}
-
return error;
}
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->arg0;
struct timespec __user *rmtp;
struct timespec t;
+ struct itimerspec it;
+ int error;
rmtp = (struct timespec __user *) restart_block->arg1;
t.tv_sec = restart_block->arg2;
t.tv_nsec = restart_block->arg3;
restart_block->fn = do_no_restart_syscall;
- return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
+ error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->arg0 = which_clock;
+ restart_block->arg1 = (unsigned long) rmtp;
+ restart_block->arg2 = t.tv_sec;
+ restart_block->arg3 = t.tv_nsec;
+ }
+ return error;
+
}
@@ -1524,6 +1568,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
{
return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static int thread_cpu_clock_getres(const clockid_t which_clock,
struct timespec *tp)
{
@@ -1544,6 +1592,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
{
return -EINVAL;
}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static __init int init_posix_cpu_timers(void)
{
@@ -1553,6 +1605,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = process_cpu_timer_create,
.nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
};
struct k_clock thread = {
.clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1613,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = thread_cpu_timer_create,
.nsleep = thread_cpu_nsleep,
+ .nsleep_restart = thread_cpu_nsleep_restart,
};
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..9cbb5d1be06f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1,5 +1,5 @@
/*
- * linux/kernel/posix_timers.c
+ * linux/kernel/posix-timers.c
*
*
* 2002-10-15 Posix Clocks & timers
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
return CLOCK_DISPATCH(which_clock, nsleep,
(which_clock, flags, &t, rmtp));
}
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+ return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+
+ return CLOCK_DISPATCH(which_clock, nsleep_restart,
+ (restart_block));
+}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d72234942798..d3a158a60312 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/console.h>
#include <linux/cpu.h>
#include "power.h"
@@ -119,8 +120,10 @@ int pm_suspend_disk(void)
if (error)
return error;
+ suspend_console();
error = device_suspend(PMSG_FREEZE);
if (error) {
+ resume_console();
printk("Some devices failed to suspend\n");
unprepare_processes();
return error;
@@ -133,6 +136,7 @@ int pm_suspend_disk(void)
if (in_suspend) {
device_resume();
+ resume_console();
pr_debug("PM: writing image.\n");
error = swsusp_write();
if (!error)
@@ -148,6 +152,7 @@ int pm_suspend_disk(void)
swsusp_free();
Done:
device_resume();
+ resume_console();
unprepare_processes();
return error;
}
@@ -212,7 +217,9 @@ static int software_resume(void)
pr_debug("PM: Preparing devices for restore.\n");
+ suspend_console();
if ((error = device_suspend(PMSG_PRETHAW))) {
+ resume_console();
printk("Some devices failed to suspend\n");
swsusp_free();
goto Thaw;
@@ -224,6 +231,7 @@ static int software_resume(void)
swsusp_resume();
pr_debug("PM: Restore failed, recovering.n");
device_resume();
+ resume_console();
Thaw:
unprepare_processes();
Done:
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7a4144ba3afd..f1f900ac3164 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -23,8 +23,7 @@ static void do_poweroff(void *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
-static void handle_poweroff(int key, struct pt_regs *pt_regs,
- struct tty_struct *tty)
+static void handle_poweroff(int key, struct tty_struct *tty)
{
schedule_work(&poweroff_work);
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1b84313cbab5..99f9b7d177d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info)
memset(info, 0, sizeof(struct swsusp_info));
info->version_code = LINUX_VERSION_CODE;
info->num_physpages = num_physpages;
- memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
@@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info)
reason = "kernel version";
if (info->num_physpages != num_physpages)
reason = "memory size";
- if (strcmp(info->uts.sysname,system_utsname.sysname))
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
reason = "system type";
- if (strcmp(info->uts.release,system_utsname.release))
+ if (strcmp(info->uts.release,init_utsname()->release))
reason = "kernel release";
- if (strcmp(info->uts.version,system_utsname.version))
+ if (strcmp(info->uts.version,init_utsname()->version))
reason = "version";
- if (strcmp(info->uts.machine,system_utsname.machine))
+ if (strcmp(info->uts.machine,init_utsname()->machine))
reason = "machine";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9b2ee5344dee..1a3b0dd2c3fc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -425,7 +425,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
bio_set_pages_dirty(bio);
bio_put(bio);
} else {
- get_page(page);
+ if (rw == READ)
+ get_page(page); /* These pages are freed later */
bio->bi_private = *bio_chain;
*bio_chain = bio;
submit_bio(rw | (1 << BIO_RW_SYNC), bio);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 72825c853cd7..d991d3b0e5a4 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
+#include <linux/console.h>
#include <linux/cpu.h>
#include <asm/uaccess.h>
@@ -145,10 +146,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = freeze_processes();
if (error) {
thaw_processes();
+ enable_nonboot_cpus();
error = -EBUSY;
}
}
- enable_nonboot_cpus();
up(&pm_sem);
if (!error)
data->frozen = 1;
@@ -173,12 +174,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
if (!error) {
+ suspend_console();
error = device_suspend(PMSG_FREEZE);
if (!error) {
in_suspend = 1;
error = swsusp_suspend();
device_resume();
}
+ resume_console();
}
up(&pm_sem);
if (!error)
@@ -196,11 +199,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
snapshot_free_unused_memory(&data->handle);
down(&pm_sem);
pm_prepare_console();
+ suspend_console();
error = device_suspend(PMSG_PRETHAW);
if (!error) {
error = swsusp_resume();
device_resume();
}
+ resume_console();
pm_restore_console();
up(&pm_sem);
break;
@@ -289,6 +294,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
}
/* Put devices to sleep */
+ suspend_console();
error = device_suspend(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "Failed to suspend some devices.\n");
@@ -299,7 +305,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
/* Wake up devices */
device_resume();
}
-
+ resume_console();
if (pm_ops->finish)
pm_ops->finish(PM_SUSPEND_MEM);
diff --git a/kernel/printk.c b/kernel/printk.c
index 771f5e861bcd..f7d427ef5038 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -820,15 +820,8 @@ void release_console_sem(void)
console_locked = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
- /*
- * If we printk from within the lock dependency code,
- * from within the scheduler code, then do not lock
- * up due to self-recursion:
- */
- if (!lockdep_internal())
- wake_up_interruptible(&log_wait);
- }
+ if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
+ wake_up_interruptible(&log_wait);
}
EXPORT_SYMBOL(release_console_sem);
diff --git a/kernel/profile.c b/kernel/profile.c
index fb660c7d35ba..f940b462eec9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -25,6 +25,7 @@
#include <linux/mutex.h>
#include <asm/sections.h>
#include <asm/semaphore.h>
+#include <asm/irq_regs.h>
struct profile_hit {
u32 pc, hits;
@@ -366,8 +367,10 @@ void profile_hit(int type, void *__pc)
}
#endif /* !CONFIG_SMP */
-void profile_tick(int type, struct pt_regs *regs)
+void profile_tick(int type)
{
+ struct pt_regs *regs = get_irq_regs();
+
if (type == CPU_PROFILING && timer_hook)
timer_hook(regs);
if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
@@ -396,7 +399,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe
unsigned long full_count = count, err;
cpumask_t new_value;
- err = cpumask_parse(buffer, count, new_value);
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
return err;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a111f70145c..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
return 0;
}
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- struct page *page;
- void *old_buf = buf;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
- down_read(&mm->mmap_sem);
- /* ignore errors, just check how much was sucessfully transfered */
- while (len) {
- int bytes, ret, offset;
- void *maddr;
-
- ret = get_user_pages(tsk, mm, addr, 1,
- write, 1, &page, &vma);
- if (ret <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
- } else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
- }
- kunmap(page);
- page_cache_release(page);
- len -= bytes;
- buf += bytes;
- addr += bytes;
- }
- up_read(&mm->mmap_sem);
- mmput(mm);
-
- return buf - old_buf;
-}
-
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
@@ -494,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
child = find_task_by_pid(pid);
if (child)
get_task_struct(child);
+
read_unlock(&tasklist_lock);
if (!child)
return ERR_PTR(-ESRCH);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 523e46483b99..26bb5ffe1ef1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -71,9 +71,6 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
-#ifdef CONFIG_SMP
-static int rsinterval = 1000;
-#endif
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -86,8 +83,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
int cpu;
cpumask_t cpumask;
set_need_resched();
- if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
- rdp->last_rs_qlen = rdp->qlen;
+ if (unlikely(!rcp->signaled)) {
+ rcp->signaled = 1;
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
@@ -301,6 +298,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
smp_mb();
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+ rcp->signaled = 0;
}
}
@@ -628,9 +626,6 @@ void synchronize_rcu(void)
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
-#ifdef CONFIG_SMP
-module_param(rsinterval, int, 0);
-#endif
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d247127..e2bda18f6f42 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -15,9 +15,10 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
- * Copyright (C) IBM Corporation, 2005
+ * Copyright (C) IBM Corporation, 2005, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Josh Triplett <josh@freedesktop.org>
*
* See also: Documentation/RCU/torture.txt
*/
@@ -44,19 +45,25 @@
#include <linux/delay.h>
#include <linux/byteorder/swabb.h>
#include <linux/stat.h>
+#include <linux/srcu.h>
MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
+ "Josh Triplett <josh@freedesktop.org>");
-static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */
+static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
+static int nfakewriters = 4; /* # fake writer threads */
static int stat_interval; /* Interval between stats, in seconds. */
/* Defaults to "only at end of test". */
static int verbose; /* Print more debug info. */
static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-static char *torture_type = "rcu"; /* What to torture. */
+static char *torture_type = "rcu"; /* What RCU implementation to torture. */
module_param(nreaders, int, 0);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+module_param(nfakewriters, int, 0);
+MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
module_param(stat_interval, int, 0);
MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
module_param(verbose, bool, 0);
@@ -66,7 +73,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
module_param(shuffle_interval, int, 0);
MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
module_param(torture_type, charp, 0);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
#define TORTURE_FLAG "-torture:"
#define PRINTK_STRING(s) \
@@ -80,6 +87,7 @@ static char printk_buf[4096];
static int nrealreaders;
static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
static struct task_struct *shuffler_task;
@@ -104,11 +112,12 @@ static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
{ 0 };
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
-atomic_t n_rcu_torture_alloc;
-atomic_t n_rcu_torture_alloc_fail;
-atomic_t n_rcu_torture_free;
-atomic_t n_rcu_torture_mberror;
-atomic_t n_rcu_torture_error;
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
+static struct list_head rcu_torture_removed;
/*
* Allocate an element from the rcu_tortures pool.
@@ -145,7 +154,7 @@ rcu_torture_free(struct rcu_torture *p)
struct rcu_random_state {
unsigned long rrs_state;
- unsigned long rrs_count;
+ long rrs_count;
};
#define RCU_RANDOM_MULT 39916801 /* prime */
@@ -158,7 +167,7 @@ struct rcu_random_state {
* Crude but fast random-number generator. Uses a linear congruential
* generator, with occasional help from get_random_bytes().
*/
-static long
+static unsigned long
rcu_random(struct rcu_random_state *rrsp)
{
long refresh;
@@ -180,9 +189,11 @@ struct rcu_torture_ops {
void (*init)(void);
void (*cleanup)(void);
int (*readlock)(void);
+ void (*readdelay)(struct rcu_random_state *rrsp);
void (*readunlock)(int idx);
int (*completed)(void);
void (*deferredfree)(struct rcu_torture *p);
+ void (*sync)(void);
int (*stats)(char *page);
char *name;
};
@@ -192,13 +203,25 @@ static struct rcu_torture_ops *cur_ops = NULL;
* Definitions for rcu torture testing.
*/
-static int rcu_torture_read_lock(void)
+static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
return 0;
}
-static void rcu_torture_read_unlock(int idx)
+static void rcu_read_delay(struct rcu_random_state *rrsp)
+{
+ long delay;
+ const long longdelay = 200;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay);
+ if (!delay)
+ udelay(longdelay);
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
{
rcu_read_unlock();
}
@@ -239,24 +262,65 @@ static struct rcu_torture_ops rcu_ops = {
.init = NULL,
.cleanup = NULL,
.readlock = rcu_torture_read_lock,
+ .readdelay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.completed = rcu_torture_completed,
.deferredfree = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
.stats = NULL,
.name = "rcu"
};
+static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
+{
+ int i;
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ cur_ops->sync();
+ list_add(&p->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void rcu_sync_torture_init(void)
+{
+ INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
+static struct rcu_torture_ops rcu_sync_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .readdelay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .stats = NULL,
+ .name = "rcu_sync"
+};
+
/*
* Definitions for rcu_bh torture testing.
*/
-static int rcu_bh_torture_read_lock(void)
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
{
rcu_read_lock_bh();
return 0;
}
-static void rcu_bh_torture_read_unlock(int idx)
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
{
rcu_read_unlock_bh();
}
@@ -271,19 +335,176 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
}
+struct rcu_bh_torture_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
+
+static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
+{
+ struct rcu_bh_torture_synchronize *rcu;
+
+ rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
+ complete(&rcu->completion);
+}
+
+static void rcu_bh_torture_synchronize(void)
+{
+ struct rcu_bh_torture_synchronize rcu;
+
+ init_completion(&rcu.completion);
+ call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
+ wait_for_completion(&rcu.completion);
+}
+
static struct rcu_torture_ops rcu_bh_ops = {
.init = NULL,
.cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_bh_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
.stats = NULL,
.name = "rcu_bh"
};
+static struct rcu_torture_ops rcu_bh_sync_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .stats = NULL,
+ .name = "rcu_bh_sync"
+};
+
+/*
+ * Definitions for srcu torture testing.
+ */
+
+static struct srcu_struct srcu_ctl;
+
+static void srcu_torture_init(void)
+{
+ init_srcu_struct(&srcu_ctl);
+ rcu_sync_torture_init();
+}
+
+static void srcu_torture_cleanup(void)
+{
+ synchronize_srcu(&srcu_ctl);
+ cleanup_srcu_struct(&srcu_ctl);
+}
+
+static int srcu_torture_read_lock(void)
+{
+ return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct rcu_random_state *rrsp)
+{
+ long delay;
+ const long uspertick = 1000000 / HZ;
+ const long longdelay = 10;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
+ if (!delay)
+ schedule_timeout_interruptible(longdelay);
+}
+
+static void srcu_torture_read_unlock(int idx)
+{
+ srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static int srcu_torture_completed(void)
+{
+ return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_synchronize(void)
+{
+ synchronize_srcu(&srcu_ctl);
+}
+
+static int srcu_torture_stats(char *page)
+{
+ int cnt = 0;
+ int cpu;
+ int idx = srcu_ctl.completed & 0x1;
+
+ cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
+ per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
+ per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ return cnt;
+}
+
+static struct rcu_torture_ops srcu_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .readdelay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static int sched_torture_completed(void)
+{
+ return 0;
+}
+
+static void sched_torture_synchronize(void)
+{
+ synchronize_sched();
+}
+
+static struct rcu_torture_ops sched_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .stats = NULL,
+ .name = "sched"
+};
+
static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_bh_ops, NULL };
+ { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
+ &sched_ops, NULL };
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
@@ -330,6 +551,30 @@ rcu_torture_writer(void *arg)
}
/*
+ * RCU torture fake writer kthread. Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+ DEFINE_RCU_RANDOM(rand);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, 19);
+
+ do {
+ schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
+ udelay(rcu_random(&rand) & 0x3ff);
+ cur_ops->sync();
+ } while (!kthread_should_stop() && !fullstop);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ return 0;
+}
+
+/*
* RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
* counter in the element should never be greater than 1, otherwise, the
@@ -359,7 +604,7 @@ rcu_torture_reader(void *arg)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- udelay(rcu_random(&rand) & 0x7f);
+ cur_ops->readdelay(&rand);
preempt_disable();
pipe_count = p->rtort_pipe_count;
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -483,7 +728,7 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
* is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
*/
-void rcu_torture_shuffle_tasks(void)
+static void rcu_torture_shuffle_tasks(void)
{
cpumask_t tmp_mask = CPU_MASK_ALL;
int i;
@@ -507,6 +752,12 @@ void rcu_torture_shuffle_tasks(void)
set_cpus_allowed(reader_tasks[i], tmp_mask);
}
+ if (fakewriter_tasks != NULL) {
+ for (i = 0; i < nfakewriters; i++)
+ if (fakewriter_tasks[i])
+ set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
+ }
+
if (writer_task)
set_cpus_allowed(writer_task, tmp_mask);
@@ -540,11 +791,12 @@ rcu_torture_shuffle(void *arg)
static inline void
rcu_torture_print_module_parms(char *tag)
{
- printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval = %d\n",
- torture_type, tag, nrealreaders, stat_interval, verbose,
- test_no_idle_hz, shuffle_interval);
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval);
}
static void
@@ -579,6 +831,19 @@ rcu_torture_cleanup(void)
}
rcu_torture_current = NULL;
+ if (fakewriter_tasks != NULL) {
+ for (i = 0; i < nfakewriters; i++) {
+ if (fakewriter_tasks[i] != NULL) {
+ VERBOSE_PRINTK_STRING(
+ "Stopping rcu_torture_fakewriter task");
+ kthread_stop(fakewriter_tasks[i]);
+ }
+ fakewriter_tasks[i] = NULL;
+ }
+ kfree(fakewriter_tasks);
+ fakewriter_tasks = NULL;
+ }
+
if (stats_task != NULL) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
kthread_stop(stats_task);
@@ -666,7 +931,25 @@ rcu_torture_init(void)
writer_task = NULL;
goto unwind;
}
- reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
+ fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_PRINTK_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nfakewriters; i++) {
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
+ fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
+ "rcu_torture_fakewriter");
+ if (IS_ERR(fakewriter_tasks[i])) {
+ firsterr = PTR_ERR(fakewriter_tasks[i]);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
+ fakewriter_tasks[i] = NULL;
+ goto unwind;
+ }
+ }
+ reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_PRINTK_ERRSTRING("out of memory");
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..f04bbdb56ac2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
* @buf: the buffer struct
* @size: total size of the buffer
*
- * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
* passed in size will get page aligned, if it isn't already.
*/
static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
/**
* relay_create_buf - allocate and initialize a channel buffer
- * @alloc_size: size of the buffer to allocate
- * @n_subbufs: number of sub-buffers in the channel
+ * @chan: the relay channel
*
- * Returns channel buffer if successful, NULL otherwise
+ * Returns channel buffer if successful, %NULL otherwise.
*/
struct rchan_buf *relay_create_buf(struct rchan *chan)
{
@@ -163,6 +162,7 @@ free_buf:
/**
* relay_destroy_channel - free the channel struct
+ * @kref: target kernel reference that contains the relay channel
*
* Should only be called from kref_put().
*/
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
/**
* relay_remove_buf - remove a channel buffer
+ * @kref: target kernel reference that contains the relay buffer
*
* Removes the file from the fileystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
}
EXPORT_SYMBOL_GPL(relay_reset);
-/**
+/*
* relay_open_buf - create a new relay channel buffer
*
* Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
/**
* relay_open - create a new relay channel
* @base_filename: base name of files to create
- * @parent: dentry of parent directory, NULL for root directory
+ * @parent: dentry of parent directory, %NULL for root directory
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
*
- * Returns channel pointer if successful, NULL otherwise.
+ * Returns channel pointer if successful, %NULL otherwise.
*
* Creates a channel buffer for each cpu using the sizes and
* attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
* subbufs_consumed should be the number of sub-buffers newly consumed,
* not the total consumed.
*
- * NOTE: kernel clients don't need to call this function if the channel
+ * NOTE: Kernel clients don't need to call this function if the channel
* mode is 'overwrite'.
*/
void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
* relay_flush - close the channel
* @chan: the channel
*
- * Flushes all channel buffers i.e. forces buffer switch.
+ * Flushes all channel buffers, i.e. forces buffer switch.
*/
void relay_flush(struct rchan *chan)
{
@@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
*/
static int relay_file_open(struct inode *inode, struct file *filp)
{
- struct rchan_buf *buf = inode->u.generic_ip;
+ struct rchan_buf *buf = inode->i_private;
kref_get(&buf->kref);
filp->private_data = buf;
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
return 0;
}
-/**
+/*
* relay_file_read_consume - update the consumed count for the buffer
*/
static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
}
}
-/**
+/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
/**
* relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*/
static size_t relay_file_read_subbuf_avail(size_t read_pos,
struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*
* If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
/**
* relay_file_read_end_pos - return the new read position
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ * @count: number of bytes to be read
*/
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
return end_pos;
}
-/**
+/*
* subbuf_read_actor - read up to one subbuf's worth of data
*/
static int subbuf_read_actor(size_t read_start,
@@ -879,7 +887,7 @@ static int subbuf_read_actor(size_t read_start,
from = buf->start + read_start;
ret = avail;
- if (copy_to_user(desc->arg.data, from, avail)) {
+ if (copy_to_user(desc->arg.buf, from, avail)) {
desc->error = -EFAULT;
ret = 0;
}
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
return ret;
}
-/**
+/*
* subbuf_send_actor - send up to one subbuf's worth of data
*/
static int subbuf_send_actor(size_t read_start,
@@ -933,29 +941,22 @@ typedef int (*subbuf_actor_t) (size_t read_start,
read_descriptor_t *desc,
read_actor_t actor);
-/**
+/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static inline ssize_t relay_file_read_subbufs(struct file *filp,
loff_t *ppos,
- size_t count,
subbuf_actor_t subbuf_actor,
read_actor_t actor,
- void *target)
+ read_descriptor_t *desc)
{
struct rchan_buf *buf = filp->private_data;
size_t read_start, avail;
- read_descriptor_t desc;
int ret;
- if (!count)
+ if (!desc->count)
return 0;
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
mutex_lock(&filp->f_dentry->d_inode->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
@@ -966,19 +967,19 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
if (!avail)
break;
- avail = min(desc.count, avail);
- ret = subbuf_actor(read_start, buf, avail, &desc, actor);
- if (desc.error < 0)
+ avail = min(desc->count, avail);
+ ret = subbuf_actor(read_start, buf, avail, desc, actor);
+ if (desc->error < 0)
break;
if (ret) {
relay_file_read_consume(buf, read_start, ret);
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
- } while (desc.count && ret);
+ } while (desc->count && ret);
mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
- return desc.written;
+ return desc->written;
}
static ssize_t relay_file_read(struct file *filp,
@@ -986,8 +987,13 @@ static ssize_t relay_file_read(struct file *filp,
size_t count,
loff_t *ppos)
{
- return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
- NULL, buffer);
+ read_descriptor_t desc;
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.buf = buffer;
+ desc.error = 0;
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
+ NULL, &desc);
}
static ssize_t relay_file_sendfile(struct file *filp,
@@ -996,8 +1002,13 @@ static ssize_t relay_file_sendfile(struct file *filp,
read_actor_t actor,
void *target)
{
- return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
- actor, target);
+ read_descriptor_t desc;
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.data = target;
+ desc.error = 0;
+ return relay_file_read_subbufs(filp, ppos, subbuf_send_actor,
+ actor, &desc);
}
struct file_operations relay_file_operations = {
diff --git a/kernel/resource.c b/kernel/resource.c
index 46286434af80..6de60c12143e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -193,6 +193,13 @@ static int __release_resource(struct resource *old)
return -EINVAL;
}
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
int request_resource(struct resource *root, struct resource *new)
{
struct resource *conflict;
@@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new)
EXPORT_SYMBOL(request_resource);
+/**
+ * ____request_resource - reserve a resource, with resource conflict returned
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns:
+ * On success, NULL is returned.
+ * On error, a pointer to the conflicting resource is returned.
+ */
struct resource *____request_resource(struct resource *root, struct resource *new)
{
struct resource *conflict;
@@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne
EXPORT_SYMBOL(____request_resource);
+/**
+ * release_resource - release a previously reserved resource
+ * @old: resource pointer
+ */
int release_resource(struct resource *old)
{
int retval;
@@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new,
return -EBUSY;
}
-/*
- * Allocate empty slot in the resource tree given range and alignment.
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum size to allocate
+ * @max: maximum size to allocate
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional, called if not NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
*/
int allocate_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
@@ -344,12 +372,11 @@ EXPORT_SYMBOL(allocate_resource);
*
* Returns 0 on success, -EBUSY if the resource can't be inserted.
*
- * This function is equivalent of request_resource when no conflict
+ * This function is equivalent to request_resource when no conflict
* happens. If a conflict happens, and the conflicting resources
* entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become childs of
- * the new resource. Otherwise the new resource becomes the child of
- * the conflicting resource
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
*/
int insert_resource(struct resource *parent, struct resource *new)
{
@@ -357,20 +384,21 @@ int insert_resource(struct resource *parent, struct resource *new)
struct resource *first, *next;
write_lock(&resource_lock);
- begin:
- result = 0;
- first = __request_resource(parent, new);
- if (!first)
- goto out;
- result = -EBUSY;
- if (first == parent)
- goto out;
+ for (;; parent = first) {
+ result = 0;
+ first = __request_resource(parent, new);
+ if (!first)
+ goto out;
- /* Resource fully contained by the clashing resource? Recurse into it */
- if (first->start <= new->start && first->end >= new->end) {
- parent = first;
- goto begin;
+ result = -EBUSY;
+ if (first == parent)
+ goto out;
+
+ if ((first->start > new->start) || (first->end < new->end))
+ break;
+ if ((first->start == new->start) && (first->end == new->end))
+ break;
}
for (next = first; ; next = next->sibling) {
@@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new)
return result;
}
-/*
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
* Given an existing resource, change its start and size to match the
- * arguments. Returns -EBUSY if it can't fit. Existing children of
- * the resource are assumed to be immutable.
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
{
@@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource);
* Note how this, unlike the above, knows about
* the IO flag meanings (busy etc).
*
- * Request-region creates a new busy region.
+ * request_region creates a new busy region.
*
- * Check-region returns non-zero if the area is already busy
+ * check_region returns non-zero if the area is already busy.
*
- * Release-region releases a matching busy region.
+ * release_region releases a matching busy region.
+ */
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
*/
struct resource * __request_region(struct resource *parent,
resource_size_t start, resource_size_t n,
@@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent,
}
return res;
}
-
EXPORT_SYMBOL(__request_region);
+/**
+ * __check_region - check if a resource region is busy or free
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * Returns 0 if the region is free at the moment it is checked,
+ * returns %-EBUSY if the region is busy.
+ *
+ * NOTE:
+ * This function is deprecated because its use is racy.
+ * Even if it returns 0, a subsequent call to request_region()
+ * may fail because another driver etc. just allocated the region.
+ * Do NOT use it. It will be removed from the kernel.
+ */
int __check_region(struct resource *parent, resource_size_t start,
resource_size_t n)
{
@@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start,
kfree(res);
return 0;
}
-
EXPORT_SYMBOL(__check_region);
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ */
void __release_region(struct resource *parent, resource_size_t start,
resource_size_t n)
{
@@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start,
"<%016llx-%016llx>\n", (unsigned long long)start,
(unsigned long long)end);
}
-
EXPORT_SYMBOL(__release_region);
/*
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 0c1faa950af7..da8d6bf46457 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -16,7 +16,6 @@
*
* See rt.c in preempt-rt for proper credits and further information
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/module.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 948bd8f643e2..6dcea9dd8c94 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,7 +6,6 @@
* Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
*/
-#include <linux/config.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856f..4ab17da46fd8 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* Grab the next task */
task = rt_mutex_owner(lock);
+ get_task_struct(task);
spin_lock_irqsave(&task->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
__rt_mutex_adjust_prio(task);
}
- get_task_struct(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
unsigned long flags;
- int boost = 0, res;
+ int chain_walk = 0, res;
spin_lock_irqsave(&current->pi_lock, flags);
__rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
- spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
- else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
- spin_lock_irqsave(&owner->pi_lock, flags);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
spin_unlock_irqrestore(&owner->pi_lock, flags);
}
- if (!boost)
+ else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ chain_walk = 1;
+
+ if (!chain_walk)
return 0;
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
unsigned long flags;
- int boost = 0;
+ int chain_walk = 0;
spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
}
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on) {
- boost = 1;
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(owner);
- }
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+
spin_unlock_irqrestore(&owner->pi_lock, flags);
}
WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
- if (!boost)
+ if (!chain_walk)
return;
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
spin_unlock(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
return;
}
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c848fd4e461..3399701c680e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <asm/tlb.h>
@@ -160,15 +160,6 @@
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
-/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- */
-
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+/*
+ * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * to time slice values: [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+
static inline unsigned int task_timeslice(struct task_struct *p)
{
return static_prio_timeslice(p->static_prio);
@@ -1232,7 +1232,7 @@ nextgroup:
}
/*
- * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
while (sd) {
cpumask_t span;
struct sched_group *group;
- int new_cpu;
- int weight;
+ int new_cpu, weight;
+
+ if (!(sd->flags & flag)) {
+ sd = sd->child;
+ continue;
+ }
span = sd->span;
group = find_idlest_group(sd, t, cpu);
- if (!group)
- goto nextlevel;
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
new_cpu = find_idlest_cpu(group, t, cpu);
- if (new_cpu == -1 || new_cpu == cpu)
- goto nextlevel;
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
- /* Now try balancing at a lower domain level */
+ /* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
-nextlevel:
sd = NULL;
weight = cpus_weight(span);
for_each_domain(cpu, tmp) {
@@ -1755,27 +1763,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
- unsigned long prev_task_flags;
+ long prev_state;
rq->prev_mm = NULL;
/*
* A task struct has one reference for the use as "current".
- * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
- * calls schedule one last time. The schedule call will never return,
- * and the scheduled task must drop that reference.
- * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
* still held, otherwise prev could be scheduled on another cpu, die
* there before we look at prev->state, and then the reference would
* be dropped twice.
* Manfred Spraul <manfred@colorfullife.com>
*/
- prev_task_flags = prev->flags;
+ prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
- if (unlikely(prev_task_flags & PF_DEAD)) {
+ if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -1814,14 +1822,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
- if (unlikely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (unlikely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
@@ -2533,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
cpumask_t cpus = CPU_MASK_ALL;
+ /*
+ * When power savings policy is enabled for the parent domain, idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of idle sibling percolate up as IDLE, instead of
+ * portraying it as NOT_IDLE.
+ */
if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
@@ -2630,7 +2644,7 @@ redo:
}
if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
return nr_moved;
@@ -2646,7 +2660,7 @@ out_one_pinned:
sd->balance_interval *= 2;
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
return 0;
}
@@ -2668,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
int sd_idle = 0;
cpumask_t cpus = CPU_MASK_ALL;
- if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+ /*
+ * When power savings policy is enabled for the parent domain, idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of idle sibling percolate up as IDLE, instead of
+ * portraying it as NOT_IDLE.
+ */
+ if (sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2709,7 +2730,8 @@ redo:
if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
} else
sd->nr_balance_failed = 0;
@@ -2719,7 +2741,7 @@ redo:
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
sd->nr_balance_failed = 0;
@@ -3348,9 +3370,6 @@ need_resched_nonpreemptible:
spin_lock_irq(&rq->lock);
- if (unlikely(prev->flags & PF_DEAD))
- prev->state = EXIT_DEAD;
-
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
switch_count = &prev->nvcsw;
@@ -3472,7 +3491,7 @@ asmlinkage void __sched preempt_schedule(void)
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (unlikely(ti->preempt_count || irqs_disabled()))
+ if (likely(ti->preempt_count || irqs_disabled()))
return;
need_resched:
@@ -4080,6 +4099,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @p: the task in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
*/
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
@@ -4107,28 +4128,32 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (!capable(CAP_SYS_NICE)) {
- /*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
- */
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
- !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
- /* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
- param->sched_priority > p->rt_priority &&
- param->sched_priority >
- p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return -ESRCH;
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ unlock_task_sighand(p, &flags);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
/* can't change other user's priorities */
if ((current->euid != p->euid) &&
(current->euid != p->uid))
@@ -4193,14 +4218,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- read_lock_irq(&tasklist_lock);
+
+ rcu_read_lock();
+ retval = -ESRCH;
p = find_process_by_pid(pid);
- if (!p) {
- read_unlock_irq(&tasklist_lock);
- return -ESRCH;
- }
- retval = sched_setscheduler(p, policy, &lparam);
- read_unlock_irq(&tasklist_lock);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
return retval;
}
@@ -4382,7 +4406,10 @@ EXPORT_SYMBOL(cpu_present_map);
#ifndef CONFIG_SMP
cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
#endif
long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4812,7 +4839,7 @@ void show_state(void)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -5151,7 +5178,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->flags & PF_DEAD);
+ BUG_ON(p->state == TASK_DEAD);
get_task_struct(p);
@@ -5272,9 +5299,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
+ int err;
/* Start one for the boot CPU: */
- migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
@@ -5385,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
if (sd->flags & (SD_LOAD_BALANCE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
- SD_BALANCE_EXEC)) {
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5419,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
pflags &= ~(SD_LOAD_BALANCE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
- SD_BALANCE_EXEC);
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES);
}
if (~cflags & pflags)
return 0;
@@ -5441,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
- if (sd_parent_degenerate(tmp, parent))
+ if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ }
}
- if (sd && sd_degenerate(sd))
+ if (sd && sd_degenerate(sd)) {
sd = sd->parent;
+ if (sd)
+ sd->child = NULL;
+ }
sched_domain_debug(sd, cpu);
@@ -5454,7 +5493,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
}
/* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -5482,15 +5521,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
- int (*group_fn)(int cpu))
+static void
+init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ const cpumask_t *cpu_map,
+ int (*group_fn)(int cpu, const cpumask_t *cpu_map))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
int i;
for_each_cpu_mask(i, span) {
- int group = group_fn(i);
+ int group = group_fn(i, cpu_map);
struct sched_group *sg = &groups[group];
int j;
@@ -5501,7 +5542,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
sg->cpu_power = 0;
for_each_cpu_mask(j, span) {
- if (group_fn(j) != group)
+ if (group_fn(j, cpu_map) != group)
continue;
cpu_set(j, covered);
@@ -5968,13 +6009,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
#endif
);
if (system_state == SYSTEM_BOOTING) {
- printk("migration_cost=");
- for (distance = 0; distance <= max_distance; distance++) {
- if (distance)
- printk(",");
- printk("%ld", (long)migration_cost[distance] / 1000);
+ if (num_online_cpus() > 1) {
+ printk("migration_cost=");
+ for (distance = 0; distance <= max_distance; distance++) {
+ if (distance)
+ printk(",");
+ printk("%ld", (long)migration_cost[distance] / 1000);
+ }
+ printk("\n");
}
- printk("\n");
}
j1 = jiffies;
if (migration_debug)
@@ -6077,7 +6120,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
{
return cpu;
}
@@ -6088,31 +6131,36 @@ static int cpu_to_cpu_group(int cpu)
*/
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group *sched_group_core_bycpu[NR_CPUS];
+static struct sched_group sched_group_core[NR_CPUS];
#endif
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
{
- return first_cpu(cpu_sibling_map[cpu]);
+ cpumask_t mask = cpu_sibling_map[cpu];
+ cpus_and(mask, mask, *cpu_map);
+ return first_cpu(mask);
}
#elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
{
return cpu;
}
#endif
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
{
#ifdef CONFIG_SCHED_MC
cpumask_t mask = cpu_coregroup_map(cpu);
+ cpus_and(mask, mask, *cpu_map);
return first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
- return first_cpu(cpu_sibling_map[cpu]);
+ cpumask_t mask = cpu_sibling_map[cpu];
+ cpus_and(mask, mask, *cpu_map);
+ return first_cpu(mask);
#else
return cpu;
#endif
@@ -6130,7 +6178,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-static int cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
{
return cpu_to_node(cpu);
}
@@ -6162,12 +6210,11 @@ next_sg:
}
#endif
+#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
static void free_sched_groups(const cpumask_t *cpu_map)
{
- int cpu;
-#ifdef CONFIG_NUMA
- int i;
+ int cpu, i;
for_each_cpu_mask(cpu, *cpu_map) {
struct sched_group *sched_group_allnodes
@@ -6204,19 +6251,63 @@ next_sg:
kfree(sched_group_nodes);
sched_group_nodes_bycpu[cpu] = NULL;
}
+}
+#else
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
#endif
- for_each_cpu_mask(cpu, *cpu_map) {
- if (sched_group_phys_bycpu[cpu]) {
- kfree(sched_group_phys_bycpu[cpu]);
- sched_group_phys_bycpu[cpu] = NULL;
- }
-#ifdef CONFIG_SCHED_MC
- if (sched_group_core_bycpu[cpu]) {
- kfree(sched_group_core_bycpu[cpu]);
- sched_group_core_bycpu[cpu] = NULL;
- }
-#endif
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+ struct sched_domain *child;
+ struct sched_group *group;
+
+ WARN_ON(!sd || !sd->groups);
+
+ if (cpu != first_cpu(sd->groups->cpumask))
+ return;
+
+ child = sd->child;
+
+ /*
+ * For perf policy, if the groups in child domain share resources
+ * (for example cores sharing some portions of the cache hierarchy
+ * or SMT), then set this domain groups cpu_power such that each group
+ * can handle only one task, when there are other idle groups in the
+ * same sched domain.
+ */
+ if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+ (child->flags &
+ (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ return;
}
+
+ sd->groups->cpu_power = 0;
+
+ /*
+ * add cpu_power of each child group to this groups cpu_power
+ */
+ group = child->groups;
+ do {
+ sd->groups->cpu_power += group->cpu_power;
+ group = group->next;
+ } while (group != child->groups);
}
/*
@@ -6226,10 +6317,7 @@ next_sg:
static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- struct sched_group *sched_group_phys = NULL;
-#ifdef CONFIG_SCHED_MC
- struct sched_group *sched_group_core = NULL;
-#endif
+ struct sched_domain *sd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
struct sched_group *sched_group_allnodes = NULL;
@@ -6261,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
if (!sched_group_allnodes) {
sched_group_allnodes
- = kmalloc(sizeof(struct sched_group)
- * MAX_NUMNODES,
- GFP_KERNEL);
+ = kmalloc_node(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL,
+ cpu_to_node(i));
if (!sched_group_allnodes) {
printk(KERN_WARNING
"Can not alloc allnodes sched group\n");
@@ -6275,7 +6364,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
sd->span = *cpu_map;
- group = cpu_to_allnodes_group(i);
+ group = cpu_to_allnodes_group(i, cpu_map);
sd->groups = &sched_group_allnodes[group];
p = sd;
} else
@@ -6285,60 +6374,42 @@ static int build_sched_domains(const cpumask_t *cpu_map)
*sd = SD_NODE_INIT;
sd->span = sched_domain_node_span(cpu_to_node(i));
sd->parent = p;
+ if (p)
+ p->child = sd;
cpus_and(sd->span, sd->span, *cpu_map);
#endif
- if (!sched_group_phys) {
- sched_group_phys
- = kmalloc(sizeof(struct sched_group) * NR_CPUS,
- GFP_KERNEL);
- if (!sched_group_phys) {
- printk (KERN_WARNING "Can not alloc phys sched"
- "group\n");
- goto error;
- }
- sched_group_phys_bycpu[i] = sched_group_phys;
- }
-
p = sd;
sd = &per_cpu(phys_domains, i);
- group = cpu_to_phys_group(i);
+ group = cpu_to_phys_group(i, cpu_map);
*sd = SD_CPU_INIT;
sd->span = nodemask;
sd->parent = p;
+ if (p)
+ p->child = sd;
sd->groups = &sched_group_phys[group];
#ifdef CONFIG_SCHED_MC
- if (!sched_group_core) {
- sched_group_core
- = kmalloc(sizeof(struct sched_group) * NR_CPUS,
- GFP_KERNEL);
- if (!sched_group_core) {
- printk (KERN_WARNING "Can not alloc core sched"
- "group\n");
- goto error;
- }
- sched_group_core_bycpu[i] = sched_group_core;
- }
-
p = sd;
sd = &per_cpu(core_domains, i);
- group = cpu_to_core_group(i);
+ group = cpu_to_core_group(i, cpu_map);
*sd = SD_MC_INIT;
sd->span = cpu_coregroup_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
+ p->child = sd;
sd->groups = &sched_group_core[group];
#endif
#ifdef CONFIG_SCHED_SMT
p = sd;
sd = &per_cpu(cpu_domains, i);
- group = cpu_to_cpu_group(i);
+ group = cpu_to_cpu_group(i, cpu_map);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
+ p->child = sd;
sd->groups = &sched_group_cpus[group];
#endif
}
@@ -6352,7 +6423,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
continue;
init_sched_build_groups(sched_group_cpus, this_sibling_map,
- &cpu_to_cpu_group);
+ cpu_map, &cpu_to_cpu_group);
}
#endif
@@ -6364,7 +6435,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
if (i != first_cpu(this_core_map))
continue;
init_sched_build_groups(sched_group_core, this_core_map,
- &cpu_to_core_group);
+ cpu_map, &cpu_to_core_group);
}
#endif
@@ -6378,14 +6449,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
continue;
init_sched_build_groups(sched_group_phys, nodemask,
- &cpu_to_phys_group);
+ cpu_map, &cpu_to_phys_group);
}
#ifdef CONFIG_NUMA
/* Set up node groups */
if (sched_group_allnodes)
init_sched_build_groups(sched_group_allnodes, *cpu_map,
- &cpu_to_allnodes_group);
+ cpu_map, &cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
/* Set up node groups */
@@ -6457,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- struct sched_domain *sd;
sd = &per_cpu(cpu_domains, i);
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu_mask(i, *cpu_map) {
- int power;
- struct sched_domain *sd;
sd = &per_cpu(core_domains, i);
- if (sched_smt_power_savings)
- power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
- else
- power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
- * SCHED_LOAD_SCALE / 10;
- sd->groups->cpu_power = power;
+ init_sched_groups_power(i, sd);
}
#endif
for_each_cpu_mask(i, *cpu_map) {
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(phys_domains, i);
- if (i != first_cpu(sd->groups->cpumask))
- continue;
-
- sd->groups->cpu_power = 0;
- if (sched_mc_power_savings || sched_smt_power_savings) {
- int j;
-
- for_each_cpu_mask(j, sd->groups->cpumask) {
- struct sched_domain *sd1;
- sd1 = &per_cpu(core_domains, j);
- /*
- * for each core we will add once
- * to the group in physical domain
- */
- if (j != first_cpu(sd1->groups->cpumask))
- continue;
-
- if (sched_smt_power_savings)
- sd->groups->cpu_power += sd1->groups->cpu_power;
- else
- sd->groups->cpu_power += SCHED_LOAD_SCALE;
- }
- } else
- /*
- * This has to be < 2 * SCHED_LOAD_SCALE
- * Lets keep it SCHED_LOAD_SCALE, so that
- * while calculating NUMA group's cpu_power
- * we can simply do
- * numa_group->cpu_power += phys_group->cpu_power;
- *
- * See "only add power once for each physical pkg"
- * comment below
- */
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
- int power;
sd = &per_cpu(phys_domains, i);
- if (sched_smt_power_savings)
- power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
- else
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
-#endif
+ init_sched_groups_power(i, sd);
}
#ifdef CONFIG_NUMA
@@ -6530,7 +6549,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
init_numa_sched_groups_power(sched_group_nodes[i]);
if (sched_group_allnodes) {
- int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+ int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
struct sched_group *sg = &sched_group_allnodes[group];
init_numa_sched_groups_power(sg);
@@ -6556,9 +6575,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
return 0;
+#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map);
return -ENOMEM;
+#endif
}
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -6740,11 +6761,20 @@ static int update_sched_domains(struct notifier_block *nfb,
void __init sched_init_smp(void)
{
+ cpumask_t non_isolated_cpus;
+
lock_cpu_hotplug();
arch_init_sched_domains(&cpu_online_map);
+ cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+ if (cpus_empty(non_isolated_cpus))
+ cpu_set(smp_processor_id(), non_isolated_cpus);
unlock_cpu_hotplug();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+ BUG();
}
#else
void __init sched_init_smp(void)
diff --git a/kernel/signal.c b/kernel/signal.c
index bfdb5686fa3e..7ed8d5304bec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
siginfo_t *info)
{
- int sig = 0;
+ int sig = next_signal(pending, mask);
- sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
if (!collect_signal(sig, pending, info))
sig = 0;
-
}
- recalc_sigpending();
return sig;
}
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
if (!signr)
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
+ recalc_sigpending_tsk(tsk);
if (signr && unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
@@ -1057,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
}
/*
- * kill_pg_info() sends a signal to a process group: this is what the tty
+ * kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
*/
-int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
int retval, success;
- if (pgrp <= 0)
- return -EINVAL;
-
success = 0;
retval = -ESRCH;
- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p);
success |= !err;
retval = err;
- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return success ? 0 : retval;
}
+int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = __kill_pgrp_info(sig, info, pgrp);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+ if (pgrp <= 0)
+ return -EINVAL;
+
+ return __kill_pgrp_info(sig, info, find_pid(pgrp));
+}
+
int
kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
{
@@ -1091,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
return retval;
}
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
{
int error;
int acquired_tasklist_lock = 0;
@@ -1103,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
read_lock(&tasklist_lock);
acquired_tasklist_lock = 1;
}
- p = find_task_by_pid(pid);
+ p = pid_task(pid, PIDTYPE_PID);
error = -ESRCH;
if (p)
error = group_send_sig_info(sig, info, p);
@@ -1113,8 +1126,18 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
return error;
}
-/* like kill_proc_info(), but doesn't use uid/euid of "current" */
-int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
+int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int error;
+ rcu_read_lock();
+ error = kill_pid_info(sig, info, find_pid(pid));
+ rcu_read_unlock();
+ return error;
+}
+
+/* like kill_pid_info(), but doesn't use uid/euid of "current" */
+int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
uid_t uid, uid_t euid, u32 secid)
{
int ret = -EINVAL;
@@ -1124,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
return ret;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
+ p = pid_task(pid, PIDTYPE_PID);
if (!p) {
ret = -ESRCH;
goto out_unlock;
@@ -1148,7 +1171,7 @@ out_unlock:
read_unlock(&tasklist_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1266,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p)
return 0;
}
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ return kill_pgrp_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+ return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
int
kill_pg(pid_t pgrp, int sig, int priv)
{
@@ -2577,6 +2612,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
}
#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
void __init signals_init(void)
{
sigqueue_cachep =
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197c..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
__init int spawn_ksoftirqd(void)
{
void *cpu = (void *)(long)smp_processor_id();
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b787..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
__init void spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 9644a41e0bef..476c3741511b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,17 +21,6 @@
#include <linux/debug_locks.h>
#include <linux/module.h>
-/*
- * Generic declaration of the raw read_trylock() function,
- * architectures are supposed to optimize this:
- */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
-{
- __raw_read_lock(lock);
- return 1;
-}
-EXPORT_SYMBOL(generic__raw_read_trylock);
-
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
@@ -226,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
- cpu_relax(); \
+ _raw_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
} \
@@ -248,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
- cpu_relax(); \
+ _raw_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
return flags; \
diff --git a/kernel/srcu.c b/kernel/srcu.c
new file mode 100644
index 000000000000..3507cabe963b
--- /dev/null
+++ b/kernel/srcu.c
@@ -0,0 +1,258 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/srcu.h>
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ sp->completed = 0;
+ mutex_init(&sp->mutex);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ return (sp->per_cpu_ref ? 0 : -ENOMEM);
+}
+
+/*
+ * srcu_readers_active_idx -- returns approximate number of readers
+ * active on the specified rank of per-CPU counters.
+ */
+
+static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ int sum;
+
+ sum = 0;
+ for_each_possible_cpu(cpu)
+ sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
+ return sum;
+}
+
+/**
+ * srcu_readers_active - returns approximate number of readers.
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+int srcu_readers_active(struct srcu_struct *sp)
+{
+ return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ int sum;
+
+ sum = srcu_readers_active(sp);
+ WARN_ON(sum); /* Leakage unless caller handles error. */
+ if (sum != 0)
+ return;
+ free_percpu(sp->per_cpu_ref);
+ sp->per_cpu_ref = NULL;
+}
+
+/**
+ * srcu_read_lock - register a new reader for an SRCU-protected structure.
+ * @sp: srcu_struct in which to register the new reader.
+ *
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Must be called from process context.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ preempt_disable();
+ idx = sp->completed & 0x1;
+ barrier(); /* ensure compiler looks -once- at sp->completed. */
+ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
+ srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ preempt_enable();
+ return idx;
+}
+
+/**
+ * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
+ * @sp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock().
+ *
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ * Must be called from process context.
+ */
+void srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ preempt_disable();
+ srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+ preempt_enable();
+}
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates. Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchornize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = sp->completed;
+ mutex_lock(&sp->mutex);
+
+ /*
+ * Check to see if someone else did the work for us while we were
+ * waiting to acquire the lock. We need -two- advances of
+ * the counter, not just one. If there was but one, we might have
+ * shown up -after- our helper's first synchronize_sched(), thus
+ * having failed to prevent CPU-reordering races with concurrent
+ * srcu_read_unlock()s on other CPUs (see comment below). So we
+ * either (1) wait for two or (2) supply the second ourselves.
+ */
+
+ if ((sp->completed - idx) >= 2) {
+ mutex_unlock(&sp->mutex);
+ return;
+ }
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() ensures that any CPU that
+ * sees the new value of sp->completed will also see any preceding
+ * changes to data structures made by this CPU. This prevents
+ * some other CPU from reordering the accesses in its SRCU
+ * read-side critical section to precede the corresponding
+ * srcu_read_lock() -- ensuring that such references will in
+ * fact be protected.
+ *
+ * So it is now safe to do the flip.
+ */
+
+ idx = sp->completed & 0x1;
+ sp->completed++;
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * At this point, because of the preceding synchronize_sched(),
+ * all srcu_read_lock() calls using the old counters have completed.
+ * Their corresponding critical sections might well be still
+ * executing, but the srcu_read_lock() primitives themselves
+ * will have finished executing.
+ */
+
+ while (srcu_readers_active_idx(sp, idx))
+ schedule_timeout_interruptible(1);
+
+ synchronize_sched(); /* Force memory barrier on all CPUs. */
+
+ /*
+ * The preceding synchronize_sched() forces all srcu_read_unlock()
+ * primitives that were executing concurrently with the preceding
+ * for_each_possible_cpu() loop to have completed by this point.
+ * More importantly, it also forces the corresponding SRCU read-side
+ * critical sections to have also completed, and the corresponding
+ * references to SRCU-protected data items to be dropped.
+ *
+ * Note:
+ *
+ * Despite what you might think at first glance, the
+ * preceding synchronize_sched() -must- be within the
+ * critical section ended by the following mutex_unlock().
+ * Otherwise, a task taking the early exit can race
+ * with a srcu_read_unlock(), which might have executed
+ * just before the preceding srcu_readers_active() check,
+ * and whose CPU might have reordered the srcu_read_unlock()
+ * with the preceding critical section. In this case, there
+ * is nothing preventing the synchronize_sched() task that is
+ * taking the early exit from freeing a data structure that
+ * is still being referenced (out of order) by the task
+ * doing the srcu_read_unlock().
+ *
+ * Alternatively, the comparison with "2" on the early exit
+ * could be changed to "3", but this increases synchronize_srcu()
+ * latency for bulk loads. So the current code is preferred.
+ */
+
+ mutex_unlock(&sp->mutex);
+}
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+
+long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->completed;
+}
+
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(srcu_read_lock);
+EXPORT_SYMBOL_GPL(srcu_read_unlock);
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+EXPORT_SYMBOL_GPL(srcu_readers_active);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111dbd..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
+/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#include <linux/sched.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 3f894775488d..98489d82801b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid);
*/
int C_A_D = 1;
-int cad_pid = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
/*
* Notifier list for kernel code which wants to be called
@@ -152,7 +153,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
/*
* Atomic notifier chain routines. Registration and unregistration
- * use a mutex, and call_chain is synchronized by RCU (no locks).
+ * use a spinlock, and call_chain is synchronized by RCU (no locks).
*/
/**
@@ -221,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* of the last notifier function called.
*/
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v)
{
int ret;
@@ -400,6 +401,129 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+/*
+ * SRCU notifier chain routines. Registration and unregistration
+ * use a mutex, and call_chain is synchronized by SRCU (no locks).
+ */
+
+/**
+ * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an SRCU notifier chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+
+int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_register(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
+
+/**
+ * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an SRCU notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_unregister(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ synchronize_srcu(&nh->srcu);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
+
+/**
+ * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ int ret;
+ int idx;
+
+ idx = srcu_read_lock(&nh->srcu);
+ ret = notifier_call_chain(&nh->head, val, v);
+ srcu_read_unlock(&nh->srcu, idx);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
+
+/**
+ * srcu_init_notifier_head - Initialize an SRCU notifier head
+ * @nh: Pointer to head of the srcu notifier chain
+ *
+ * Unlike other sorts of notifier heads, SRCU notifier heads require
+ * dynamic initialization. Be sure to call this routine before
+ * calling any of the other SRCU notifier routines for this head.
+ *
+ * If an SRCU notifier head is deallocated, it must first be cleaned
+ * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
+ * per-cpu data (used by the SRCU mechanism) will leak.
+ */
+
+void srcu_init_notifier_head(struct srcu_notifier_head *nh)
+{
+ mutex_init(&nh->mutex);
+ if (init_srcu_struct(&nh->srcu) < 0)
+ BUG();
+ nh->head = NULL;
+}
+
+EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+
/**
* register_reboot_notifier - Register function to be called at reboot time
* @nb: Info about notifier function to be called
@@ -607,12 +731,10 @@ static void kernel_restart_prepare(char *cmd)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (!cmd) {
+ if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
- } else {
+ else
printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
- }
- printk(".\n");
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -628,9 +750,8 @@ static void kernel_kexec(void)
#ifdef CONFIG_KEXEC
struct kimage *image;
image = xchg(&kexec_image, NULL);
- if (!image) {
+ if (!image)
return;
- }
kernel_restart_prepare(NULL);
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
@@ -776,10 +897,9 @@ void ctrl_alt_del(void)
if (C_A_D)
schedule_work(&cad_work);
else
- kill_proc(cad_pid, SIGINT, 1);
+ kill_cad_pid(SIGINT, 1);
}
-
/*
* Unprivileged users may change the real gid to the effective gid
* or vice versa. (BSD-style)
@@ -824,12 +944,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
(current->sgid == egid) ||
capable(CAP_SETGID))
new_egid = egid;
- else {
+ else
return -EPERM;
- }
}
- if (new_egid != old_egid)
- {
+ if (new_egid != old_egid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -858,19 +976,14 @@ asmlinkage long sys_setgid(gid_t gid)
if (retval)
return retval;
- if (capable(CAP_SETGID))
- {
- if(old_egid != gid)
- {
+ if (capable(CAP_SETGID)) {
+ if (old_egid != gid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->gid = current->egid = current->sgid = current->fsgid = gid;
- }
- else if ((gid == current->gid) || (gid == current->sgid))
- {
- if(old_egid != gid)
- {
+ } else if ((gid == current->gid) || (gid == current->sgid)) {
+ if (old_egid != gid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -901,8 +1014,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
switch_uid(new_user);
- if(dumpclear)
- {
+ if (dumpclear) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -958,8 +1070,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
return -EAGAIN;
- if (new_euid != old_euid)
- {
+ if (new_euid != old_euid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1009,8 +1120,7 @@ asmlinkage long sys_setuid(uid_t uid)
} else if ((uid != current->uid) && (uid != new_suid))
return -EPERM;
- if (old_euid != uid)
- {
+ if (old_euid != uid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1055,8 +1165,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
return -EAGAIN;
}
if (euid != (uid_t) -1) {
- if (euid != current->euid)
- {
+ if (euid != current->euid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1106,8 +1215,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
return -EPERM;
}
if (egid != (gid_t) -1) {
- if (egid != current->egid)
- {
+ if (egid != current->egid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1152,10 +1260,8 @@ asmlinkage long sys_setfsuid(uid_t uid)
if (uid == current->uid || uid == current->euid ||
uid == current->suid || uid == current->fsuid ||
- capable(CAP_SETUID))
- {
- if (uid != old_fsuid)
- {
+ capable(CAP_SETUID)) {
+ if (uid != old_fsuid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1183,10 +1289,8 @@ asmlinkage long sys_setfsgid(gid_t gid)
if (gid == current->gid || gid == current->egid ||
gid == current->sgid || gid == current->fsgid ||
- capable(CAP_SETGID))
- {
- if (gid != old_fsgid)
- {
+ capable(CAP_SETGID)) {
+ if (gid != old_fsgid) {
current->mm->dumpable = suid_dumpable;
smp_wmb();
}
@@ -1322,9 +1426,9 @@ out:
asmlinkage long sys_getpgid(pid_t pid)
{
- if (!pid) {
+ if (!pid)
return process_group(current);
- } else {
+ else {
int retval;
struct task_struct *p;
@@ -1354,9 +1458,9 @@ asmlinkage long sys_getpgrp(void)
asmlinkage long sys_getsid(pid_t pid)
{
- if (!pid) {
+ if (!pid)
return current->signal->session;
- } else {
+ else {
int retval;
struct task_struct *p;
@@ -1364,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
p = find_task_by_pid(pid);
retval = -ESRCH;
- if(p) {
+ if (p) {
retval = security_task_getsid(p);
if (!retval)
retval = p->signal->session;
@@ -1432,9 +1536,9 @@ struct group_info *groups_alloc(int gidsetsize)
group_info->nblocks = nblocks;
atomic_set(&group_info->usage, 1);
- if (gidsetsize <= NGROUPS_SMALL) {
+ if (gidsetsize <= NGROUPS_SMALL)
group_info->blocks[0] = group_info->small_block;
- } else {
+ else {
for (i = 0; i < nblocks; i++) {
gid_t *b;
b = (void *)__get_free_page(GFP_USER);
@@ -1490,7 +1594,7 @@ static int groups_to_user(gid_t __user *grouplist,
/* fill a group_info from a user-space array - it must be allocated already */
static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
- {
+{
int i;
int count = group_info->ngroups;
@@ -1648,9 +1752,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
int in_group_p(gid_t grp)
{
int retval = 1;
- if (grp != current->fsgid) {
+ if (grp != current->fsgid)
retval = groups_search(current->group_info, grp);
- }
return retval;
}
@@ -1659,9 +1762,8 @@ EXPORT_SYMBOL(in_group_p);
int in_egroup_p(gid_t grp)
{
int retval = 1;
- if (grp != current->egid) {
+ if (grp != current->egid)
retval = groups_search(current->group_info, grp);
- }
return retval;
}
@@ -1676,7 +1778,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
int errno = 0;
down_read(&uts_sem);
- if (copy_to_user(name,&system_utsname,sizeof *name))
+ if (copy_to_user(name, utsname(), sizeof *name))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
@@ -1694,8 +1796,8 @@ asmlinkage long sys_sethostname(char __user *name, int len)
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(system_utsname.nodename, tmp, len);
- system_utsname.nodename[len] = 0;
+ memcpy(utsname()->nodename, tmp, len);
+ utsname()->nodename[len] = 0;
errno = 0;
}
up_write(&uts_sem);
@@ -1711,11 +1813,11 @@ asmlinkage long sys_gethostname(char __user *name, int len)
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
- i = 1 + strlen(system_utsname.nodename);
+ i = 1 + strlen(utsname()->nodename);
if (i > len)
i = len;
errno = 0;
- if (copy_to_user(name, system_utsname.nodename, i))
+ if (copy_to_user(name, utsname()->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
@@ -1740,8 +1842,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(system_utsname.domainname, tmp, len);
- system_utsname.domainname[len] = 0;
+ memcpy(utsname()->domainname, tmp, len);
+ utsname()->domainname[len] = 0;
errno = 0;
}
up_write(&uts_sem);
@@ -1776,9 +1878,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
task_lock(current->group_leader);
x = current->signal->rlim[resource];
task_unlock(current->group_leader);
- if(x.rlim_cur > 0x7FFFFFFF)
+ if (x.rlim_cur > 0x7FFFFFFF)
x.rlim_cur = 0x7FFFFFFF;
- if(x.rlim_max > 0x7FFFFFFF)
+ if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
}
@@ -2084,12 +2186,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
* padding
*/
unsigned long t0, t1;
- get_user(t0, &cache->t0);
- get_user(t1, &cache->t1);
+ get_user(t0, &cache->blob[0]);
+ get_user(t1, &cache->blob[1]);
t0++;
t1++;
- put_user(t0, &cache->t0);
- put_user(t1, &cache->t1);
+ put_user(t0, &cache->blob[0]);
+ put_user(t1, &cache->blob[1]);
}
return err ? -EFAULT : 0;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67e8..0e53314b14de 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list);
cond_syscall(sys_epoll_create);
cond_syscall(sys_epoll_ctl);
cond_syscall(sys_epoll_wait);
+cond_syscall(sys_epoll_pwait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
@@ -134,3 +135,8 @@ cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
cond_syscall(compat_sys_move_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bcb3a181dbb2..8bff2c18fb5a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#endif
+
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
@@ -64,7 +68,6 @@ extern int sysrq_enabled;
extern int core_uses_pid;
extern int suid_dumpable;
extern char core_pattern[];
-extern int cad_pid;
extern int pid_max;
extern int min_free_kbytes;
extern int printk_ratelimit_jiffies;
@@ -74,13 +77,6 @@ extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
- void __user *, size_t *, loff_t *);
-#endif
-
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
@@ -95,13 +91,8 @@ extern char modprobe_path[];
extern int sg_big_buff;
#endif
#ifdef CONFIG_SYSVIPC
-extern size_t shm_ctlmax;
-extern size_t shm_ctlall;
-extern int shm_ctlmni;
-extern int msg_ctlmax;
-extern int msg_ctlmnb;
-extern int msg_ctlmni;
-extern int sem_ctls[];
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
#ifdef __sparc__
@@ -137,11 +128,19 @@ extern int no_unaligned_warning;
extern int max_lock_depth;
#endif
-static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
- ctl_table *, void **);
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+#ifdef CONFIG_SYSCTL_SYSCALL
+static int parse_table(int __user *, int, void __user *, size_t __user *,
+ void __user *, size_t, ctl_table *, void **);
+#endif
+
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_PROC_SYSCTL
+static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
static ctl_table root_table[];
static struct ctl_table_header root_table_header =
{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
@@ -165,7 +164,7 @@ int sysctl_legacy_va_layout;
/* /proc declarations: */
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -229,51 +228,100 @@ static ctl_table root_table[] = {
};
static ctl_table kern_table[] = {
+#ifndef CONFIG_UTS_NS
+ {
+ .ctl_name = KERN_OSTYPE,
+ .procname = "ostype",
+ .data = init_uts_ns.name.sysname,
+ .maxlen = sizeof(init_uts_ns.name.sysname),
+ .mode = 0444,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = &sysctl_string,
+ },
+ {
+ .ctl_name = KERN_OSRELEASE,
+ .procname = "osrelease",
+ .data = init_uts_ns.name.release,
+ .maxlen = sizeof(init_uts_ns.name.release),
+ .mode = 0444,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = &sysctl_string,
+ },
+ {
+ .ctl_name = KERN_VERSION,
+ .procname = "version",
+ .data = init_uts_ns.name.version,
+ .maxlen = sizeof(init_uts_ns.name.version),
+ .mode = 0444,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = &sysctl_string,
+ },
+ {
+ .ctl_name = KERN_NODENAME,
+ .procname = "hostname",
+ .data = init_uts_ns.name.nodename,
+ .maxlen = sizeof(init_uts_ns.name.nodename),
+ .mode = 0644,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = &sysctl_string,
+ },
+ {
+ .ctl_name = KERN_DOMAINNAME,
+ .procname = "domainname",
+ .data = init_uts_ns.name.domainname,
+ .maxlen = sizeof(init_uts_ns.name.domainname),
+ .mode = 0644,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = &sysctl_string,
+ },
+#else /* !CONFIG_UTS_NS */
{
.ctl_name = KERN_OSTYPE,
.procname = "ostype",
- .data = system_utsname.sysname,
- .maxlen = sizeof(system_utsname.sysname),
+ .data = NULL,
+ /* could maybe use __NEW_UTS_LEN here? */
+ .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
.mode = 0444,
- .proc_handler = &proc_doutsstring,
+ .proc_handler = &proc_do_uts_string,
.strategy = &sysctl_string,
},
{
.ctl_name = KERN_OSRELEASE,
.procname = "osrelease",
- .data = system_utsname.release,
- .maxlen = sizeof(system_utsname.release),
+ .data = NULL,
+ .maxlen = FIELD_SIZEOF(struct new_utsname, release),
.mode = 0444,
- .proc_handler = &proc_doutsstring,
+ .proc_handler = &proc_do_uts_string,
.strategy = &sysctl_string,
},
{
.ctl_name = KERN_VERSION,
.procname = "version",
- .data = system_utsname.version,
- .maxlen = sizeof(system_utsname.version),
+ .data = NULL,
+ .maxlen = FIELD_SIZEOF(struct new_utsname, version),
.mode = 0444,
- .proc_handler = &proc_doutsstring,
+ .proc_handler = &proc_do_uts_string,
.strategy = &sysctl_string,
},
{
.ctl_name = KERN_NODENAME,
.procname = "hostname",
- .data = system_utsname.nodename,
- .maxlen = sizeof(system_utsname.nodename),
+ .data = NULL,
+ .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
.mode = 0644,
- .proc_handler = &proc_doutsstring,
+ .proc_handler = &proc_do_uts_string,
.strategy = &sysctl_string,
},
{
.ctl_name = KERN_DOMAINNAME,
.procname = "domainname",
- .data = system_utsname.domainname,
- .maxlen = sizeof(system_utsname.domainname),
+ .data = NULL,
+ .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
.mode = 0644,
- .proc_handler = &proc_doutsstring,
+ .proc_handler = &proc_do_uts_string,
.strategy = &sysctl_string,
},
+#endif /* !CONFIG_UTS_NS */
{
.ctl_name = KERN_PANIC,
.procname = "panic",
@@ -294,7 +342,7 @@ static ctl_table kern_table[] = {
.ctl_name = KERN_CORE_PATTERN,
.procname = "core_pattern",
.data = core_pattern,
- .maxlen = 64,
+ .maxlen = 128,
.mode = 0644,
.proc_handler = &proc_dostring,
.strategy = &sysctl_string,
@@ -432,58 +480,58 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_SHMMAX,
.procname = "shmmax",
- .data = &shm_ctlmax,
+ .data = NULL,
.maxlen = sizeof (size_t),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_SHMALL,
.procname = "shmall",
- .data = &shm_ctlall,
+ .data = NULL,
.maxlen = sizeof (size_t),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_SHMMNI,
.procname = "shmmni",
- .data = &shm_ctlmni,
+ .data = NULL,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_MSGMAX,
.procname = "msgmax",
- .data = &msg_ctlmax,
+ .data = NULL,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_MSGMNI,
.procname = "msgmni",
- .data = &msg_ctlmni,
+ .data = NULL,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_MSGMNB,
.procname = "msgmnb",
- .data = &msg_ctlmnb,
+ .data = NULL,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_ipc_string,
},
{
.ctl_name = KERN_SEM,
.procname = "sem",
- .data = &sem_ctls,
+ .data = NULL,
.maxlen = 4*sizeof (int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_ipc_string,
},
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -496,14 +544,16 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_PROC_SYSCTL
{
.ctl_name = KERN_CADPID,
.procname = "cad_pid",
- .data = &cad_pid,
+ .data = NULL,
.maxlen = sizeof (int),
.mode = 0600,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_do_cad_pid,
},
+#endif
{
.ctl_name = KERN_MAX_THREADS,
.procname = "threads-max",
@@ -1166,12 +1216,13 @@ static void start_unregistering(struct ctl_table_header *p)
void __init sysctl_init(void)
{
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
register_proc_table(root_table, proc_sys_root, &root_table_header);
init_irq_proc();
#endif
}
+#ifdef CONFIG_SYSCTL_SYSCALL
int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
{
@@ -1225,6 +1276,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
unlock_kernel();
return error;
}
+#endif /* CONFIG_SYSCTL_SYSCALL */
/*
* ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1251,6 +1303,7 @@ static inline int ctl_perm(ctl_table *table, int op)
return test_perm(table->mode, op);
}
+#ifdef CONFIG_SYSCTL_SYSCALL
static int parse_table(int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen,
@@ -1340,6 +1393,7 @@ int do_sysctl_strategy (ctl_table *table,
}
return 0;
}
+#endif /* CONFIG_SYSCTL_SYSCALL */
/**
* register_sysctl_table - register a sysctl hierarchy
@@ -1427,7 +1481,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
else
list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
spin_unlock(&sysctl_lock);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
register_proc_table(table, proc_sys_root, tmp);
#endif
return tmp;
@@ -1445,18 +1499,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
might_sleep();
spin_lock(&sysctl_lock);
start_unregistering(header);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
unregister_proc_table(header->ctl_table, proc_sys_root);
#endif
spin_unlock(&sysctl_lock);
kfree(header);
}
+#else /* !CONFIG_SYSCTL */
+struct ctl_table_header * register_sysctl_table(ctl_table * table,
+ int insert_at_head)
+{
+ return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
/*
* /proc/sys support
*/
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
/* Scan the sysctl entries in table and add them all into /proc */
static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -1607,32 +1674,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf,
return do_rw_proc(1, file, (char __user *) buf, count, ppos);
}
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int _proc_do_string(void* data, int maxlen, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
size_t len;
char __user *p;
char c;
- if (!table->data || !table->maxlen || !*lenp ||
+ if (!data || !maxlen || !*lenp ||
(*ppos && !write)) {
*lenp = 0;
return 0;
@@ -1648,20 +1698,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
break;
len++;
}
- if (len >= table->maxlen)
- len = table->maxlen-1;
- if(copy_from_user(table->data, buffer, len))
+ if (len >= maxlen)
+ len = maxlen-1;
+ if(copy_from_user(data, buffer, len))
return -EFAULT;
- ((char *) table->data)[len] = 0;
+ ((char *) data)[len] = 0;
*ppos += *lenp;
} else {
- len = strlen(table->data);
- if (len > table->maxlen)
- len = table->maxlen;
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
if (len > *lenp)
len = *lenp;
if (len)
- if(copy_to_user(buffer, table->data, len))
+ if(copy_to_user(buffer, data, len))
return -EFAULT;
if (len < *lenp) {
if(put_user('\n', ((char __user *) buffer) + len))
@@ -1674,12 +1724,38 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
return 0;
}
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return _proc_do_string(table->data, table->maxlen, write, filp,
+ buffer, lenp, ppos);
+}
+
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+#ifndef CONFIG_UTS_NS
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int r;
@@ -1695,6 +1771,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
}
return r;
}
+#else /* !CONFIG_UTS_NS */
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int r;
+ struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
+ char* which;
+
+ switch (table->ctl_name) {
+ case KERN_OSTYPE:
+ which = uts_ns->name.sysname;
+ break;
+ case KERN_NODENAME:
+ which = uts_ns->name.nodename;
+ break;
+ case KERN_OSRELEASE:
+ which = uts_ns->name.release;
+ break;
+ case KERN_VERSION:
+ which = uts_ns->name.version;
+ break;
+ case KERN_DOMAINNAME:
+ which = uts_ns->name.domainname;
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (!write) {
+ down_read(&uts_sem);
+ r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
+ up_read(&uts_sem);
+ } else {
+ down_write(&uts_sem);
+ r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
+ up_write(&uts_sem);
+ }
+ out:
+ return r;
+}
+#endif /* !CONFIG_UTS_NS */
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -1715,8 +1833,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
return 0;
}
-static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos,
+static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
+ int write, struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
int write, void *data),
void *data)
@@ -1729,13 +1848,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
char buf[TMPBUFLEN], *p;
char __user *s = buffer;
- if (!table->data || !table->maxlen || !*lenp ||
+ if (!tbl_data || !table->maxlen || !*lenp ||
(*ppos && !write)) {
*lenp = 0;
return 0;
}
- i = (int *) table->data;
+ i = (int *) tbl_data;
vleft = table->maxlen / sizeof(*i);
left = *lenp;
@@ -1824,6 +1943,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
#undef TMPBUFLEN
}
+static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write, filp,
+ buffer, lenp, ppos, conv, data);
+}
+
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
@@ -1895,7 +2024,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
return -EPERM;
}
- op = (current->pid == 1) ? OP_SET : OP_AND;
+ op = is_init(current) ? OP_SET : OP_AND;
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
do_proc_dointvec_bset_conv,&op);
}
@@ -1957,7 +2086,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
do_proc_dointvec_minmax_conv, &param);
}
-static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
struct file *filp,
void __user *buffer,
size_t *lenp, loff_t *ppos,
@@ -1971,13 +2100,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
char buf[TMPBUFLEN], *p;
char __user *s = buffer;
- if (!table->data || !table->maxlen || !*lenp ||
+ if (!data || !table->maxlen || !*lenp ||
(*ppos && !write)) {
*lenp = 0;
return 0;
}
- i = (unsigned long *) table->data;
+ i = (unsigned long *) data;
min = (unsigned long *) table->extra1;
max = (unsigned long *) table->extra2;
vleft = table->maxlen / sizeof(unsigned long);
@@ -2062,6 +2191,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
#undef TMPBUFLEN
}
+static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ filp, buffer, lenp, ppos, convmul, convdiv);
+}
+
/**
* proc_doulongvec_minmax - read a vector of long integers with min/max values
* @table: the sysctl table
@@ -2250,6 +2390,71 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
do_proc_dointvec_ms_jiffies_conv, NULL);
}
+#ifdef CONFIG_SYSVIPC
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ void *data;
+ struct ipc_namespace *ns;
+
+ ns = current->nsproxy->ipc_ns;
+
+ switch (table->ctl_name) {
+ case KERN_SHMMAX:
+ data = &ns->shm_ctlmax;
+ goto proc_minmax;
+ case KERN_SHMALL:
+ data = &ns->shm_ctlall;
+ goto proc_minmax;
+ case KERN_SHMMNI:
+ data = &ns->shm_ctlmni;
+ break;
+ case KERN_MSGMAX:
+ data = &ns->msg_ctlmax;
+ break;
+ case KERN_MSGMNI:
+ data = &ns->msg_ctlmni;
+ break;
+ case KERN_MSGMNB:
+ data = &ns->msg_ctlmnb;
+ break;
+ case KERN_SEM:
+ data = &ns->sem_ctls;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return __do_proc_dointvec(data, table, write, filp, buffer,
+ lenp, ppos, NULL, NULL);
+proc_minmax:
+ return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+ lenp, ppos, 1l, 1l);
+}
+#endif
+
+static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_nr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
#else /* CONFIG_PROC_FS */
int proc_dostring(ctl_table *table, int write, struct file *filp,
@@ -2258,12 +2463,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
return -ENOSYS;
}
-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
+#ifdef CONFIG_SYSVIPC
+static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+#endif
+
int proc_dointvec(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2318,6 +2531,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_SYSCTL_SYSCALL
/*
* General sysctl support routines
*/
@@ -2460,11 +2674,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return 1;
}
-#else /* CONFIG_SYSCTL */
+#else /* CONFIG_SYSCTL_SYSCALL */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
+ static int msg_count;
+
+ if (msg_count < 5) {
+ msg_count++;
+ printk(KERN_INFO
+ "warning: process `%s' used the removed sysctl "
+ "system call\n", current->comm);
+ }
return -ENOSYS;
}
@@ -2496,73 +2718,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return -ENOSYS;
}
-int proc_dostring(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
- struct file *filp,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-struct ctl_table_header * register_sysctl_table(ctl_table * table,
- int insert_at_head)
-{
- return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SYSCTL_SYSCALL */
/*
* No sense putting this after each symbol definition, twice,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2ed4040d0dc5..5d6a8c54ee85 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,7 +18,9 @@
#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
+#include <linux/tsacct_kern.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
@@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
/*
* If new attributes are added, please revisit this allocation
*/
- skb = nlmsg_new(size, GFP_KERNEL);
+ skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
if (!skb)
return -ENOMEM;
@@ -198,7 +200,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
*/
delayacct_add_tsk(stats, tsk);
+
+ /* fill in basic acct fields */
stats->version = TASKSTATS_VERSION;
+ bacct_add_tsk(stats, tsk);
+
+ /* fill in extended acct fields */
+ xacct_add_tsk(stats, tsk);
/* Define err: label here if needed */
put_task_struct(tsk);
diff --git a/kernel/time.c b/kernel/time.c
index 5bd489747643..0e017bff4c19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-/* we call this to notify the arch when the clock is being
- * controlled. If no such arch routine, do nothing.
- */
-void __attribute__ ((weak)) notify_arch_cmos_timer(void)
-{
- return;
-}
-
-/* adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
- */
-int do_adjtimex(struct timex *txc)
-{
- long ltemp, mtemp, save_adjust;
- int result;
-
- /* In order to modify anything, you gotta be super-user! */
- if (txc->modes && !capable(CAP_SYS_TIME))
- return -EPERM;
-
- /* Now we validate the data before disabling interrupts */
-
- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
- /* singleshot must not be used with any other mode bits */
- if (txc->modes != ADJ_OFFSET_SINGLESHOT)
- return -EINVAL;
-
- if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
- /* adjustment Offset limited to +- .512 seconds */
- if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
- return -EINVAL;
-
- /* if the quartz is off by more than 10% something is VERY wrong ! */
- if (txc->modes & ADJ_TICK)
- if (txc->tick < 900000/USER_HZ ||
- txc->tick > 1100000/USER_HZ)
- return -EINVAL;
-
- write_seqlock_irq(&xtime_lock);
- result = time_state; /* mostly `TIME_OK' */
-
- /* Save for later - semantics of adjtime is to return old value */
- save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
-
-#if 0 /* STA_CLOCKERR is never set yet */
- time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
-#endif
- /* If there are input parameters, then process them */
- if (txc->modes)
- {
- if (txc->modes & ADJ_STATUS) /* only set allowed bits */
- time_status = (txc->status & ~STA_RONLY) |
- (time_status & STA_RONLY);
-
- if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
- if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
- result = -EINVAL;
- goto leave;
- }
- time_freq = txc->freq;
- }
-
- if (txc->modes & ADJ_MAXERROR) {
- if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
- result = -EINVAL;
- goto leave;
- }
- time_maxerror = txc->maxerror;
- }
-
- if (txc->modes & ADJ_ESTERROR) {
- if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
- result = -EINVAL;
- goto leave;
- }
- time_esterror = txc->esterror;
- }
-
- if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
- if (txc->constant < 0) { /* NTP v4 uses values > 6 */
- result = -EINVAL;
- goto leave;
- }
- time_constant = txc->constant;
- }
-
- if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
- if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
- /* adjtime() is independent from ntp_adjtime() */
- if ((time_next_adjust = txc->offset) == 0)
- time_adjust = 0;
- }
- else if (time_status & STA_PLL) {
- ltemp = txc->offset;
-
- /*
- * Scale the phase adjustment and
- * clamp to the operating range.
- */
- if (ltemp > MAXPHASE)
- time_offset = MAXPHASE << SHIFT_UPDATE;
- else if (ltemp < -MAXPHASE)
- time_offset = -(MAXPHASE << SHIFT_UPDATE);
- else
- time_offset = ltemp << SHIFT_UPDATE;
-
- /*
- * Select whether the frequency is to be controlled
- * and in which mode (PLL or FLL). Clamp to the operating
- * range. Ugly multiply/divide should be replaced someday.
- */
-
- if (time_status & STA_FREQHOLD || time_reftime == 0)
- time_reftime = xtime.tv_sec;
- mtemp = xtime.tv_sec - time_reftime;
- time_reftime = xtime.tv_sec;
- if (time_status & STA_FLL) {
- if (mtemp >= MINSEC) {
- ltemp = (time_offset / mtemp) << (SHIFT_USEC -
- SHIFT_UPDATE);
- time_freq += shift_right(ltemp, SHIFT_KH);
- } else /* calibration interval too short (p. 12) */
- result = TIME_ERROR;
- } else { /* PLL mode */
- if (mtemp < MAXSEC) {
- ltemp *= mtemp;
- time_freq += shift_right(ltemp,(time_constant +
- time_constant +
- SHIFT_KF - SHIFT_USEC));
- } else /* calibration interval too long (p. 12) */
- result = TIME_ERROR;
- }
- time_freq = min(time_freq, time_tolerance);
- time_freq = max(time_freq, -time_tolerance);
- } /* STA_PLL */
- } /* txc->modes & ADJ_OFFSET */
- if (txc->modes & ADJ_TICK) {
- tick_usec = txc->tick;
- tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
- }
- } /* txc->modes */
-leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
- result = TIME_ERROR;
-
- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
- txc->offset = save_adjust;
- else {
- txc->offset = shift_right(time_offset, SHIFT_UPDATE);
- }
- txc->freq = time_freq;
- txc->maxerror = time_maxerror;
- txc->esterror = time_esterror;
- txc->status = time_status;
- txc->constant = time_constant;
- txc->precision = time_precision;
- txc->tolerance = time_tolerance;
- txc->tick = tick_usec;
-
- /* PPS is not implemented, so these are zero */
- txc->ppsfreq = 0;
- txc->jitter = 0;
- txc->shift = 0;
- txc->stabil = 0;
- txc->jitcnt = 0;
- txc->calcnt = 0;
- txc->errcnt = 0;
- txc->stbcnt = 0;
- write_sequnlock_irq(&xtime_lock);
- do_gettimeofday(&txc->time);
- notify_arch_cmos_timer();
- return(result);
-}
-
asmlinkage long sys_adjtimex(struct timex __user *txc_p)
{
struct timex txc; /* Local copy of parameter */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e1dfd8e86cce..61a3907d16fb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1 @@
-obj-y += clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 126bb30c4afe..a99b2a6e6a07 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -57,7 +57,7 @@ static cycle_t jiffies_read(void)
struct clocksource clocksource_jiffies = {
.name = "jiffies",
- .rating = 0, /* lowest rating*/
+ .rating = 1, /* lowest valid rating*/
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 000000000000..47195fa0ec4f
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,350 @@
+/*
+ * linux/kernel/time/ntp.c
+ *
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+#include <asm/div64.h>
+#include <asm/timex.h>
+
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
+unsigned long tick_nsec; /* ACTHZ period (nsec) */
+static u64 tick_length, tick_length_base;
+
+#define MAX_TICKADJ 500 /* microsecs */
+#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
+ TICK_LENGTH_SHIFT) / HZ)
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+static int time_state = TIME_OK; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+static long time_offset; /* time adjustment (ns) */
+static long time_constant = 2; /* pll time constant */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
+long time_freq; /* frequency offset (scaled ppm)*/
+static long time_reftime; /* time at last adjustment (s) */
+long time_adjust;
+
+#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
+ (s64)CLOCK_TICK_RATE)
+
+static void ntp_update_frequency(void)
+{
+ tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+ tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+
+ do_div(tick_length_base, HZ);
+
+ tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+void ntp_clear(void)
+{
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
+
+ ntp_update_frequency();
+
+ tick_length = tick_length_base;
+ time_offset = 0;
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+ long time_adj;
+
+ /* Bump the maxerror field */
+ time_maxerror += MAXFREQ >> SHIFT_USEC;
+ if (time_maxerror > NTP_PHASE_LIMIT) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second. The microtime()
+ * routine or external clock driver will insure that reported time is
+ * always monotonic. The ugly divides should be replaced.
+ */
+ switch (time_state) {
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+ case TIME_INS:
+ if (xtime.tv_sec % 86400 == 0) {
+ xtime.tv_sec--;
+ wall_to_monotonic.tv_sec++;
+ /*
+ * The timer interpolator will make time change
+ * gradually instead of an immediate jump by one second
+ */
+ time_interpolator_update(-NSEC_PER_SEC);
+ time_state = TIME_OOP;
+ clock_was_set();
+ printk(KERN_NOTICE "Clock: inserting leap second "
+ "23:59:60 UTC\n");
+ }
+ break;
+ case TIME_DEL:
+ if ((xtime.tv_sec + 1) % 86400 == 0) {
+ xtime.tv_sec++;
+ wall_to_monotonic.tv_sec--;
+ /*
+ * Use of time interpolator for a gradual change of
+ * time
+ */
+ time_interpolator_update(NSEC_PER_SEC);
+ time_state = TIME_WAIT;
+ clock_was_set();
+ printk(KERN_NOTICE "Clock: deleting leap second "
+ "23:59:59 UTC\n");
+ }
+ break;
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. The offset is
+ * reduced by a fixed factor times the time constant.
+ */
+ tick_length = tick_length_base;
+ time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
+ time_offset -= time_adj;
+ tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
+
+ if (unlikely(time_adjust)) {
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ } else if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ } else {
+ time_adjust = 0;
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC /
+ HZ) << TICK_LENGTH_SHIFT;
+ }
+ }
+}
+
+/*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+ return tick_length;
+}
+
+
+void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+{
+ return;
+}
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+ long ltemp, mtemp, save_adjust;
+ s64 freq_adj, temp64;
+ int result;
+
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /* Now we validate the data before disabling interrupts */
+
+ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+ /* singleshot must not be used with any other mode bits */
+ if (txc->modes != ADJ_OFFSET_SINGLESHOT)
+ return -EINVAL;
+
+ if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+ /* adjustment Offset limited to +- .512 seconds */
+ if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+ return -EINVAL;
+
+ /* if the quartz is off by more than 10% something is VERY wrong ! */
+ if (txc->modes & ADJ_TICK)
+ if (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ)
+ return -EINVAL;
+
+ write_seqlock_irq(&xtime_lock);
+ result = time_state; /* mostly `TIME_OK' */
+
+ /* Save for later - semantics of adjtime is to return old value */
+ save_adjust = time_adjust;
+
+#if 0 /* STA_CLOCKERR is never set yet */
+ time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
+#endif
+ /* If there are input parameters, then process them */
+ if (txc->modes)
+ {
+ if (txc->modes & ADJ_STATUS) /* only set allowed bits */
+ time_status = (txc->status & ~STA_RONLY) |
+ (time_status & STA_RONLY);
+
+ if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
+ if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
+ }
+
+ if (txc->modes & ADJ_MAXERROR) {
+ if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_maxerror = txc->maxerror;
+ }
+
+ if (txc->modes & ADJ_ESTERROR) {
+ if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_esterror = txc->esterror;
+ }
+
+ if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
+ if (txc->constant < 0) { /* NTP v4 uses values > 6 */
+ result = -EINVAL;
+ goto leave;
+ }
+ time_constant = min(txc->constant + 4, (long)MAXTC);
+ }
+
+ if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
+ if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ }
+ else if (time_status & STA_PLL) {
+ ltemp = txc->offset * NSEC_PER_USEC;
+
+ /*
+ * Scale the phase adjustment and
+ * clamp to the operating range.
+ */
+ time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
+ time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
+
+ /*
+ * Select whether the frequency is to be controlled
+ * and in which mode (PLL or FLL). Clamp to the operating
+ * range. Ugly multiply/divide should be replaced someday.
+ */
+
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = xtime.tv_sec;
+ mtemp = xtime.tv_sec - time_reftime;
+ time_reftime = xtime.tv_sec;
+
+ freq_adj = (s64)time_offset * mtemp;
+ freq_adj = shift_right(freq_adj, time_constant * 2 +
+ (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
+ if (time_offset < 0) {
+ temp64 = -temp64;
+ do_div(temp64, mtemp);
+ freq_adj -= temp64;
+ } else {
+ do_div(temp64, mtemp);
+ freq_adj += temp64;
+ }
+ }
+ freq_adj += time_freq;
+ freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
+ time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+ time_offset = (time_offset / HZ) << SHIFT_UPDATE;
+ } /* STA_PLL */
+ } /* txc->modes & ADJ_OFFSET */
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+ } /* txc->modes */
+leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
+ result = TIME_ERROR;
+
+ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+ txc->offset = save_adjust;
+ else
+ txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
+ txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = 1;
+ txc->tolerance = MAXFREQ;
+ txc->tick = tick_usec;
+
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+ write_sequnlock_irq(&xtime_lock);
+ do_gettimeofday(&txc->time);
+ notify_arch_cmos_timer();
+ return(result);
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2d..c1c7fbcffec1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -41,12 +41,6 @@
#include <asm/timex.h>
#include <asm/io.h>
-#ifdef CONFIG_TIME_INTERPOLATION
-static void time_interpolator_update(long delta_nsec);
-#else
-#define time_interpolator_update(x)
-#endif
-
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
@@ -136,7 +130,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
-/***
+/**
* init_timer - initialize a timer.
* @timer: the timer to be initialized
*
@@ -175,6 +169,7 @@ static inline void detach_timer(struct timer_list *timer,
*/
static tvec_base_t *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
+ __acquires(timer->base->lock)
{
tvec_base_t *base;
@@ -235,7 +230,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(__mod_timer);
-/***
+/**
* add_timer_on - start a timer on a particular CPU
* @timer: the timer to be added
* @cpu: the CPU to start it on
@@ -255,9 +250,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
-/***
+/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
+ * @expires: new timeout in jiffies
*
* mod_timer is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
@@ -291,7 +287,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer);
-/***
+/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
*
@@ -323,7 +319,10 @@ int del_timer(struct timer_list *timer)
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
-/*
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
*
@@ -351,7 +350,7 @@ out:
return ret;
}
-/***
+/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
@@ -401,15 +400,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
return index;
}
-/***
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
*
* This function cascades all vectors and executes all expired timer
* vectors.
*/
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
static inline void __run_timers(tvec_base_t *base)
{
struct timer_list *timer;
@@ -563,12 +562,6 @@ found:
/******************************************************************/
-/*
- * Timekeeping variables
- */
-unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
-unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
-
/*
* The current time
* wall_to_monotonic is what we need to add to xtime (or xtime corrected
@@ -582,209 +575,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
EXPORT_SYMBOL(xtime);
-/* Don't completely fail for HZ > 500. */
-int tickadj = 500/HZ ? : 1; /* microsecs */
-
-
-/*
- * phase-lock loop variables
- */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK; /* clock synchronization status */
-int time_status = STA_UNSYNC; /* clock status bits */
-long time_offset; /* time adjustment (us) */
-long time_constant = 2; /* pll time constant */
-long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
-long time_precision = 1; /* clock precision (us) */
-long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
-long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
- /* frequency offset (scaled ppm)*/
-static long time_adj; /* tick adjust (scaled 1 / HZ) */
-long time_reftime; /* time at last adjustment (s) */
-long time_adjust;
-long time_next_adjust;
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- *
- */
-static void second_overflow(void)
-{
- long ltemp;
-
- /* Bump the maxerror field */
- time_maxerror += time_tolerance >> SHIFT_USEC;
- if (time_maxerror > NTP_PHASE_LIMIT) {
- time_maxerror = NTP_PHASE_LIMIT;
- time_status |= STA_UNSYNC;
- }
-
- /*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second. The microtime()
- * routine or external clock driver will insure that reported time is
- * always monotonic. The ugly divides should be replaced.
- */
- switch (time_state) {
- case TIME_OK:
- if (time_status & STA_INS)
- time_state = TIME_INS;
- else if (time_status & STA_DEL)
- time_state = TIME_DEL;
- break;
- case TIME_INS:
- if (xtime.tv_sec % 86400 == 0) {
- xtime.tv_sec--;
- wall_to_monotonic.tv_sec++;
- /*
- * The timer interpolator will make time change
- * gradually instead of an immediate jump by one second
- */
- time_interpolator_update(-NSEC_PER_SEC);
- time_state = TIME_OOP;
- clock_was_set();
- printk(KERN_NOTICE "Clock: inserting leap second "
- "23:59:60 UTC\n");
- }
- break;
- case TIME_DEL:
- if ((xtime.tv_sec + 1) % 86400 == 0) {
- xtime.tv_sec++;
- wall_to_monotonic.tv_sec--;
- /*
- * Use of time interpolator for a gradual change of
- * time
- */
- time_interpolator_update(NSEC_PER_SEC);
- time_state = TIME_WAIT;
- clock_was_set();
- printk(KERN_NOTICE "Clock: deleting leap second "
- "23:59:59 UTC\n");
- }
- break;
- case TIME_OOP:
- time_state = TIME_WAIT;
- break;
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- }
-
- /*
- * Compute the phase adjustment for the next second. In PLL mode, the
- * offset is reduced by a fixed factor times the time constant. In FLL
- * mode the offset is used directly. In either mode, the maximum phase
- * adjustment for each second is clamped so as to spread the adjustment
- * over not more than the number of seconds between updates.
- */
- ltemp = time_offset;
- if (!(time_status & STA_FLL))
- ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
- ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
- ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
- time_offset -= ltemp;
- time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
- /*
- * Compute the frequency estimate and additional phase adjustment due
- * to frequency error for the next second.
- */
- ltemp = time_freq;
- time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
-
-#if HZ == 100
- /*
- * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
- * get 128.125; => only 0.125% error (p. 14)
- */
- time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
- /*
- * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
- * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
- */
- time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
- /*
- * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
- * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
- */
- time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
- long time_adjust_step;
-
- time_adjust_step = time_adjust;
- if (time_adjust_step) {
- /*
- * We are doing an adjtime thing. Prepare time_adjust_step to
- * be within bounds. Note that a positive time_adjust means we
- * want the clock to run faster.
- *
- * Limit the amount of the step to be in the range
- * -tickadj .. +tickadj
- */
- time_adjust_step = min(time_adjust_step, (long)tickadj);
- time_adjust_step = max(time_adjust_step, (long)-tickadj);
- }
- return time_adjust_step;
-}
-
-/* in the NTP reference this is called "hardclock()" */
-static void update_ntp_one_tick(void)
-{
- long time_adjust_step;
-
- time_adjust_step = adjtime_adjustment();
- if (time_adjust_step)
- /* Reduce by this step the amount of time left */
- time_adjust -= time_adjust_step;
-
- /* Changes by adjtime() do not take effect till next tick. */
- if (time_next_adjust != 0) {
- time_adjust = time_next_adjust;
- time_next_adjust = 0;
- }
-}
-
-/*
- * Return how long ticks are at the moment, that is, how much time
- * update_wall_time_one_tick will add to xtime next time we call it
- * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds shifted by the
- * specified number of bits to the right of the binary point.
- * This function has no side-effects.
- */
-u64 current_tick_length(void)
-{
- long delta_nsec;
- u64 ret;
-
- /* calculate the finest interval NTP will allow.
- * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
- */
- delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
- ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
- ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
-
- return ret;
-}
/* XXX - all of this timekeeping code should be later moved to time.c */
#include <linux/clocksource.h>
@@ -961,21 +751,24 @@ void __init timekeeping_init(void)
unsigned long flags;
write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_clear();
+
clock = clocksource_get_next();
clocksource_calculate_interval(clock, tick_nsec);
clock->cycle_last = clocksource_read(clock);
- ntp_clear();
+
write_sequnlock_irqrestore(&xtime_lock, flags);
}
static int timekeeping_suspended;
-/*
+/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
* @dev: unused
*
* This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
static int timekeeping_resume(struct sys_device *dev)
@@ -1106,7 +899,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
}
-/*
+/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1144,8 +937,6 @@ static void update_wall_time(void)
/* interpolator bits */
time_interpolator_update(clock->xtime_interval
>> clock->shift);
- /* increment the NTP state machine */
- update_ntp_one_tick();
/* accumulate error between NTP and clock interval */
clock->error += current_tick_length();
@@ -1217,19 +1008,14 @@ static inline void calc_load(unsigned long ticks)
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
- count -= ticks;
- if (count < 0) {
- count += LOAD_FREQ;
- active_tasks = count_active_tasks();
+ active_tasks = count_active_tasks();
+ for (count -= ticks; count < 0; count += LOAD_FREQ) {
CALC_LOAD(avenrun[0], EXP_1, active_tasks);
CALC_LOAD(avenrun[1], EXP_5, active_tasks);
CALC_LOAD(avenrun[2], EXP_15, active_tasks);
}
}
-/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies = INITIAL_JIFFIES;
-
/*
* This read-write spinlock protects us from races in SMP while
* playing with xtime and avenrun.
@@ -1265,12 +1051,8 @@ void run_local_timers(void)
* Called by the timer interrupt. xtime_lock must already be taken
* by the timer IRQ!
*/
-static inline void update_times(void)
+static inline void update_times(unsigned long ticks)
{
- unsigned long ticks;
-
- ticks = jiffies - wall_jiffies;
- wall_jiffies += ticks;
update_wall_time();
calc_load(ticks);
}
@@ -1281,12 +1063,10 @@ static inline void update_times(void)
* jiffies is defined in the linker script...
*/
-void do_timer(struct pt_regs *regs)
+void do_timer(unsigned long ticks)
{
- jiffies_64++;
- /* prevent loading jiffies before storing new jiffies_64 value. */
- barrier();
- update_times();
+ jiffies_64 += ticks;
+ update_times(ticks);
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1470,8 +1250,9 @@ asmlinkage long sys_gettid(void)
return current->pid;
}
-/*
+/**
* sys_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
*/
asmlinkage long sys_sysinfo(struct sysinfo __user *info)
{
@@ -1688,8 +1469,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
void __init init_timers(void)
{
- timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+
+ BUG_ON(err == NOTIFY_BAD);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
}
@@ -1774,7 +1557,7 @@ unsigned long time_interpolator_get_offset(void)
#define INTERPOLATOR_ADJUST 65536
#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
-static void time_interpolator_update(long delta_nsec)
+void time_interpolator_update(long delta_nsec)
{
u64 counter;
unsigned long offset;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 000000000000..db443221ba5b
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,124 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan, <jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+#include <linux/jiffies.h>
+
+
+#define USEC_PER_TICK (USEC_PER_SEC/HZ)
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ struct timespec uptime, ts;
+ s64 ac_etime;
+
+ BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+ /* calculate task elapsed time in timespec */
+ do_posix_clock_monotonic_gettime(&uptime);
+ ts = timespec_sub(uptime, current->group_leader->start_time);
+ /* rebase elapsed time to usec */
+ ac_etime = timespec_to_ns(&ts);
+ do_div(ac_etime, NSEC_PER_USEC);
+ stats->ac_etime = ac_etime;
+ stats->ac_btime = xtime.tv_sec - ts.tv_sec;
+ if (thread_group_leader(tsk)) {
+ stats->ac_exitcode = tsk->exit_code;
+ if (tsk->flags & PF_FORKNOEXEC)
+ stats->ac_flag |= AFORK;
+ }
+ if (tsk->flags & PF_SUPERPRIV)
+ stats->ac_flag |= ASU;
+ if (tsk->flags & PF_DUMPCORE)
+ stats->ac_flag |= ACORE;
+ if (tsk->flags & PF_SIGNALED)
+ stats->ac_flag |= AXSIG;
+ stats->ac_nice = task_nice(tsk);
+ stats->ac_sched = tsk->policy;
+ stats->ac_uid = tsk->uid;
+ stats->ac_gid = tsk->gid;
+ stats->ac_pid = tsk->pid;
+ stats->ac_ppid = (tsk->parent) ? tsk->parent->pid : 0;
+ stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+ stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+ stats->ac_minflt = tsk->min_flt;
+ stats->ac_majflt = tsk->maj_flt;
+
+ strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
+
+#ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+ /* convert pages-jiffies to Mbyte-usec */
+ stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+ stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+ if (p->mm) {
+ /* adjust to KB unit */
+ stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB;
+ stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB;
+ }
+ stats->read_char = p->rchar;
+ stats->write_char = p->wchar;
+ stats->read_syscalls = p->syscr;
+ stats->write_syscalls = p->syscw;
+}
+#undef KB
+#undef MB
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ if (likely(tsk->mm)) {
+ long delta = cputime_to_jiffies(
+ cputime_sub(tsk->stime, tsk->acct_stimexpd));
+
+ if (delta == 0)
+ return;
+ tsk->acct_stimexpd = tsk->stime;
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ }
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+ tsk->acct_stimexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
+}
+#endif
diff --git a/kernel/unwind.c b/kernel/unwind.c
index 3430475fcd88..f7e50d16dbf6 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -11,13 +11,15 @@
#include <linux/unwind.h>
#include <linux/module.h>
-#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/sort.h>
#include <linux/stop_machine.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
extern char __start_unwind[], __end_unwind[];
+extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
#define MAX_STACK_DEPTH 8
@@ -100,9 +102,11 @@ static struct unwind_table {
} core, init;
const void *address;
unsigned long size;
+ const unsigned char *header;
+ unsigned long hdrsz;
struct unwind_table *link;
const char *name;
-} root_table, *last_table;
+} root_table;
struct unwind_item {
enum item_location {
@@ -145,6 +149,10 @@ static struct unwind_table *find_table(unsigned long pc)
return table;
}
+static unsigned long read_pointer(const u8 **pLoc,
+ const void *end,
+ signed ptrType);
+
static void init_unwind_table(struct unwind_table *table,
const char *name,
const void *core_start,
@@ -152,14 +160,30 @@ static void init_unwind_table(struct unwind_table *table,
const void *init_start,
unsigned long init_size,
const void *table_start,
- unsigned long table_size)
+ unsigned long table_size,
+ const u8 *header_start,
+ unsigned long header_size)
{
+ const u8 *ptr = header_start + 4;
+ const u8 *end = header_start + header_size;
+
table->core.pc = (unsigned long)core_start;
table->core.range = core_size;
table->init.pc = (unsigned long)init_start;
table->init.range = init_size;
table->address = table_start;
table->size = table_size;
+ /* See if the linker provided table looks valid. */
+ if (header_size <= 4
+ || header_start[0] != 1
+ || (void *)read_pointer(&ptr, end, header_start[1]) != table_start
+ || header_start[2] == DW_EH_PE_omit
+ || read_pointer(&ptr, end, header_start[2]) <= 0
+ || header_start[3] == DW_EH_PE_omit)
+ header_start = NULL;
+ table->hdrsz = header_size;
+ smp_wmb();
+ table->header = header_start;
table->link = NULL;
table->name = name;
}
@@ -169,11 +193,149 @@ void __init unwind_init(void)
init_unwind_table(&root_table, "kernel",
_text, _end - _text,
NULL, 0,
- __start_unwind, __end_unwind - __start_unwind);
+ __start_unwind, __end_unwind - __start_unwind,
+ __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);
+}
+
+static const u32 bad_cie, not_fde;
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *);
+static signed fde_pointer_type(const u32 *cie);
+
+struct eh_frame_hdr_table_entry {
+ unsigned long start, fde;
+};
+
+static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
+{
+ const struct eh_frame_hdr_table_entry *e1 = p1;
+ const struct eh_frame_hdr_table_entry *e2 = p2;
+
+ return (e1->start > e2->start) - (e1->start < e2->start);
+}
+
+static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
+{
+ struct eh_frame_hdr_table_entry *e1 = p1;
+ struct eh_frame_hdr_table_entry *e2 = p2;
+ unsigned long v;
+
+ v = e1->start;
+ e1->start = e2->start;
+ e2->start = v;
+ v = e1->fde;
+ e1->fde = e2->fde;
+ e2->fde = v;
+}
+
+static void __init setup_unwind_table(struct unwind_table *table,
+ void *(*alloc)(unsigned long))
+{
+ const u8 *ptr;
+ unsigned long tableSize = table->size, hdrSize;
+ unsigned n;
+ const u32 *fde;
+ struct {
+ u8 version;
+ u8 eh_frame_ptr_enc;
+ u8 fde_count_enc;
+ u8 table_enc;
+ unsigned long eh_frame_ptr;
+ unsigned int fde_count;
+ struct eh_frame_hdr_table_entry table[];
+ } __attribute__((__packed__)) *header;
+
+ if (table->header)
+ return;
+
+ if (table->hdrsz)
+ printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n",
+ table->name);
+
+ if (tableSize & (sizeof(*fde) - 1))
+ return;
+
+ for (fde = table->address, n = 0;
+ tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+ tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+ const u32 *cie = cie_for_fde(fde, table);
+ signed ptrType;
+
+ if (cie == &not_fde)
+ continue;
+ if (cie == NULL
+ || cie == &bad_cie
+ || (ptrType = fde_pointer_type(cie)) < 0)
+ return;
+ ptr = (const u8 *)(fde + 2);
+ if (!read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType))
+ return;
+ ++n;
+ }
+
+ if (tableSize || !n)
+ return;
+
+ hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
+ + 2 * n * sizeof(unsigned long);
+ header = alloc(hdrSize);
+ if (!header)
+ return;
+ header->version = 1;
+ header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native;
+ header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4;
+ header->table_enc = DW_EH_PE_abs|DW_EH_PE_native;
+ put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
+ BUILD_BUG_ON(offsetof(typeof(*header), fde_count)
+ % __alignof(typeof(header->fde_count)));
+ header->fde_count = n;
+
+ BUILD_BUG_ON(offsetof(typeof(*header), table)
+ % __alignof(typeof(*header->table)));
+ for (fde = table->address, tableSize = table->size, n = 0;
+ tableSize;
+ tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+ const u32 *cie = fde + 1 - fde[1] / sizeof(*fde);
+
+ if (!fde[1])
+ continue; /* this is a CIE */
+ ptr = (const u8 *)(fde + 2);
+ header->table[n].start = read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ fde_pointer_type(cie));
+ header->table[n].fde = (unsigned long)fde;
+ ++n;
+ }
+ WARN_ON(n != header->fde_count);
+
+ sort(header->table,
+ n,
+ sizeof(*header->table),
+ cmp_eh_frame_hdr_table_entries,
+ swap_eh_frame_hdr_table_entries);
+
+ table->hdrsz = hdrSize;
+ smp_wmb();
+ table->header = (const void *)header;
+}
+
+static void *__init balloc(unsigned long sz)
+{
+ return __alloc_bootmem_nopanic(sz,
+ sizeof(unsigned int),
+ __pa(MAX_DMA_ADDRESS));
+}
+
+void __init unwind_setup(void)
+{
+ setup_unwind_table(&root_table, balloc);
}
#ifdef CONFIG_MODULES
+static struct unwind_table *last_table;
+
/* Must be called with module_mutex held. */
void *unwind_add_table(struct module *module,
const void *table_start,
@@ -191,7 +353,8 @@ void *unwind_add_table(struct module *module,
init_unwind_table(table, module->name,
module->module_core, module->core_size,
module->module_init, module->init_size,
- table_start, table_size);
+ table_start, table_size,
+ NULL, 0);
if (last_table)
last_table->link = table;
@@ -301,6 +464,26 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
return value;
}
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
+{
+ const u32 *cie;
+
+ if (!*fde || (*fde & (sizeof(*fde) - 1)))
+ return &bad_cie;
+ if (!fde[1])
+ return &not_fde; /* this is a CIE */
+ if ((fde[1] & (sizeof(*fde) - 1))
+ || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
+ return NULL; /* this is not a valid FDE */
+ cie = fde + 1 - fde[1] / sizeof(*fde);
+ if (*cie <= sizeof(*cie) + 4
+ || *cie >= fde[1] - sizeof(*fde)
+ || (*cie & (sizeof(*cie) - 1))
+ || cie[1])
+ return NULL; /* this is not a (valid) CIE */
+ return cie;
+}
+
static unsigned long read_pointer(const u8 **pLoc,
const void *end,
signed ptrType)
@@ -608,49 +791,108 @@ int unwind(struct unwind_frame_info *frame)
unsigned i;
signed ptrType = -1;
uleb128_t retAddrReg = 0;
- struct unwind_table *table;
+ const struct unwind_table *table;
struct unwind_state state;
if (UNW_PC(frame) == 0)
return -EINVAL;
if ((table = find_table(pc)) != NULL
&& !(table->size & (sizeof(*fde) - 1))) {
- unsigned long tableSize = table->size;
-
- for (fde = table->address;
- tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
- tableSize -= sizeof(*fde) + *fde,
- fde += 1 + *fde / sizeof(*fde)) {
- if (!*fde || (*fde & (sizeof(*fde) - 1)))
- break;
- if (!fde[1])
- continue; /* this is a CIE */
- if ((fde[1] & (sizeof(*fde) - 1))
- || fde[1] > (unsigned long)(fde + 1)
- - (unsigned long)table->address)
- continue; /* this is not a valid FDE */
- cie = fde + 1 - fde[1] / sizeof(*fde);
- if (*cie <= sizeof(*cie) + 4
- || *cie >= fde[1] - sizeof(*fde)
- || (*cie & (sizeof(*cie) - 1))
- || cie[1]
- || (ptrType = fde_pointer_type(cie)) < 0) {
- cie = NULL; /* this is not a (valid) CIE */
- continue;
+ const u8 *hdr = table->header;
+ unsigned long tableSize;
+
+ smp_rmb();
+ if (hdr && hdr[0] == 1) {
+ switch(hdr[3] & DW_EH_PE_FORM) {
+ case DW_EH_PE_native: tableSize = sizeof(unsigned long); break;
+ case DW_EH_PE_data2: tableSize = 2; break;
+ case DW_EH_PE_data4: tableSize = 4; break;
+ case DW_EH_PE_data8: tableSize = 8; break;
+ default: tableSize = 0; break;
+ }
+ ptr = hdr + 4;
+ end = hdr + table->hdrsz;
+ if (tableSize
+ && read_pointer(&ptr, end, hdr[1])
+ == (unsigned long)table->address
+ && (i = read_pointer(&ptr, end, hdr[2])) > 0
+ && i == (end - ptr) / (2 * tableSize)
+ && !((end - ptr) % (2 * tableSize))) {
+ do {
+ const u8 *cur = ptr + (i / 2) * (2 * tableSize);
+
+ startLoc = read_pointer(&cur,
+ cur + tableSize,
+ hdr[3]);
+ if (pc < startLoc)
+ i /= 2;
+ else {
+ ptr = cur - tableSize;
+ i = (i + 1) / 2;
+ }
+ } while (startLoc && i > 1);
+ if (i == 1
+ && (startLoc = read_pointer(&ptr,
+ ptr + tableSize,
+ hdr[3])) != 0
+ && pc >= startLoc)
+ fde = (void *)read_pointer(&ptr,
+ ptr + tableSize,
+ hdr[3]);
}
+ }
+
+ if (fde != NULL) {
+ cie = cie_for_fde(fde, table);
ptr = (const u8 *)(fde + 2);
- startLoc = read_pointer(&ptr,
- (const u8 *)(fde + 1) + *fde,
- ptrType);
- endLoc = startLoc
- + read_pointer(&ptr,
- (const u8 *)(fde + 1) + *fde,
- ptrType & DW_EH_PE_indirect
- ? ptrType
- : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
- if (pc >= startLoc && pc < endLoc)
- break;
- cie = NULL;
+ if(cie != NULL
+ && cie != &bad_cie
+ && cie != &not_fde
+ && (ptrType = fde_pointer_type(cie)) >= 0
+ && read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType) == startLoc) {
+ if (!(ptrType & DW_EH_PE_indirect))
+ ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+ endLoc = startLoc
+ + read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType);
+ if(pc >= endLoc)
+ fde = NULL;
+ } else
+ fde = NULL;
+ }
+ if (fde == NULL) {
+ for (fde = table->address, tableSize = table->size;
+ cie = NULL, tableSize > sizeof(*fde)
+ && tableSize - sizeof(*fde) >= *fde;
+ tableSize -= sizeof(*fde) + *fde,
+ fde += 1 + *fde / sizeof(*fde)) {
+ cie = cie_for_fde(fde, table);
+ if (cie == &bad_cie) {
+ cie = NULL;
+ break;
+ }
+ if (cie == NULL
+ || cie == &not_fde
+ || (ptrType = fde_pointer_type(cie)) < 0)
+ continue;
+ ptr = (const u8 *)(fde + 2);
+ startLoc = read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType);
+ if (!startLoc)
+ continue;
+ if (!(ptrType & DW_EH_PE_indirect))
+ ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+ endLoc = startLoc
+ + read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType);
+ if (pc >= startLoc && pc < endLoc)
+ break;
+ }
}
}
if (cie != NULL) {
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 000000000000..c859164a6993
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+{
+ struct uts_namespace *ns;
+
+ ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (ns) {
+ memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ kref_init(&ns->kref);
+ }
+ return ns;
+}
+
+/*
+ * unshare the current process' utsname namespace.
+ * called only in sys_unshare()
+ */
+int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
+{
+ if (unshare_flags & CLONE_NEWUTS) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
+ if (!*new_uts)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy task tsk's utsname namespace, or clone it if flags
+ * specifies CLONE_NEWUTS. In latter case, changes to the
+ * utsname of this process won't be seen by parent, and vice
+ * versa.
+ */
+int copy_utsname(int flags, struct task_struct *tsk)
+{
+ struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+ struct uts_namespace *new_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_uts_ns(old_ns);
+
+ if (!(flags & CLONE_NEWUTS))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ new_ns = clone_uts_ns(old_ns);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+ tsk->nsproxy->uts_ns = new_ns;
+
+out:
+ put_uts_ns(old_ns);
+ return err;
+}
+
+void free_uts_ns(struct kref *kref)
+{
+ struct uts_namespace *ns;
+
+ ns = container_of(kref, struct uts_namespace, kref);
+ kfree(ns);
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 835fe28b87a8..3df9bfc7ff78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -28,13 +28,14 @@
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
+#include <linux/mempolicy.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
*
* The sequence counters are for flush_scheduled_work(). It wants to wait
- * until until all currently-scheduled works are completed, but it doesn't
+ * until all currently-scheduled works are completed, but it doesn't
* want to be livelocked by new, incoming ones. So it waits until
* remove_sequence is >= the insert_sequence which pertained when
* flush_scheduled_work() was called.
@@ -245,6 +246,12 @@ static int worker_thread(void *__cwq)
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
+ /*
+ * We inherited MPOL_INTERLEAVE from the booting kernel.
+ * Set MPOL_DEFAULT to insure node local allocations.
+ */
+ numa_default_policy();
+
/* SIG_IGN makes children autoreap: see do_notify_parent(). */
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;