aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/audit_tree.c16
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c156
-rw-r--r--kernel/bpf/hashtab.c367
-rw-r--r--kernel/bpf/helpers.c89
-rw-r--r--kernel/bpf/syscall.c6
-rw-r--r--kernel/bpf/test_stub.c56
-rw-r--r--kernel/bpf/verifier.c171
-rw-r--r--kernel/cgroup.c175
-rw-r--r--kernel/cpuset.c162
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/exit.c266
-rw-r--r--kernel/extable.c7
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/gcov/Kconfig5
-rw-r--r--kernel/groups.c11
-rw-r--r--kernel/irq_work.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c43
-rw-r--r--kernel/kprobes.c20
-rw-r--r--kernel/module.c170
-rw-r--r--kernel/nsproxy.c10
-rw-r--r--kernel/panic.c13
-rw-r--r--kernel/params.c97
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/pid_namespace.c57
-rw-r--r--kernel/power/Kconfig7
-rw-r--r--kernel/power/hibernate.c14
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/swap.c43
-rw-r--r--kernel/printk/printk.c96
-rw-r--r--kernel/ptrace.c23
-rw-r--r--kernel/res_counter.c211
-rw-r--r--kernel/sched/core.c4
-rw-r--r--kernel/stacktrace.c32
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c16
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c1
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c151
-rw-r--r--kernel/trace/ftrace.c330
-rw-r--r--kernel/trace/ring_buffer.c75
-rw-r--r--kernel/trace/trace.c255
-rw-r--r--kernel/trace/trace.h31
-rw-r--r--kernel/trace/trace_branch.c47
-rw-r--r--kernel/trace/trace_events.c57
-rw-r--r--kernel/trace/trace_events_filter.c29
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_functions.c119
-rw-r--r--kernel/trace/trace_functions_graph.c423
-rw-r--r--kernel/trace/trace_kdb.c21
-rw-r--r--kernel/trace/trace_kprobe.c46
-rw-r--r--kernel/trace/trace_mmiotrace.c52
-rw-r--r--kernel/trace/trace_output.c446
-rw-r--r--kernel/trace/trace_output.h16
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.c10
-rw-r--r--kernel/trace/trace_sched_switch.c144
-rw-r--r--kernel/trace/trace_sched_wakeup.c56
-rw-r--r--kernel/trace/trace_seq.c253
-rw-r--r--kernel/trace/trace_syscalls.c54
-rw-r--r--kernel/trace/trace_uprobe.c28
-rw-r--r--kernel/uid16.c2
-rw-r--r--kernel/user.c6
-rw-r--r--kernel/user_namespace.c153
-rw-r--r--kernel/utsname.c31
-rw-r--r--kernel/workqueue.c30
76 files changed, 2973 insertions, 2313 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 1f37f15117e5..f8f203e8018c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_skb_queue);
- s.version = AUDIT_VERSION_LATEST;
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
s.backlog_wait_time = audit_backlog_wait_time;
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
- if (!entry->i.inode)
+ if (!entry->inode)
return;
- list = chunk_hash(entry->i.inode);
+ list = chunk_hash(entry->inode);
list_add_rcu(&chunk->hash, list);
}
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
/* mark.inode may have gone NULL, but who cares? */
- if (p->mark.i.inode == inode) {
+ if (p->mark.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
new = alloc_chunk(size);
spin_lock(&entry->lock);
- if (chunk->dead || !entry->i.inode) {
+ if (chunk->dead || !entry->inode) {
spin_unlock(&entry->lock);
if (new)
free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
goto Fallback;
fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk_entry = &chunk->mark;
spin_lock(&old_entry->lock);
- if (!old_entry->i.inode) {
+ if (!old_entry->inode) {
/* old_entry is being shot, lets just lie */
spin_unlock(&old_entry->lock);
fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
}
fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.i.inode;
+ struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..c75522a83678 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1897,6 +1897,11 @@ out:
audit_copy_inode(n, dentry, inode);
}
+void __audit_file(const struct file *file)
+{
+ __audit_inode(NULL, file->f_path.dentry, 0);
+}
+
/**
* __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
@@ -2373,7 +2378,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_dentry);
+ dentry = dget(bprm->file->f_path.dentry);
get_vfs_caps_from_disk(dentry, &vcaps);
dput(dentry);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
ifdef CONFIG_TEST_BPF
obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size, array_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_array_map);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-zero, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* itereated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&tl);
+ return 0;
+}
+late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
- err = -ESRCH;
+ err = -ENOENT;
rcu_read_lock();
value = map->ops->map_lookup_elem(map, key);
if (!value)
@@ -190,7 +190,7 @@ err_put:
return err;
}
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
static int map_update_elem(union bpf_attr *attr)
{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
* therefore all map accessors rely on this fact, so do the same here
*/
rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value);
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
rcu_read_unlock();
free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
u64 arg2;
};
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
- [BPF_FUNC_unspec] = {
- .func = test_func,
- .gpl_only = true,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- },
-};
-
static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
{
- if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ default:
return NULL;
- return &test_funcs[func_id];
+ }
}
static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
.type = BPF_PROG_TYPE_UNSPEC,
};
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map *map;
-
- map = kzalloc(sizeof(*map), GFP_USER);
- if (!map)
- return ERR_PTR(-ENOMEM);
-
- map->key_size = attr->key_size;
- map->value_size = attr->value_size;
- map->max_entries = attr->max_entries;
- return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
- kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
- .map_alloc = test_map_alloc,
- .map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
- .ops = &test_map_ops,
- .type = BPF_MAP_TYPE_UNSPEC,
-};
-
static int __init register_test_ops(void)
{
- bpf_register_map_type(&tl_map);
bpf_register_prog_type(&tl_prog);
return 0;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
enum bpf_stack_slot_type {
STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* 1st byte of register spilled into stack */
- STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_SPILL, /* register spilled into stack */
STACK_MISC /* BPF program wrote some data into this slot */
};
-struct bpf_stack_slot {
- enum bpf_stack_slot_type stype;
- struct reg_state reg_st;
-};
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
/* state of the program:
* type of all registers and stack info
*/
struct verifier_state {
struct reg_state regs[MAX_BPF_REG];
- struct bpf_stack_slot stack[MAX_BPF_STACK];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
};
/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
env->cur_state.regs[i].map_ptr->key_size,
env->cur_state.regs[i].map_ptr->value_size);
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (env->cur_state.stack[i].stype == STACK_SPILL)
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[env->cur_state.stack[i].reg_st.type]);
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
}
verbose("\n");
}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
static int check_stack_write(struct verifier_state *state, int off, int size,
int value_regno)
{
- struct bpf_stack_slot *slot;
int i;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so it's aligned access and [off, off + size) are within stack limits
+ */
if (value_regno >= 0 &&
(state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
state->regs[value_regno].type == PTR_TO_CTX)) {
/* register containing pointer is being spilled into stack */
- if (size != 8) {
+ if (size != BPF_REG_SIZE) {
verbose("invalid size of register spill\n");
return -EACCES;
}
- slot = &state->stack[MAX_BPF_STACK + off];
- slot->stype = STACK_SPILL;
/* save register state */
- slot->reg_st = state->regs[value_regno];
- for (i = 1; i < 8; i++) {
- slot = &state->stack[MAX_BPF_STACK + off + i];
- slot->stype = STACK_SPILL_PART;
- slot->reg_st.type = UNKNOWN_VALUE;
- slot->reg_st.map_ptr = NULL;
- }
- } else {
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ state->regs[value_regno];
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ } else {
/* regular write of data into stack */
- for (i = 0; i < size; i++) {
- slot = &state->stack[MAX_BPF_STACK + off + i];
- slot->stype = STACK_MISC;
- slot->reg_st.type = UNKNOWN_VALUE;
- slot->reg_st.map_ptr = NULL;
- }
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
}
return 0;
}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
static int check_stack_read(struct verifier_state *state, int off, int size,
int value_regno)
{
+ u8 *slot_type;
int i;
- struct bpf_stack_slot *slot;
- slot = &state->stack[MAX_BPF_STACK + off];
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
- if (slot->stype == STACK_SPILL) {
- if (size != 8) {
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
verbose("invalid size of register spill\n");
return -EACCES;
}
- for (i = 1; i < 8; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype !=
- STACK_SPILL_PART) {
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
verbose("corrupted spill memory\n");
return -EACCES;
}
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
if (value_regno >= 0)
/* restore register state from stack */
- state->regs[value_regno] = slot->reg_st;
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
return 0;
} else {
for (i = 0; i < size; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype !=
- STACK_MISC) {
+ if (slot_type[i] != STACK_MISC) {
verbose("invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
}
for (i = 0; i < access_size; i++) {
- if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
verbose("invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
}
for (i = 0; i < MAX_BPF_STACK; i++) {
- if (memcmp(&old->stack[i], &cur->stack[i],
- sizeof(old->stack[0])) != 0) {
- if (old->stack[i].stype == STACK_INVALID)
- continue;
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
return false;
- }
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
}
return true;
}
@@ -1664,8 +1741,10 @@ process_bpf_exit:
u8 mode = BPF_MODE(insn->code);
if (mode == BPF_ABS || mode == BPF_IND) {
- verbose("LD_ABS is not supported yet\n");
- return -EINVAL;
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
} else if (mode == BPF_IMM) {
err = check_ld_imm(env, insn);
if (err)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!(cgrp->root->subsys_mask & (1 << ss->id)))
return NULL;
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Use ->child_subsys_mask.
+ */
while (cgroup_parent(cgrp) &&
!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
return cgroup_css(cgrp, ss);
}
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
* @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
- * This function determines which subsystems need to be enabled given the
- * current @cgrp->subtree_control and records it in
- * @cgrp->child_subsys_mask. The resulting mask is always a superset of
- * @cgrp->subtree_control and follows the usual hierarchy rules.
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
*/
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned int subtree_control)
{
struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = cgrp->subtree_control;
+ unsigned int cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->child_subsys_mask = cur_ss_mask;
- return;
- }
+ if (!cgroup_on_dfl(cgrp))
+ return cur_ss_mask;
while (true) {
unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
cur_ss_mask = new_ss_mask;
}
- cgrp->child_subsys_mask = cur_ss_mask;
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ cgrp->child_subsys_mask =
+ cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
}
/**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+ unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
ret = -ENOENT;
goto out_unlock;
}
-
- /*
- * @ss is already enabled through dependency and
- * we'll just make it visible. Skip draining.
- */
- if (cgrp->child_subsys_mask & (1 << ssid))
- continue;
-
- /*
- * Because css offlining is asynchronous, userland
- * might try to re-enable the same controller while
- * the previous instance is still around. In such
- * cases, wait till it's gone using offline_waitq.
- */
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
} else if (disable & (1 << ssid)) {
if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* subsystems than specified may need to be enabled or disabled
* depending on subsystem dependencies.
*/
- cgrp->subtree_control |= enable;
- cgrp->subtree_control &= ~disable;
+ old_sc = cgrp->subtree_control;
+ old_ss = cgrp->child_subsys_mask;
+ new_sc = (old_sc | enable) & ~disable;
+ new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
- old_ctrl = cgrp->child_subsys_mask;
- cgroup_refresh_child_subsys_mask(cgrp);
- new_ctrl = cgrp->child_subsys_mask;
-
- css_enable = ~old_ctrl & new_ctrl;
- css_disable = old_ctrl & ~new_ctrl;
+ css_enable = ~old_ss & new_ss;
+ css_disable = old_ss & ~new_ss;
enable |= css_enable;
disable |= css_disable;
/*
+ * Because css offlining is asynchronous, userland might try to
+ * re-enable the same controller while the previous instance is
+ * still around. In such cases, wait till it's gone using
+ * offline_waitq.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(css_enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+ }
+
+ cgrp->subtree_control = new_sc;
+ cgrp->child_subsys_mask = new_ss;
+
+ /*
* Create new csses or make the existing ones visible. A css is
* created invisible if it's being implicitly enabled through
* dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
}
+ /*
+ * The effective csses of all the descendants (excluding @cgrp) may
+ * have changed. Subsystems can optionally subscribe to this event
+ * by implementing ->css_e_css_changed() which is invoked if any of
+ * the effective csses seen from the css's cgroup may have changed.
+ */
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+ struct cgroup_subsys_state *css;
+
+ if (!ss->css_e_css_changed || !this_css)
+ continue;
+
+ css_for_each_descendant_pre(css, this_css)
+ if (css != this_css)
+ ss->css_e_css_changed(css);
+ }
+
kernfs_activate(cgrp->kn);
ret = 0;
out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->subtree_control &= ~enable;
- cgrp->subtree_control |= disable;
- cgroup_refresh_child_subsys_mask(cgrp);
+ cgrp->subtree_control = old_sc;
+ cgrp->child_subsys_mask = old_ss;
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
cgroup_idr_remove(&ss->css_idr, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
} else {
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
*
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
*/
static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
/*
* CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
continue;
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
count = seq_get_buf(sf, &buf);
s = buf;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
seq_commit(sf, -1);
}
out_unlock:
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
return ret;
}
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(cs->css.cgroup)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
}
rcu_read_unlock();
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (cgroup_on_dfl(root_css->cgroup)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
{
bool is_empty;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- mutex_lock(&callback_mutex);
+ spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
- mutex_unlock(&callback_mutex);
+ spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- mutex_lock(&callback_mutex);
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
+ unsigned long flags;
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
* (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
*/
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
/*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
return 1;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- mutex_lock(&callback_mutex);
+ spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
- mutex_unlock(&callback_mutex);
+ spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- return 0;
-}
-
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e19d3ebc29c..4c1ee7f2bebc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_dentry,
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
@@ -7477,11 +7477,11 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
synchronize_rcu();
- perf_install_in_context(ctx, group_leader, event->cpu);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, event->cpu);
+ perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cde34c5..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
int more = 0;
again:
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
if (!prev && !more) {
/*
- * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
info->mm = vma->vm_mm;
info->vaddr = offset_to_vaddr(vma, offset);
}
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_read(mapping);
if (!more)
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..1ea4369890a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
}
/*
- * Accumulate here the counters for all threads but the group leader
- * as they die, so they can be added into the process-wide totals
- * when those are taken. The group leader stays around as a zombie as
- * long as there are other threads. When it gets reaped, the exit.c
- * code will add its counts into these totals. We won't ever get here
- * for the group leader, since it will have been the last reference on
- * the signal_struct.
+ * Accumulate here the counters for all threads as they die. We could
+ * skip the group leader because it is the last user of signal_struct,
+ * but we want to avoid the race with thread_group_cputime() which can
+ * see the empty ->thread_head list.
*/
task_cputime(tsk, &utime, &stime);
write_seqlock(&sig->stats_lock);
@@ -215,27 +212,6 @@ repeat:
}
/*
- * This checks not only the pgrp, but falls back on the pid if no
- * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
- * without this...
- *
- * The caller must hold rcu lock or the tasklist lock.
- */
-struct pid *session_of_pgrp(struct pid *pgrp)
-{
- struct task_struct *p;
- struct pid *sid = NULL;
-
- p = pid_task(pgrp, PIDTYPE_PGID);
- if (p == NULL)
- p = pid_task(pgrp, PIDTYPE_PID);
- if (p != NULL)
- sid = task_session(p);
-
- return sid;
-}
-
-/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
clear_thread_flag(TIF_MEMDIE);
}
+static struct task_struct *find_alive_thread(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ if (!(t->flags & PF_EXITING))
+ return t;
+ }
+ return NULL;
+}
+
+static struct task_struct *find_child_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *reaper = pid_ns->child_reaper;
+
+ if (likely(reaper != father))
+ return reaper;
+
+ reaper = find_alive_thread(father);
+ if (reaper) {
+ pid_ns->child_reaper = reaper;
+ return reaper;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?: father->exit_code);
+ }
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+
+ return father;
+}
+
/*
* When we die, we re-parent all our children, and try to:
* 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+static struct task_struct *find_new_reaper(struct task_struct *father,
+ struct task_struct *child_reaper)
{
- struct pid_namespace *pid_ns = task_active_pid_ns(father);
- struct task_struct *thread;
+ struct task_struct *thread, *reaper;
- thread = father;
- while_each_thread(father, thread) {
- if (thread->flags & PF_EXITING)
- continue;
- if (unlikely(pid_ns->child_reaper == father))
- pid_ns->child_reaper = thread;
+ thread = find_alive_thread(father);
+ if (thread)
return thread;
- }
-
- if (unlikely(pid_ns->child_reaper == father)) {
- write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?:
- father->exit_code);
- }
-
- zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
- } else if (father->signal->has_child_subreaper) {
- struct task_struct *reaper;
+ if (father->signal->has_child_subreaper) {
/*
- * Find the first ancestor marked as child_subreaper.
- * Note that the code below checks same_thread_group(reaper,
- * pid_ns->child_reaper). This is what we need to DTRT in a
- * PID namespace. However we still need the check above, see
- * http://marc.info/?l=linux-kernel&m=131385460420380
+ * Find the first ->is_child_subreaper ancestor in our pid_ns.
+ * We start from father to ensure we can not look into another
+ * namespace, this is safe because all its threads are dead.
*/
- for (reaper = father->real_parent;
- reaper != &init_task;
+ for (reaper = father;
+ !same_thread_group(reaper, child_reaper);
reaper = reaper->real_parent) {
- if (same_thread_group(reaper, pid_ns->child_reaper))
+ /* call_usermodehelper() descendants need this check */
+ if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
- thread = reaper;
- do {
- if (!(thread->flags & PF_EXITING))
- return reaper;
- } while_each_thread(reaper, thread);
+ thread = find_alive_thread(reaper);
+ if (thread)
+ return thread;
}
}
- return pid_ns->child_reaper;
+ return child_reaper;
}
/*
@@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- if (p->exit_state == EXIT_DEAD)
- return;
- /*
- * If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
+ if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
@@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
- list_move_tail(&p->sibling, dead);
+ list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
-static void forget_original_parent(struct task_struct *father)
+/*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void forget_original_parent(struct task_struct *father,
+ struct list_head *dead)
{
- struct task_struct *p, *n, *reaper;
- LIST_HEAD(dead_children);
+ struct task_struct *p, *t, *reaper;
- write_lock_irq(&tasklist_lock);
- /*
- * Note that exit_ptrace() and find_new_reaper() might
- * drop tasklist_lock and reacquire it.
- */
- exit_ptrace(father);
- reaper = find_new_reaper(father);
+ if (unlikely(!list_empty(&father->ptraced)))
+ exit_ptrace(father, dead);
- list_for_each_entry_safe(p, n, &father->children, sibling) {
- struct task_struct *t = p;
+ /* Can drop and reacquire tasklist_lock */
+ reaper = find_child_reaper(father);
+ if (list_empty(&father->children))
+ return;
- do {
+ reaper = find_new_reaper(father, reaper);
+ list_for_each_entry(p, &father->children, sibling) {
+ for_each_thread(p, t) {
t->real_parent = reaper;
- if (t->parent == father) {
- BUG_ON(t->ptrace);
+ BUG_ON((!t->ptrace) != (t->parent == father));
+ if (likely(!t->ptrace))
t->parent = t->real_parent;
- }
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t);
- } while_each_thread(p, t);
- reparent_leader(father, p, &dead_children);
- }
- write_unlock_irq(&tasklist_lock);
-
- BUG_ON(!list_empty(&father->children));
-
- list_for_each_entry_safe(p, n, &dead_children, sibling) {
- list_del_init(&p->sibling);
- release_task(p);
+ }
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (!same_thread_group(reaper, father))
+ reparent_leader(father, p, dead);
}
+ list_splice_tail_init(&father->children, &reaper->children);
}
/*
@@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
-
- /*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- */
- forget_original_parent(tsk);
+ struct task_struct *p, *n;
+ LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
+
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+ if (tsk->exit_state == EXIT_DEAD)
+ list_add(&tsk->ptrace_entry, &dead);
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- /* If the process is dead, release it - nobody will wait for it */
- if (autoreap)
- release_task(tsk);
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
}
#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- unsigned long state;
- int retval, status, traced;
+ int state, retval, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
@@ -1008,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
}
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
-
- traced = ptrace_reparented(p);
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ state = (ptrace_reparented(p) && thread_group_leader(p)) ?
+ EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
- * It can be ptraced but not reparented, check
- * thread_group_leader() to filter out sub-threads.
+ * We own this thread, nobody else can reap it.
*/
- if (likely(!traced) && thread_group_leader(p)) {
- struct signal_struct *psig;
- struct signal_struct *sig;
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ /*
+ * Check thread_group_leader() to exclude the traced sub-threads.
+ */
+ if (state == EXIT_DEAD && thread_group_leader(p)) {
+ struct signal_struct *sig = p->signal;
+ struct signal_struct *psig = current->signal;
unsigned long maxrss;
cputime_t tgutime, tgstime;
@@ -1034,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
- * p->signal fields, because they are only touched by
- * __exit_signal, which runs with tasklist_lock
- * write-locked anyway, and so is excluded here. We do
- * need to protect the access to parent->signal fields,
- * as other threads in the parent group can be right
- * here reaping other children at the same time.
+ * p->signal fields because the whole thread group is dead
+ * and nobody can change them.
+ *
+ * psig->stats_lock also protects us from our sub-theads
+ * which can reap other children at the same time. Until
+ * we change k_getrusage()-like users to rely on this lock
+ * we have to take ->siglock as well.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&p->real_parent->sighand->siglock);
- psig = p->real_parent->signal;
- sig = p->signal;
+ spin_lock_irq(&current->sighand->siglock);
write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
@@ -1073,16 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&p->real_parent->sighand->siglock);
+ spin_unlock_irq(&current->sighand->siglock);
}
- /*
- * Now we are sure this task is interesting, and no other
- * thread can reap it because we its state == DEAD/TRACE.
- */
- read_unlock(&tasklist_lock);
- sched_annotate_sleep();
-
retval = wo->wo_rusage
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
+#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
+ if (is_ftrace_trampoline(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return is_module_text_address(addr);
+ if (is_module_text_address(addr))
+ return 1;
+ return is_ftrace_trampoline(addr);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ca84189cfc2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- mutex_lock(&mapping->i_mmap_mutex);
+ i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
vma_interval_tree_insert_after(tmp, mpnt,
&mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
- mutex_unlock(&mapping->i_mmap_mutex);
+ i_mmap_unlock_write(mapping);
}
/*
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b7408759bdf..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
Note that the debugfs filesystem has to be mounted to access
profiling data.
+config ARCH_HAS_GCOV_PROFILE_ALL
+ def_bool n
+
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
+ depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
return i;
}
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
+}
+
/*
* SMP: Our groups are copy-on-write. We can set them safely
* without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048483fa..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
void irq_work_tick(void)
{
- struct llist_head *raised = &__get_cpu_var(raised_list);
+ struct llist_head *raised = this_cpu_ptr(&raised_list);
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(&__get_cpu_var(lazy_list));
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
}
/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6e9a61..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!kexec_on_panic) {
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
- pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ pr_err("Could not allocate swap buffer\n");
goto out_free_control_pages;
}
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
-/*
- * kmod_thread_locker is used for deadlock avoidance. There is no explicit
- * locking to protect this global - it is private to the singleton khelper
- * thread and should only ever be modified by that thread.
- */
-static const struct task_struct *kmod_thread_locker;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
- int wait = sub_info->wait & ~UMH_KILLABLE;
struct cred *new;
int retval;
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
out:
sub_info->retval = retval;
/* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
- if (wait != UMH_WAIT_PROC)
+ if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
return 0;
do_exit(0);
}
-static int call_helper(void *data)
-{
- /* Worker thread started blocking khelper thread. */
- kmod_thread_locker = current;
- return ____call_usermodehelper(data);
-}
-
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
- /* CLONE_VFORK: wait until the usermode helper has execve'd
- * successfully We need the data structures to stay around
- * until that is done. */
- if (wait == UMH_WAIT_PROC)
+ if (sub_info->wait & UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else {
- pid = kernel_thread(call_helper, sub_info,
- CLONE_VFORK | SIGCHLD);
- /* Worker thread stopped blocking khelper thread. */
- kmod_thread_locker = NULL;
- }
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
goto out;
}
/*
- * Worker thread must not wait for khelper thread at below
- * wait_for_completion() if the thread was created with CLONE_VFORK
- * flag, for khelper thread is already waiting for the thread at
- * wait_for_completion() in do_fork().
- */
- if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
* the data structure in case of UMH_NO_WAIT.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..06f58309fed2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
- .flags = FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+int __weak arch_check_ftrace_location(struct kprobe *p)
{
- int ret = 0;
unsigned long ftrace_addr;
- /*
- * If the address is located on a ftrace nop, set the
- * breakpoint to the following instruction.
- */
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
return -EINVAL;
#endif
}
+ return 0;
+}
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
+{
+ int ret;
+
+ ret = arch_check_ftrace_location(p);
+ if (ret)
+ return ret;
jump_label_lock();
preempt_disable();
diff --git a/kernel/module.c b/kernel/module.c
index e52a8739361a..3965511ae133 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -98,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-int register_module_notifier(struct notifier_block * nb)
+int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);
-int unregister_module_notifier(struct notifier_block * nb)
+int unregister_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_inc(&mod->refcnt);
return 0;
}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}
/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
+ mod->state = MODULE_STATE_GOING;
- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}
unsigned long module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
@@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
seq_printf(m, " %lu ", module_refcount(mod));
- /* Always include a trailing , so userspace can differentiate
- between this and the old multi-field proc format. */
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
list_for_each_entry(use, &mod->source_list, source_list) {
printed_something = 1;
seq_printf(m, "%s,", use->source->name);
@@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
if (mod->init != NULL && mod->exit == NULL) {
printed_something = 1;
- seq_printf(m, "[permanent],");
+ seq_puts(m, "[permanent],");
}
if (!printed_something)
- seq_printf(m, "-");
+ seq_puts(m, "-");
}
void __symbol_put(const char *symbol)
@@ -935,7 +918,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -948,11 +931,11 @@ bool try_module_get(struct module *module)
if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;
preempt_enable();
@@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get);
void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
@@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put);
static inline void print_unload_info(struct seq_file *m, struct module *mod)
{
/* We don't know the usage count, or what modules are using. */
- seq_printf(m, " - -");
+ seq_puts(m, " - -");
}
static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc,
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs,
return 0;
bad_version:
- printk("%s: disagrees about version of symbol %s\n",
+ pr_warn("%s: disagrees about version of symbol %s\n",
mod->name, symname);
return 0;
}
@@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
static inline int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *symname,
- struct module *mod,
+ struct module *mod,
const unsigned long *crc,
const struct module *crc_owner)
{
@@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
}
-struct module_sect_attr
-{
+struct module_sect_attr {
struct module_attribute mattr;
char *name;
unsigned long address;
};
-struct module_sect_attrs
-{
+struct module_sect_attrs {
struct attribute_group grp;
unsigned int nsections;
struct module_sect_attr attrs[0];
@@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod)
(attr->test && attr->test(mod))) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
++temp_attr;
}
}
@@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
- sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
if (attr->free)
attr->free(mod);
}
@@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1860,7 +1831,12 @@ static void free_module(struct module *mod)
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
@@ -1955,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
- printk("%s: please compile with -fno-common\n",
+ pr_warn("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
break;
@@ -2259,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum)
{
const Elf_Shdr *sec;
@@ -2735,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
* This shouldn't happen with same compiler and binutils
* building all parts of the module.
*/
- printk(KERN_WARNING "%s: has both .ctors and .init_array.\n",
+ pr_warn("%s: has both .ctors and .init_array.\n",
mod->name);
return -EINVAL;
}
@@ -3023,8 +2999,10 @@ static int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
+ /*
+ * Init routine failed: abort. Try to protect us from
+ * buggy refcounters.
+ */
mod->state = MODULE_STATE_GOING;
synchronize_sched();
module_put(mod);
@@ -3202,7 +3180,7 @@ out:
static int unknown_module_param_cb(char *param, char *val, const char *modname)
{
- /* Check for magic 'dyndbg' arg */
+ /* Check for magic 'dyndbg' arg */
int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3352,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
module_deallocate(mod, info);
@@ -3685,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p)
/* Informative for users. */
seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading":
- mod->state == MODULE_STATE_COMING ? "Loading":
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
seq_printf(m, " 0x%pK", mod->module_core);
@@ -3695,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p)
if (mod->taints)
seq_printf(m, " %s", module_flags(mod, buf));
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
return 0;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
- const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_ns *ei;
struct file *file;
+ struct ns_common *ns;
int err;
file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = get_proc_ns(file_inode(file));
- ops = ei->ns_ops;
- if (nstype && (ops->type != nstype))
+ ns = get_proc_ns(file_inode(file));
+ if (nstype && (ns->ops->type != nstype))
goto out;
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
- err = ops->install(new_nsproxy, ei->ns);
+ err = ns->ops->install(new_nsproxy, ns);
if (err) {
free_nsproxy(new_nsproxy);
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
static bool crash_kexec_post_notifiers;
+int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
if (args)
vprintk(args->fmt, args->args);
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
print_modules();
dump_stack();
print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+core_param(panic_on_warn, panic_on_warn, int, 0644);
static int __init setup_crash_kexec_post_notifiers(char *s)
{
diff --git a/kernel/params.c b/kernel/params.c
index db97b791390f..0af9b2c4e56c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -603,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
const struct kernel_param *kp,
const char *name)
{
- struct module_param_attrs *new;
- struct attribute **attrs;
- int err, num;
+ struct module_param_attrs *new_mp;
+ struct attribute **new_attrs;
+ unsigned int i;
/* We don't bother calling this with invisible parameters. */
BUG_ON(!kp->perm);
if (!mk->mp) {
- num = 0;
- attrs = NULL;
- } else {
- num = mk->mp->num;
- attrs = mk->mp->grp.attrs;
+ /* First allocation. */
+ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
+ if (!mk->mp)
+ return -ENOMEM;
+ mk->mp->grp.name = "parameters";
+ /* NULL-terminated attribute array. */
+ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
+ GFP_KERNEL);
+ /* Caller will cleanup via free_module_param_attrs */
+ if (!mk->mp->grp.attrs)
+ return -ENOMEM;
}
- /* Enlarge. */
- new = krealloc(mk->mp,
- sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
- GFP_KERNEL);
- if (!new) {
- kfree(attrs);
- err = -ENOMEM;
- goto fail;
- }
- /* Despite looking like the typical realloc() bug, this is safe.
- * We *want* the old 'attrs' to be freed either way, and we'll store
- * the new one in the success case. */
- attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
- if (!attrs) {
- err = -ENOMEM;
- goto fail_free_new;
- }
+ /* Enlarge allocations. */
+ new_mp = krealloc(mk->mp,
+ sizeof(*mk->mp) +
+ sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ GFP_KERNEL);
+ if (!new_mp)
+ return -ENOMEM;
+ mk->mp = new_mp;
- /* Sysfs wants everything zeroed. */
- memset(new, 0, sizeof(*new));
- memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
- memset(&attrs[num], 0, sizeof(attrs[num]));
- new->grp.name = "parameters";
- new->grp.attrs = attrs;
+ /* Extra pointer for NULL terminator */
+ new_attrs = krealloc(mk->mp->grp.attrs,
+ sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
+ GFP_KERNEL);
+ if (!new_attrs)
+ return -ENOMEM;
+ mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- sysfs_attr_init(&new->attrs[num].mattr.attr);
- new->attrs[num].param = kp;
- new->attrs[num].mattr.show = param_attr_show;
- new->attrs[num].mattr.store = param_attr_store;
- new->attrs[num].mattr.attr.name = (char *)name;
- new->attrs[num].mattr.attr.mode = kp->perm;
- new->num = num+1;
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
+ mk->mp->attrs[mk->mp->num].param = kp;
+ mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ /* Do not allow runtime DAC changes to make param writable. */
+ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
+ mk->mp->num++;
/* Fix up all the pointers, since krealloc can move us */
- for (num = 0; num < new->num; num++)
- new->grp.attrs[num] = &new->attrs[num].mattr.attr;
- new->grp.attrs[num] = NULL;
-
- mk->mp = new;
+ for (i = 0; i < mk->mp->num; i++)
+ mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
+ mk->mp->grp.attrs[mk->mp->num] = NULL;
return 0;
-
-fail_free_new:
- kfree(new);
-fail:
- mk->mp = NULL;
- return err;
}
#ifdef CONFIG_MODULES
static void free_module_param_attrs(struct module_kobject *mk)
{
- kfree(mk->mp->grp.attrs);
+ if (mk->mp)
+ kfree(mk->mp->grp.attrs);
kfree(mk->mp);
mk->mp = NULL;
}
@@ -695,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod,
if (kparam[i].perm == 0)
continue;
err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
- if (err)
+ if (err) {
+ free_module_param_attrs(&mod->mkobj);
return err;
+ }
params = true;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .proc_inum = PROC_PID_INIT_INO,
+ .ns.inum = PROC_PID_INIT_INO,
+#ifdef CONFIG_PID_NS
+ .ns.ops = &pidns_operations,
+#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -341,6 +344,8 @@ out:
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ put_pid_ns(ns);
+
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_map;
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err)
goto out_free_map;
+ ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
- /* Ignore SIGCHLD causing any terminated children to autoreap */
+ /*
+ * Ignore SIGCHLD causing any terminated children to autoreap.
+ * This speeds up the namespace shutdown, plus see the comment
+ * below.
+ */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
- /* Firstly reap the EXIT_ZOMBIE children we may have. */
+ /*
+ * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
+ * sys_wait4() will also block until our children traced from the
+ * parent namespace are detached and become EXIT_DEAD.
+ */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see free_pid().
+ * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * really care, we could reparent them to the global init. We could
+ * exit and reap ->child_reaper even if it is not the last thread in
+ * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * pid_ns can not go away until proc_kill_sb() drops the reference.
+ *
+ * But this ns can also have other tasks injected by setns()+fork().
+ * Again, ignoring the user visible semantics we do not really need
+ * to wait until they are all reaped, but they can be reparented to
+ * us and thus we need to ensure that pid->child_reaper stays valid
+ * until they all go away. See free_pid()->wake_up_process().
+ *
+ * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
+ * if reparented.
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static void *pidns_get(struct task_struct *task)
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
+static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
get_pid_ns(ns);
rcu_read_unlock();
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void pidns_put(void *ns)
+static void pidns_put(struct ns_common *ns)
{
- put_pid_ns(ns);
+ put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, void *ns)
+static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = ns;
+ struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
return 0;
}
-static unsigned int pidns_inum(void *ns)
-{
- struct pid_namespace *pid_ns = ns;
- return pid_ns->proc_inum;
-}
-
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
- .inum = pidns_inum,
};
static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..6e7708c2c21f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
+ select PM_RUNTIME
config PM_SLEEP_SMP
def_bool y
@@ -131,7 +132,6 @@ config PM_WAKELOCKS_GC
config PM_RUNTIME
bool "Run-time PM core functionality"
- depends on !IA64_HP_SIM
---help---
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states at run time (or autosuspended) after a specified
@@ -298,14 +298,9 @@ config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
-config PM_GENERIC_DOMAINS_RUNTIME
- def_bool y
- depends on PM_RUNTIME && PM_GENERIC_DOMAINS
-
config PM_GENERIC_DOMAINS_OF
def_bool y
depends on PM_GENERIC_DOMAINS && OF
config CPU_PM
bool
- depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <linux/ktime.h>
#include <trace/events/power.h>
#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
* @nr_pages: Number of memory pages processed between @start and @stop.
* @msg: Additional diagnostic message to print.
*/
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
- unsigned nr_pages, char *msg)
+void swsusp_show_speed(ktime_t start, ktime_t stop,
+ unsigned nr_pages, char *msg)
{
+ ktime_t diff;
u64 elapsed_centisecs64;
unsigned int centisecs;
unsigned int k;
unsigned int kps;
- elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
- /*
- * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
- * it is obvious enough for what went wrong.
- */
- do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ diff = ktime_sub(stop, start);
+ elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
struct timeval;
/* kernel/power/swsusp.c */
-extern void swsusp_show_speed(struct timeval *, struct timeval *,
- unsigned int, char *);
+extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/compiler.h>
+#include <linux/ktime.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
struct zone *zone;
unsigned long saveable, size, max_size, count, highmem, pages = 0;
unsigned long alloc, save_highmem, pages_highmem, avail_normal;
- struct timeval start, stop;
+ ktime_t start, stop;
int error;
printk(KERN_INFO "PM: Preallocating image memory... ");
- do_gettimeofday(&start);
+ start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
free_unnecessary_pages();
out:
- do_gettimeofday(&stop);
+ stop = ktime_get();
printk(KERN_CONT "done (allocated %lu pages)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Allocated");
+ swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/crc32.h>
+#include <linux/ktime.h>
#include "power.h"
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
size_t off;
unsigned thr, run_threads, nr_threads;
unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
out_finish:
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
{
unsigned int m;
int ret = 0;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
struct bio *bio;
int err2;
unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
return ret;
}
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int eof = 0;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
unsigned nr_pages;
size_t off;
unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
wait_event(crc->done, atomic_read(&crc->stop));
atomic_set(&crc->stop, 0);
}
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret) {
printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
}
}
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
kthread_stop(data[thr].thr);
vfree(data);
}
- if (page) vfree(page);
+ vfree(page);
return ret;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
-/* Deferred messaged from sched code are marked by this special level */
-#define SCHED_MESSAGE_LOGLEVEL -2
-
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-static int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, bool from_file)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
- static int saved_console_loglevel = -1;
+ static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
- if (saved_console_loglevel == -1)
+ if (saved_console_loglevel == LOGLEVEL_DEFAULT)
saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
/* Enable logging to console */
case SYSLOG_ACTION_CONSOLE_ON:
- if (saved_console_loglevel != -1) {
+ if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
console_loglevel = saved_console_loglevel;
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
}
break;
/* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
error = 0;
break;
/* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
int printed_len = 0;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
- static volatile unsigned int logbuf_cpu = UINT_MAX;
+ static unsigned int logbuf_cpu = UINT_MAX;
- if (level == SCHED_MESSAGE_LOGLEVEL) {
- level = -1;
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
in_sched = true;
}
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *end_of_header = printk_skip_level(text);
switch (kern_level) {
case '0' ... '7':
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
+ /* fallthrough */
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
}
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, -1, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
}
EXPORT_SYMBOL(printk_emit);
+int vprintk_default(const char *fmt, va_list args)
+{
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ r = vkdb_printf(fmt, args);
+ return r;
+ }
+#endif
+ r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(vprintk_default);
+
+/*
+ * This allows printk to be diverted to another function per cpu.
+ * This is useful for calling printk functions from within NMI
+ * without worrying about race conditions that can lock up the
+ * box.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
+ printk_func_t vprintk_func;
va_list args;
int r;
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
va_start(args, fmt);
- r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+
+ /*
+ * If a caller overrides the per_cpu printk_func, then it needs
+ * to disable preemption when calling printk(). Otherwise
+ * the printk_func should be set to the default. No need to
+ * disable preemption here.
+ */
+ vprintk_func = this_cpu_read(printk_func);
+ r = vprintk_func(fmt, args);
+
va_end(args);
return r;
@@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+/* Still needs to be defined for users */
+DEFINE_PER_CPU(printk_func_t, printk_func);
+
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
-void early_vprintk(const char *fmt, va_list ap)
-{
- if (early_console) {
- char buf[512];
- int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-
- early_console->write(early_console, buf, n);
- }
-}
-
asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
+ char buf[512];
+ int n;
+
+ if (!early_console)
+ return;
va_start(ap, fmt);
- early_vprintk(fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
+
+ early_console->write(early_console, buf, n);
}
#endif
@@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...)
preempt_disable();
va_start(args, fmt);
- r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/*
* Detach all tasks we were using ptrace on. Called with tasklist held
- * for writing, and returns with it held too. But note it can release
- * and reacquire the lock.
+ * for writing.
*/
-void exit_ptrace(struct task_struct *tracer)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
struct task_struct *p, *n;
- LIST_HEAD(ptrace_dead);
-
- if (likely(list_empty(&tracer->ptraced)))
- return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
if (__ptrace_detach(tracer, p))
- list_add(&p->ptrace_entry, &ptrace_dead);
- }
-
- write_unlock_irq(&tasklist_lock);
- BUG_ON(!list_empty(&tracer->ptraced));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
+ list_add(&p->ptrace_entry, dead);
}
-
- write_lock_irq(&tasklist_lock);
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * resource cgroups
- *
- * Copyright 2007 OpenVZ SWsoft Inc
- *
- * Author: Pavel Emelianov <xemul@openvz.org>
- *
- */
-
-#include <linux/types.h>
-#include <linux/parser.h>
-#include <linux/fs.h>
-#include <linux/res_counter.h>
-#include <linux/uaccess.h>
-#include <linux/mm.h>
-
-void res_counter_init(struct res_counter *counter, struct res_counter *parent)
-{
- spin_lock_init(&counter->lock);
- counter->limit = RES_COUNTER_MAX;
- counter->soft_limit = RES_COUNTER_MAX;
- counter->parent = parent;
-}
-
-static u64 res_counter_uncharge_locked(struct res_counter *counter,
- unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
-static int res_counter_charge_locked(struct res_counter *counter,
- unsigned long val, bool force)
-{
- int ret = 0;
-
- if (counter->usage + val > counter->limit) {
- counter->failcnt++;
- ret = -ENOMEM;
- if (!force)
- return ret;
- }
-
- counter->usage += val;
- if (counter->usage > counter->max_usage)
- counter->max_usage = counter->usage;
- return ret;
-}
-
-static int __res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at, bool force)
-{
- int ret, r;
- unsigned long flags;
- struct res_counter *c, *u;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val, force);
- spin_unlock(&c->lock);
- if (r < 0 && !ret) {
- ret = r;
- *limit_fail_at = c;
- if (!force)
- break;
- }
- }
-
- if (ret < 0 && !force) {
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, false);
-}
-
-int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, true);
-}
-
-u64 res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
-{
- unsigned long flags;
- struct res_counter *c;
- u64 ret = 0;
-
- local_irq_save(flags);
- for (c = counter; c != top; c = c->parent) {
- u64 r;
- spin_lock(&c->lock);
- r = res_counter_uncharge_locked(c, val);
- if (c == counter)
- ret = r;
- spin_unlock(&c->lock);
- }
- local_irq_restore(flags);
- return ret;
-}
-
-u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
-{
- return res_counter_uncharge_until(counter, NULL, val);
-}
-
-static inline unsigned long long *
-res_counter_member(struct res_counter *counter, int member)
-{
- switch (member) {
- case RES_USAGE:
- return &counter->usage;
- case RES_MAX_USAGE:
- return &counter->max_usage;
- case RES_LIMIT:
- return &counter->limit;
- case RES_FAILCNT:
- return &counter->failcnt;
- case RES_SOFT_LIMIT:
- return &counter->soft_limit;
- };
-
- BUG();
- return NULL;
-}
-
-ssize_t res_counter_read(struct res_counter *counter, int member,
- const char __user *userbuf, size_t nbytes, loff_t *pos,
- int (*read_strategy)(unsigned long long val, char *st_buf))
-{
- unsigned long long *val;
- char buf[64], *s;
-
- s = buf;
- val = res_counter_member(counter, member);
- if (read_strategy)
- s += read_strategy(*val, s);
- else
- s += sprintf(s, "%llu\n", *val);
- return simple_read_from_buffer((void __user *)userbuf, nbytes,
- pos, buf, s - buf);
-}
-
-#if BITS_PER_LONG == 32
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&counter->lock, flags);
- ret = *res_counter_member(counter, member);
- spin_unlock_irqrestore(&counter->lock, flags);
-
- return ret;
-}
-#else
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- return *res_counter_member(counter, member);
-}
-#endif
-
-int res_counter_memparse_write_strategy(const char *buf,
- unsigned long long *resp)
-{
- char *end;
- unsigned long long res;
-
- /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
- if (*buf == '-') {
- int rc = kstrtoull(buf + 1, 10, &res);
-
- if (rc)
- return rc;
- if (res != 1)
- return -EINVAL;
- *resp = RES_COUNTER_MAX;
- return 0;
- }
-
- res = memparse(buf, &end);
- if (*end != '\0')
- return -EINVAL;
-
- if (PAGE_ALIGN(res) >= res)
- res = PAGE_ALIGN(res);
- else
- res = RES_COUNTER_MAX;
-
- *resp = res;
-
- return 0;
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb398c0c5f08..b5797b78add6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ ppid = 0;
rcu_read_lock();
- ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
task_pid_nr(p), ppid,
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
}
EXPORT_SYMBOL_GPL(print_stack_trace);
+int snprint_stack_trace(char *buf, size_t size,
+ struct stack_trace *trace, int spaces)
+{
+ int i;
+ unsigned long ip;
+ int generated;
+ int total = 0;
+
+ if (WARN_ON(!trace->entries))
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ ip = trace->entries[i];
+ generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
+ 1 + spaces, ' ', (void *) ip, (void *) ip);
+
+ total += generated;
+
+ /* Assume that generated isn't a negative number */
+ if (generated >= size) {
+ buf += size;
+ size = 0;
+ } else {
+ buf += generated;
+ size -= generated;
+ }
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(snprint_stack_trace);
+
/*
* Architectures that do not implement save_stack_trace_tsk or
* save_stack_trace_regs get this weak alias and a once-per-bootup warning
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
/* mmu depending weak syscall entries */
cond_syscall(sys_mprotect);
@@ -224,3 +226,6 @@ cond_syscall(sys_seccomp);
/* access BPF programs and maps */
cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif
#ifdef CONFIG_KEXEC
{
@@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
{}
};
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, f.file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
/* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
- /* Add clocksource to the clcoksource list */
+ /* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1f4356037a7d..4d54b7540585 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}
/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 65015ff2f07c..6390517e77d4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -741,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n)
return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
+EXPORT_SYMBOL(nsecs_to_jiffies64);
/**
* nsecs_to_jiffies - Convert nsecs in u64 to jiffies
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
-ifeq ($(CONFIG_PM_RUNTIME),y)
+ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
endif
ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
-static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq,
- "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
- MAJOR(t->device), MINOR(t->device), iter->cpu,
- secs, nsec_rem, iter->ent->pid, act, rwbs);
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static int blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
}
-static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
{
const unsigned char *pdu_buf;
int pdu_len;
- int i, end, ret;
+ int i, end;
pdu_buf = pdu_start(ent);
pdu_len = te_blk_io_trace(ent)->pdu_len;
if (!pdu_len)
- return 1;
+ return;
/* find the last zero that needs to be printed */
for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
break;
end++;
- if (!trace_seq_putc(s, '('))
- return 0;
+ trace_seq_putc(s, '(');
for (i = 0; i < pdu_len; i++) {
- ret = trace_seq_printf(s, "%s%02x",
- i == 0 ? "" : " ", pdu_buf[i]);
- if (!ret)
- return ret;
+ trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
/*
* stop when the rest is just zeroes and indicate so
* with a ".." appended
*/
- if (i == end && end != pdu_len - 1)
- return trace_seq_puts(s, " ..) ");
+ if (i == end && end != pdu_len - 1) {
+ trace_seq_puts(s, " ..) ");
+ return;
+ }
}
- return trace_seq_puts(s, ") ");
+ trace_seq_puts(s, ") ");
}
-static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = trace_seq_printf(s, "%u ", t_bytes(ent));
- if (!ret)
- return 0;
- ret = blk_log_dump_pdu(s, ent);
- if (!ret)
- return 0;
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "%u ", t_bytes(ent));
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%s]\n",
+ trace_seq_printf(s, "%llu + %u [%s]\n",
t_sector(ent), t_sec(ent), cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ else
+ trace_seq_printf(s, "[%s]\n", cmd);
}
}
-static int blk_log_with_error(struct trace_seq *s,
+static void blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = blk_log_dump_pdu(s, ent);
- if (ret)
- return trace_seq_printf(s, "[%d]\n", t_error(ent));
- return 0;
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%d]\n",
- t_sector(ent),
- t_sec(ent), t_error(ent));
- return trace_seq_printf(s, "%llu [%d]\n",
- t_sector(ent), t_error(ent));
+ trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ else
+ trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
}
}
-static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
{
struct blk_io_trace_remap r = { .device_from = 0, };
get_pdu_remap(ent, &r);
- return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
- t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
}
-static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "[%s]\n", cmd);
}
-static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
}
-static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
}
-static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
{
- int ret;
const struct blk_io_trace *t = te_blk_io_trace(ent);
- ret = trace_seq_putmem(s, t + 1, t->pdu_len);
- if (ret)
- return trace_seq_putc(s, '\n');
- return ret;
+ trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putc(s, '\n');
}
/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- int (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
- int ret;
bool long_act;
blk_log_action_t *log_action;
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
- ret = log_action(iter, long_act ? "message" : "m");
- if (ret)
- ret = blk_log_msg(s, iter->ent);
- goto out;
+ log_action(iter, long_act ? "message" : "m");
+ blk_log_msg(s, iter->ent);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
- ret = trace_seq_printf(s, "Unknown action %x\n", what);
+ trace_seq_printf(s, "Unknown action %x\n", what);
else {
- ret = log_action(iter, what2act[what].act[long_act]);
- if (ret)
- ret = what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act]);
+ what2act[what].print(s, iter->ent);
}
-out:
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
return print_one_line(iter, false);
}
-static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
.time = iter->ts,
};
- if (!trace_seq_putmem(s, &old, offset))
- return 0;
- return trace_seq_putmem(s, &t->sector,
- sizeof(old) - offset + t->pdu_len);
+ trace_seq_putmem(s, &old, offset);
+ trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
}
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- return blk_trace_synthesize_old_trace(iter) ?
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ blk_trace_synthesize_old_trace(iter);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints();
- spin_lock_irq(&running_trace_lock);
- list_del(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt);
return 0;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31c90fec4158..929a733d302e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
return ret;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
static int __register_ftrace_function(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
+ /* The control_ops needs the trampoline update */
+ ops = &control_ops;
} else
add_ftrace_ops(&ftrace_ops_list, ops);
+ ftrace_update_trampoline(ops);
+
if (ftrace_enabled)
update_ftrace_function();
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " Function "
- "Hit Time Avg s^2\n"
- " -------- "
- "--- ---- --- ---\n");
+ seq_puts(m, " Function "
+ "Hit Time Avg s^2\n"
+ " -------- "
+ "--- ---- --- ---\n");
#else
- seq_printf(m, " Function Hit\n"
- " -------- ---\n");
+ seq_puts(m, " Function Hit\n"
+ " -------- ---\n");
#endif
return 0;
}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " ");
+ seq_puts(m, " ");
avg = rec->time;
do_div(avg, rec->counter);
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
FTRACE_OPS_FL_INITIALIZED,
};
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ struct ftrace_ops *op;
+ bool ret = false;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they are freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This is to check for dynamically allocated trampolines.
+ * Trampolines that are in kernel text will have
+ * core_kernel_text() return true.
+ */
+ if (op->trampoline && op->trampoline_size)
+ if (addr >= op->trampoline &&
+ addr < op->trampoline + op->trampoline_size) {
+ ret = true;
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash);
+
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
+ int ret;
int i;
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
/*
* If the new source is empty, just free dst and assign it
* the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
update:
+ /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
+ if (enable) {
+ /* IPMODIFY should be updated only when filter_hash updating */
+ ret = ftrace_hash_ipmodify_update(ops, new_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return ret;
+ }
+ }
+
/*
* Remove the current set, update the hash and add
* them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
ftrace_hash_rec_update_modify(ops, filter_hash, 1);
}
+/*
+ * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
+ * or no-needed to update, -EBUSY if it detects a conflict of the flag
+ * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * Note that old_hash and new_hash has below meanings
+ * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
+ * - If the hash is EMPTY_HASH, it hits nothing
+ * - Anything else hits the recs which match the hash entries.
+ */
+static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec, *end = NULL;
+ int in_old, in_new;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ /*
+ * Since the IPMODIFY is a very address sensitive action, we do not
+ * allow ftrace_ops to set all functions to new hash.
+ */
+ if (!new_hash || !old_hash)
+ return -EINVAL;
+
+ /* Update rec->flags */
+ do_for_each_ftrace_rec(pg, rec) {
+ /* We need to update only differences of filter_hash */
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new) {
+ /* New entries must ensure no others are using it */
+ if (rec->flags & FTRACE_FL_IPMODIFY)
+ goto rollback;
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } else /* Removed entry */
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+ return 0;
+
+rollback:
+ end = rec;
+
+ /* Roll back what we did above */
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec == end)
+ goto err_out;
+
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new)
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ else
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+err_out:
+ return -EBUSY;
+}
+
+static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+}
+
+/* Disabling always succeeds */
+static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+}
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(old_hash))
+ old_hash = NULL;
+
+ if (ftrace_hash_empty(new_hash))
+ new_hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+}
+
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+
/**
* ftrace_bug - report and shutdown function tracer
* @failed: The failed type (EFAULT, EINVAL, EPERM)
- * @ip: The address that failed
+ * @rec: The record that failed
*
* The arch code that enables or disables the function tracing
* can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
* EINVAL - if what is read at @ip is not what was expected
* EPERM - if the problem happens on writting to the @ip address
*/
-void ftrace_bug(int failed, unsigned long ip)
+void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
+ unsigned long ip = rec ? rec->ip : 0;
+
switch (failed) {
case -EFAULT:
FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ if (rec) {
+ struct ftrace_ops *ops = NULL;
+
+ pr_info("ftrace record flags: %lx\n", rec->flags);
+ pr_cont(" (%ld)%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ pr_cont("\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ pr_cont("\ttramp: ERROR!");
+
+ }
+ ip = ftrace_get_addr_curr(rec);
+ pr_cont(" expected tramp: %lx\n", ip);
+ }
}
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
do_for_each_ftrace_rec(pg, rec) {
failed = __ftrace_replace_code(rec, enable);
if (failed) {
- ftrace_bug(failed, rec->ip);
+ ftrace_bug(failed, rec);
/* Stop processing */
return;
}
@@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
- unsigned long ip;
int ret;
- ip = rec->ip;
-
if (unlikely(ftrace_disabled))
return 0;
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
- ftrace_bug(ret, ip);
+ ftrace_bug(ret, rec);
return 0;
}
return 1;
@@ -2320,6 +2509,10 @@ static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
+void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+}
+
static void control_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
@@ -2369,6 +2562,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
*/
ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
+ ret = ftrace_hash_ipmodify_enable(ops);
+ if (ret < 0) {
+ /* Rollback registration process */
+ __unregister_ftrace_function(ops);
+ ftrace_start_up--;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ return ret;
+ }
+
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
@@ -2397,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
+ /* Disabling ipmodify never fails */
+ ftrace_hash_ipmodify_disable(ops);
ftrace_hash_rec_disable(ops, 1);
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2471,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
schedule_on_each_cpu(ftrace_sync);
+ arch_ftrace_trampoline_free(ops);
+
if (ops->flags & FTRACE_OPS_FL_CONTROL)
control_ops_free(ops);
}
@@ -2623,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
- ftrace_bug(failed, p->ip);
+ ftrace_bug(failed, p);
}
}
}
@@ -2948,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&ftrace_lock);
}
+void * __weak
+arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return NULL;
+}
+
+static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ void *ptr;
+
+ ptr = arch_ftrace_trampoline_func(ops, rec);
+ if (ptr)
+ seq_printf(m, " ->%pS", ptr);
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -2958,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
else
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2971,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- seq_printf(m, " (%ld)%s",
+ struct ftrace_ops *ops = NULL;
+
+ seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
- struct ftrace_ops *ops;
-
ops = ftrace_find_tramp_ops_any(rec);
if (ops)
seq_printf(m, "\ttramp: %pS",
(void *)ops->trampoline);
else
- seq_printf(m, "\ttramp: ERROR!");
+ seq_puts(m, "\ttramp: ERROR!");
+
}
+ add_trampoline_func(m, ops, rec);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -3020,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
if (iter) {
iter->pg = ftrace_pages_start;
@@ -3975,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static unsigned long save_global_trampoline;
+static unsigned long save_global_flags;
+
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4183,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v)
struct ftrace_graph_data *fgd = m->private;
if (fgd->table == ftrace_graph_funcs)
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
else
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4696,6 +4921,32 @@ void __init ftrace_init(void)
ftrace_disabled = 1;
}
+/* Do nothing if arch does not support this */
+void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+
+/*
+ * Currently there's no safe way to free a trampoline when the kernel
+ * is configured with PREEMPT. That is because a task could be preempted
+ * when it jumped to the trampoline, it may be preempted for a long time
+ * depending on the system load, and currently there's no way to know
+ * when it will be off the trampoline. If the trampoline is freed
+ * too early, when the task runs again, it will be executing on freed
+ * memory and crash.
+ */
+#ifdef CONFIG_PREEMPT
+ /* Currently, only non dynamic ops can have a trampoline */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ return;
+#endif
+
+ arch_ftrace_update_trampoline(ops);
+}
+
#else
static struct ftrace_ops global_ops = {
@@ -4738,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 1;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5075,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v)
const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
if (v == (void *)1) {
- seq_printf(m, "no pid\n");
+ seq_puts(m, "no pid\n");
return 0;
}
if (fpid->pid == ftrace_swapper_pid)
- seq_printf(m, "swapper tasks\n");
+ seq_puts(m, "swapper tasks\n");
else
seq_printf(m, "%u\n", pid_vnr(fpid->pid));
@@ -5293,6 +5548,7 @@ static struct ftrace_ops graph_ops = {
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
#endif
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
@@ -5522,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
update_function_graph_func();
ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-
out:
mutex_unlock(&ftrace_lock);
return ret;
@@ -5543,6 +5798,17 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * Function graph does not allocate the trampoline, but
+ * other global_ops do. We need to reset the ALLOC_TRAMP flag
+ * if one was used.
+ */
+ global_ops.trampoline = save_global_trampoline;
+ if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
+ global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+#endif
+
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a56e07c8d15b..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
*/
int ring_buffer_print_entry_header(struct trace_seq *s)
{
- int ret;
-
- ret = trace_seq_puts(s, "# compressed entry header\n");
- ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
- ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
- ret = trace_seq_puts(s, "\tarray : 32 bits\n");
- ret = trace_seq_putc(s, '\n');
- ret = trace_seq_printf(s, "\tpadding : type == %d\n",
- RINGBUF_TYPE_PADDING);
- ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
- RINGBUF_TYPE_TIME_EXTEND);
- ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
- RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ trace_seq_puts(s, "# compressed entry header\n");
+ trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ trace_seq_puts(s, "\tarray : 32 bits\n");
+ trace_seq_putc(s, '\n');
+ trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
/*
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
- int ret;
-
- ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
- return ret;
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
+
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
}
struct rb_irq_work {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92f4a6cee172..2e767972e99c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
*/
bool __read_mostly tracing_selftest_disabled;
+/* Pipe tracepoints to printk */
+struct trace_iterator *tracepoint_print_iter;
+int tracepoint_printk;
+
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
{ }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
static int __init stop_trace_on_warning(char *str)
{
- __disable_trace_on_warning = 1;
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ __disable_trace_on_warning = 1;
return 1;
}
-__setup("traceoff_on_warning=", stop_trace_on_warning);
+__setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
}
__setup("trace_clock=", set_trace_boot_clock);
+static int __init set_tracepoint_printk(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ tracepoint_printk = 1;
+ return 1;
+}
+__setup("tp_printk", set_tracepoint_printk);
unsigned long long ns2usecs(cycle_t nsec)
{
@@ -938,19 +950,20 @@ out:
return ret;
}
+/* TODO add a seq_buf_to_buffer() */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (s->len <= s->readpos)
+ if (trace_seq_used(s) <= s->seq.readpos)
return -EBUSY;
- len = s->len - s->readpos;
+ len = trace_seq_used(s) - s->seq.readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->readpos, cnt);
+ memcpy(buf, s->buffer + s->seq.readpos, cnt);
- s->readpos += cnt;
+ s->seq.readpos += cnt;
return cnt;
}
@@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void)
pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
pr_warning("** **\n");
pr_warning("** This means that this is a DEBUG kernel and it is **\n");
- pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** unsafe for production use. **\n");
pr_warning("** **\n");
pr_warning("** If you see this message and you are not debugging **\n");
pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
goto out;
}
- len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- if (len > TRACE_BUF_SIZE)
- goto out;
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
@@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, tbuffer, len);
- entry->buf[len] = '\0';
+ memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | | |\n");
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
+ "# | | | | |\n");
}
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n");
- seq_puts(m, "# / _----=> need-resched\n");
- seq_puts(m, "# | / _---=> hardirq/softirq\n");
- seq_puts(m, "# || / _--=> preempt-depth\n");
- seq_puts(m, "# ||| / delay\n");
- seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | |||| | |\n");
+ seq_puts(m, "# _-----=> irqs-off\n"
+ "# / _----=> need-resched\n"
+ "# | / _---=> hardirq/softirq\n"
+ "# || / _--=> preempt-depth\n"
+ "# ||| / delay\n"
+ "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
+ "# | | | |||| | |\n");
}
void
@@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- if (!trace_print_lat_context(iter))
- goto partial;
- } else {
- if (!trace_print_context(iter))
- goto partial;
- }
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ trace_print_lat_context(iter);
+ else
+ trace_print_context(iter);
}
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
if (event)
return event->funcs->trace(iter, sym_flags, event);
- if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (!trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, iter->ts))
- goto partial;
- }
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
event = ftrace_find_event(entry->type);
if (event)
return event->funcs->raw(iter, 0, event);
- if (!trace_seq_printf(s, "%d ?\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "%d ?\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+ SEQ_PUT_HEX_FIELD(s, entry->pid);
+ SEQ_PUT_HEX_FIELD(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
return ret;
}
- SEQ_PUT_FIELD_RET(s, newline);
+ SEQ_PUT_FIELD(s, newline);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
- SEQ_PUT_FIELD_RET(s, iter->ts);
+ SEQ_PUT_FIELD(s, entry->pid);
+ SEQ_PUT_FIELD(s, iter->cpu);
+ SEQ_PUT_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
- if (iter->lost_events &&
- !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events))
- return TRACE_TYPE_PARTIAL_LINE;
+ if (iter->lost_events) {
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
@@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
{
if (!ftrace_is_dead())
return;
- seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
}
#ifdef CONFIG_TRACER_MAX_TRACE
static void show_snapshot_main_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer.\n");
- seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void show_snapshot_percpu_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
#else
- seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
- seq_printf(m, "# Must use main snapshot file to allocate.\n");
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
#endif
- seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
if (iter->tr->allocated_snapshot)
- seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
else
- seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
- seq_printf(m, "# Snapshot commands:\n");
+ seq_puts(m, "# Snapshot commands:\n");
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
show_snapshot_main_help(m);
else
@@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
if (!t)
return 0;
- seq_printf(m, "%s", t->name);
+ seq_puts(m, t->name);
if (t->next)
seq_putc(m, ' ');
else
@@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
goto out;
}
+ trace_seq_init(&iter->seq);
+
/*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
@@ -4507,18 +4520,18 @@ waitagain:
trace_access_lock(iter->cpu_file);
while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
- int len = iter->seq.len;
+ int save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
/* don't print partial lines */
- iter->seq.len = len;
+ iter->seq.seq.len = save_len;
break;
}
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
- if (iter->seq.len >= cnt)
+ if (trace_seq_used(&iter->seq) >= cnt)
break;
/*
@@ -4534,7 +4547,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.readpos >= iter->seq.len)
+ if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -4568,20 +4581,33 @@ static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
size_t count;
+ int save_len;
int ret;
/* Seq buffer is page-sized, exactly what we need. */
for (;;) {
- count = iter->seq.len;
+ save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
- count = iter->seq.len - count;
- if (rem < count) {
- rem = 0;
- iter->seq.len -= count;
+
+ if (trace_seq_has_overflowed(&iter->seq)) {
+ iter->seq.seq.len = save_len;
break;
}
+
+ /*
+ * This should not be hit, because it should only
+ * be set if the iter->seq overflowed. But check it
+ * anyway to be safe.
+ */
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- iter->seq.len -= count;
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ count = trace_seq_used(&iter->seq) - save_len;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.seq.len = save_len;
break;
}
@@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- iter->seq.len);
+ trace_seq_used(&iter->seq));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = iter->seq.len;
+ spd.partial[i].len = trace_seq_used(&iter->seq);
trace_seq_init(&iter->seq);
}
@@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "read events: %ld\n", cnt);
- count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+ count = simple_read_from_buffer(ubuf, count, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%ps:", (void *)ip);
- seq_printf(m, "snapshot");
+ seq_puts(m, "snapshot");
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
@@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6631,11 +6658,19 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= TRACE_MAX_PRINT)
- s->len = TRACE_MAX_PRINT;
+ if (s->seq.len >= TRACE_MAX_PRINT)
+ s->seq.len = TRACE_MAX_PRINT;
+
+ /*
+ * More paranoid code. Although the buffer size is set to
+ * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
+ * an extra layer of protection.
+ */
+ if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
+ s->seq.len = s->seq.size - 1;
/* should be zero ended, but we are paranoid. */
- s->buffer[s->len] = 0;
+ s->buffer[s->seq.len] = 0;
printk(KERN_TRACE "%s", s->buffer);
@@ -6874,6 +6909,19 @@ out:
return ret;
}
+void __init trace_init(void)
+{
+ if (tracepoint_printk) {
+ tracepoint_print_iter =
+ kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ if (WARN_ON(!tracepoint_print_iter))
+ tracepoint_printk = 0;
+ }
+ tracer_alloc_buffers();
+ init_ftrace_syscalls();
+ trace_event_init();
+}
+
__init static int clear_boot_tracer(void)
{
/*
@@ -6893,6 +6941,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-early_initcall(tracer_alloc_buffers);
fs_initcall(tracer_init_debugfs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
#include <linux/compiler.h>
+#include <linux/trace_seq.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc);
-
-void tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *cur,
- unsigned long flags, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
-void tracing_sched_switch_assign_trace(struct trace_array *tr);
-void tracing_stop_sched_switch_record(void);
-void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
+extern char trace_find_mark(unsigned long long duration);
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
-extern enum print_line_t
+extern void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
#define perf_ftrace_event_register NULL
#endif
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+#else
+static inline void __init trace_event_init(void) { }
+#endif
+
+extern struct trace_iterator *tracepoint_print_iter;
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+
+ return trace_handle_return(&iter->seq);
}
static void branch_print_header(struct seq_file *s)
{
seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
- " FUNC:FILE:LINE\n");
- seq_puts(s, "# | | | | | "
- " |\n");
+ " FUNC:FILE:LINE\n"
+ "# | | | | | "
+ " |\n");
}
static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
static int annotated_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " correct incorrect % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
- seq_printf(m, " X ");
+ seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
static int all_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " miss hit %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " miss hit % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..366a78a3e61e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
}
EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+
+static void output_printk(struct ftrace_event_buffer *fbuffer)
+{
+ struct ftrace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ if (!iter)
+ return;
+
+ event_call = fbuffer->ftrace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->ftrace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
{
+ if (tracepoint_printk)
+ output_printk(fbuffer);
+
event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dir->d_subdirs, d_child) {
if (child->d_inode) /* probably unneeded */
child->d_inode->i_private = NULL;
}
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
- seq_printf(m, "format:\n");
+ seq_puts(m, "format:\n");
return 0;
case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_unlock(&event_mutex);
if (file)
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
print_subsystem_event_filter(system, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
func(s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
ftrace_event_name(data->file->event_call));
if (data->count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", data->count);
@@ -2477,8 +2512,14 @@ static __init int event_trace_init(void)
#endif
return 0;
}
-early_initcall(event_trace_memsetup);
-core_initcall(event_trace_enable);
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
OP_GT,
OP_GE,
OP_BAND,
+ OP_NOT,
OP_NONE,
OP_OPEN_PAREN,
};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
{ OP_GT, ">", 5 },
{ OP_GE, ">=", 5 },
{ OP_BAND, "&", 6 },
+ { OP_NOT, "!", 6 },
{ OP_NONE, "OP_NONE", 0 },
{ OP_OPEN_PAREN, "(", 0 },
};
@@ -85,6 +87,7 @@ enum {
FILT_ERR_MISSING_FIELD,
FILT_ERR_INVALID_FILTER,
FILT_ERR_IP_FIELD_ONLY,
+ FILT_ERR_ILLEGAL_NOT_OP,
};
static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
"Missing field name and/or value",
"Meaningless filter expression",
"Only 'ip' field is supported for function trace",
+ "Illegal use of '!'",
};
struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
int index;
};
+/* If not of not match is equal to not of not, then it is a match */
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
break; \
} \
\
- return match; \
+ return !!match == !pred->not; \
}
#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
if (!WARN_ON_ONCE(!pred->fn))
match = pred->fn(pred, rec);
if (!!match == type)
- return match;
+ break;
}
- return match;
+ /* If not of not match is equal to not of not, then it is a match */
+ return !!match == !op->not;
}
struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
* then this op can be folded.
*/
if (left->index & FILTER_PRED_FOLD &&
- (left->op == dest->op ||
+ ((left->op == dest->op && !left->not) ||
left->left == FILTER_PRED_INVALID) &&
right->index & FILTER_PRED_FOLD &&
- (right->op == dest->op ||
+ ((right->op == dest->op && !right->not) ||
right->left == FILTER_PRED_INVALID))
dest->index |= FILTER_PRED_FOLD;
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
}
if (pred->op == OP_NE)
- pred->not = 1;
+ pred->not ^= 1;
pred->fn = fn;
return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
continue;
}
+ if (elt->op == OP_NOT) {
+ if (!n_preds || operand1 || operand2) {
+ parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ if (!dry_run)
+ filter->preds[n_preds - 1].not ^= 1;
+ continue;
+ }
+
if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
err = -ENOSPC;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
{
long count = (long)data;
- seq_printf(m, "%s", name);
+ seq_puts(m, name);
if (count == -1)
seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
if (filter_str)
seq_printf(m, " if %s\n", filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
if (data->filter_str)
seq_printf(m, " if %s\n", data->filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static int update_count(void **data)
+static void update_traceon_count(void **data, bool on)
{
- unsigned long *count = (long *)data;
+ long *count = (long *)data;
+ long old_count = *count;
- if (!*count)
- return 0;
+ /*
+ * Tracing gets disabled (or enabled) once per count.
+ * This function can be called at the same time on multiple CPUs.
+ * It is fine if both disable (or enable) tracing, as disabling
+ * (or enabling) the second time doesn't do anything as the
+ * state of the tracer is already disabled (or enabled).
+ * What needs to be synchronized in this case is that the count
+ * only gets decremented once, even if the tracer is disabled
+ * (or enabled) twice, as the second one is really a nop.
+ *
+ * The memory barriers guarantee that we only decrement the
+ * counter once. First the count is read to a local variable
+ * and a read barrier is used to make sure that it is loaded
+ * before checking if the tracer is in the state we want.
+ * If the tracer is not in the state we want, then the count
+ * is guaranteed to be the old count.
+ *
+ * Next the tracer is set to the state we want (disabled or enabled)
+ * then a write memory barrier is used to make sure that
+ * the new state is visible before changing the counter by
+ * one minus the old counter. This guarantees that another CPU
+ * executing this code will see the new state before seeing
+ * the new counter value, and would not do anything if the new
+ * counter is seen.
+ *
+ * Note, there is no synchronization between this and a user
+ * setting the tracing_on file. But we currently don't care
+ * about that.
+ */
+ if (!old_count)
+ return;
- if (*count != -1)
- (*count)--;
+ /* Make sure we see count before checking tracing state */
+ smp_rmb();
- return 1;
+ if (on == !!tracing_is_on())
+ return;
+
+ if (on)
+ tracing_on();
+ else
+ tracing_off();
+
+ /* unlimited? */
+ if (old_count == -1)
+ return;
+
+ /* Make sure tracing state is visible before updating count */
+ smp_wmb();
+
+ *count = old_count - 1;
}
static void
ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_on();
+ update_traceon_count(data, 1);
}
static void
ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_off();
+ update_traceon_count(data, 0);
}
static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
static void
ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
+ long *count = (long *)data;
+ long old_count;
+ long new_count;
- if (update_count(data))
- trace_dump_stack(STACK_SKIP);
+ /*
+ * Stack traces should only execute the number of times the
+ * user specified in the counter.
+ */
+ do {
+
+ if (!tracing_is_on())
+ return;
+
+ old_count = *count;
+
+ if (!old_count)
+ return;
+
+ /* unlimited? */
+ if (old_count == -1) {
+ trace_dump_stack(STACK_SKIP);
+ return;
+ }
+
+ new_count = old_count - 1;
+ new_count = cmpxchg(count, old_count, new_count);
+ if (new_count == old_count)
+ trace_dump_stack(STACK_SKIP);
+
+ } while (new_count != old_count);
+}
+
+static int update_count(void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return 0;
+
+ if (*count != -1)
+ (*count)--;
+
+ return 1;
}
static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
};
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags);
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
static int max_bytes_for_cpu;
-static enum print_line_t
-print_graph_cpu(struct trace_seq *s, int cpu)
+static void print_graph_cpu(struct trace_seq *s, int cpu)
{
- int ret;
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
}
#define TRACE_GRAPH_PROCINFO_LENGTH 14
-static enum print_line_t
-print_graph_proc(struct trace_seq *s, pid_t pid)
+static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
- int ret;
int len;
int i;
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
/* First spaces to align center */
- for (i = 0; i < spaces / 2; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < spaces / 2; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s-%s", comm, pid_str);
/* Last spaces to align center */
- for (i = 0; i < spaces - (spaces / 2); i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = 0; i < spaces - (spaces / 2); i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
-print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- if (!trace_seq_putc(s, ' '))
- return 0;
-
- return trace_print_lat_fmt(s, entry);
+ trace_seq_putc(s, ' ');
+ trace_print_lat_fmt(s, entry);
}
/* If the pid changed since the last trace, output this event */
-static enum print_line_t
+static void
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
- int ret;
if (!data)
- return TRACE_TYPE_HANDLED;
+ return;
last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
if (*last_pid == pid)
- return TRACE_TYPE_HANDLED;
+ return;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
- return TRACE_TYPE_HANDLED;
+ return;
/*
* Context-switch trace line:
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
------------------------------------------
*/
- ret = trace_seq_puts(s,
- " ------------------------------------------\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, prev_pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " => ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s,
- "\n ------------------------------------------\n\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " ------------------------------------------\n");
+ print_graph_cpu(s, cpu);
+ print_graph_proc(s, prev_pid);
+ trace_seq_puts(s, " => ");
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, "\n ------------------------------------------\n\n");
}
static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
return next;
}
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
+static void print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
- return trace_seq_printf(s, "%5lu.%06lu | ",
- (unsigned long)t, usecs_rem);
+ trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
}
-static enum print_line_t
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
- int ret;
struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
- return TRACE_TYPE_UNHANDLED;
+ return;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, " | ");
}
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
}
/* No overhead */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
- ret = trace_seq_puts(s, "==========>");
+ trace_seq_puts(s, "==========>");
else
- ret = trace_seq_puts(s, "<==========");
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "<==========");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ trace_seq_putc(s, '\n');
}
-enum print_line_t
+void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
- char msecs_str[21];
+ char usecs_str[21];
char nsecs_str[5];
- int ret, len;
+ int len;
int i;
- sprintf(msecs_str, "%lu", (unsigned long) duration);
+ sprintf(usecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
- ret = trace_seq_printf(s, "%s", msecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s", usecs_str);
- len = strlen(msecs_str);
+ len = strlen(usecs_str);
/* Print nsecs (we don't want to exceed 7 numbers) */
if (len < 7) {
size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
- ret = trace_seq_printf(s, ".%s", nsecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, ".%s", nsecs_str);
len += strlen(nsecs_str);
}
- ret = trace_seq_puts(s, " us ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = len; i < 7; i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags)
{
- int ret = -1;
-
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return TRACE_TYPE_HANDLED;
+ return;
/* No real adata, just filling the column with spaces */
switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
case FLAGS_FILL_FULL:
- ret = trace_seq_puts(s, " | ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " | ");
+ return;
case FLAGS_FILL_START:
- ret = trace_seq_puts(s, " ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " ");
+ return;
case FLAGS_FILL_END:
- ret = trace_seq_puts(s, " |");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " |");
+ return;
}
/* Signal a overhead of time execution to the output */
- if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- ret = trace_seq_puts(s, "! ");
- /* Duration exceeded 10 msecs */
- else if (duration > 10000ULL)
- ret = trace_seq_puts(s, "+ ");
- }
-
- /*
- * The -1 means we either did not exceed the duration tresholds
- * or we dont want to print out the overhead. Either way we need
- * to fill out the space.
- */
- if (ret == -1)
- ret = trace_seq_puts(s, " ");
-
- /* Catching here any failure happenned above */
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_print_graph_duration(duration, s);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_puts(s, "| ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
+ trace_seq_printf(s, "%c ", trace_find_mark(duration));
+ else
+ trace_seq_puts(s, " ");
- return TRACE_TYPE_HANDLED;
+ trace_print_graph_duration(duration, s);
+ trace_seq_puts(s, "| ");
}
/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- int ret;
int i;
graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
- int ret;
int i;
if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- if (!ret)
+ if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
/*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_NO_CONSUME;
}
-static enum print_line_t
+static void
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
- int ret;
/* Pid */
- if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ verif_pid(s, ent->pid, cpu, data);
- if (type) {
+ if (type)
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return 0;
+ return;
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, ent->pid);
+ trace_seq_puts(s, " | ");
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
- ret = print_graph_lat_fmt(s, ent);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
- return 0;
+ return;
}
/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
- int ret;
int i;
if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
}
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Closing brace */
- for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/*
* If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* belongs to, write out the function name. Always do
* that if the funcgraph-tail option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
- ret = trace_seq_puts(s, "}\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- } else {
- ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
/* Overrun */
- if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
- ret = trace_seq_printf(s, " (Overruns: %lu)\n",
- trace->overrun);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN)
+ trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
- cpu, pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (data)
depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
- for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/* The comment */
- ret = trace_seq_puts(s, "/* ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return ret;
}
+ if (trace_seq_has_overflowed(s))
+ goto out;
+
/* Strip ending newline */
- if (s->buffer[s->len - 1] == '\n') {
- s->buffer[s->len - 1] = '\0';
- s->len--;
+ if (s->buffer[s->seq.len - 1] == '\n') {
+ s->buffer[s->seq.len - 1] = '\0';
+ s->seq.len--;
}
- ret = trace_seq_puts(s, " */\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " */\n");
+ out:
+ return trace_handle_return(s);
}
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
print_lat_header(s, flags);
/* 1st line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " TIME ");
+ seq_puts(s, " TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " CPU");
+ seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " TASK/PID ");
+ seq_puts(s, " TASK/PID ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " DURATION ");
- seq_printf(s, " FUNCTION CALLS\n");
+ seq_puts(s, " DURATION ");
+ seq_puts(s, " FUNCTION CALLS\n");
/* 2nd line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " | | ");
+ seq_puts(s, " | | ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " | | ");
- seq_printf(s, " | | | |\n");
+ seq_puts(s, " | | ");
+ seq_puts(s, " | | | |\n");
}
static void print_graph_headers(struct seq_file *s)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..b0b1c44e923a 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
- if (!trace_empty(&iter))
- trace_find_next_entry_inc(&iter);
- while (!trace_empty(&iter)) {
+
+ while (trace_find_next_entry_inc(&iter)) {
if (!cnt)
kdb_printf("---------------------------------\n");
cnt++;
- if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ if (!skip_lines) {
print_trace_line(&iter);
- if (!skip_lines)
trace_printk_seq(&iter.seq);
- else
+ } else {
skip_lines--;
+ }
+
if (KDB_FLAG(CMD_INTERRUPT))
goto out;
}
@@ -86,9 +88,12 @@ out:
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- for_each_tracing_cpu(cpu)
- if (iter.buffer_iter[cpu])
+ for_each_tracing_cpu(cpu) {
+ if (iter.buffer_iter[cpu]) {
ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ iter.buffer_iter[cpu] = NULL;
+ }
+ }
}
/*
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
int i;
- seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
ftrace_event_name(&tk->tp.call));
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tk->tp.nr_args; i++)
seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
-
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ goto out;
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, " <- "))
- goto partial;
+ trace_seq_puts(s, " <- ");
if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
mmio_reset_data(tr);
}
-static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
{
- int ret = 0;
int i;
resource_size_t start, end;
const struct pci_driver *drv = pci_dev_driver(dev);
- /* XXX: incomplete checks for trace_seq_printf() return value */
- ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
- dev->bus->number, dev->devfn,
- dev->vendor, dev->device, dev->irq);
+ trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
/*
* XXX: is pci_resource_to_user() appropriate, since we are
* supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
*/
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
}
if (drv)
- ret += trace_seq_printf(s, " %s\n", drv->name);
+ trace_seq_printf(s, " %s\n", drv->name);
else
- ret += trace_seq_puts(s, " \n");
- return ret;
+ trace_seq_puts(s, " \n");
}
static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret = 1;
trace_assign_type(field, entry);
rw = &field->rw;
switch (rw->opcode) {
case MMIO_READ:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
(rw->value >> 0) & 0xff, rw->pc, 0);
break;
default:
- ret = trace_seq_puts(s, "rw what?\n");
+ trace_seq_puts(s, "rw what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
trace_assign_type(field, entry);
m = &field->map;
switch (m->opcode) {
case MMIO_PROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
- ret = trace_seq_puts(s, "map what?\n");
+ trace_seq_puts(s, "map what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bputs_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bprint_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_bprintf(s, field->fmt, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct print_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
-
+
trace_seq_putc(p, 0);
return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- int ret;
event = container_of(trace_event, struct ftrace_event_call, event);
entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- return 0;
+ return trace_handle_return(s);
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
- int ret;
-
- ret = trace_seq_printf(s, "%s: ", name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_vprintf(s, fmt, ap);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", name);
+ trace_seq_vprintf(s, fmt, ap);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
}
#endif /* CONFIG_KRETPROBES */
-static int
+static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
-static int
+static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
sprint_symbol(str, address);
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (file) {
ret = trace_seq_path(s, &file->f_path);
if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]",
- ip - vmstart);
+ trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
}
up_read(&mm->mmap_sem);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return !trace_seq_has_overflowed(s);
}
int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
unsigned long sym_flags)
{
struct mm_struct *mm = NULL;
- int ret = 1;
unsigned int i;
if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
unsigned long ip = entry->caller[i];
- if (ip == ULONG_MAX || !ret)
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
break;
- if (ret)
- ret = trace_seq_puts(s, " => ");
+
+ trace_seq_puts(s, " => ");
+
if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- if (ret)
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
continue;
}
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- ret = trace_seq_putc(s, '\n');
+
+ seq_print_user_ip(s, mm, ip, sym_flags);
+ trace_seq_putc(s, '\n');
}
if (mm)
mmput(mm);
- return ret;
+
+ return !trace_seq_has_overflowed(s);
}
int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
- int ret;
-
- if (!ip)
- return trace_seq_putc(s, '0');
+ if (!ip) {
+ trace_seq_putc(s, '0');
+ goto out;
+ }
if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
+ seq_print_sym_offset(s, "%s", ip);
else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
+ seq_print_sym_short(s, "%s", ip);
if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+
+ out:
+ return !trace_seq_has_overflowed(s);
}
/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
- int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.';
- if (!trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq))
- return 0;
+ trace_seq_printf(s, "%c%c%c",
+ irqs_off, need_resched, hardsoft_irq);
if (entry->preempt_count)
- ret = trace_seq_printf(s, "%x", entry->preempt_count);
+ trace_seq_printf(s, "%x", entry->preempt_count);
else
- ret = trace_seq_putc(s, '.');
+ trace_seq_putc(s, '.');
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
- comm, entry->pid, cpu))
- return 0;
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
}
-static unsigned long preempt_mark_thresh_us = 100;
+#undef MARK
+#define MARK(v, s) {.val = v, .sym = s}
+/* trace overhead mark */
+static const struct trace_mark {
+ unsigned long long val; /* unit: nsec */
+ char sym;
+} mark[] = {
+ MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(1000000ULL , '#'), /* 1000 usecs */
+ MARK(100000ULL , '!'), /* 100 usecs */
+ MARK(10000ULL , '+'), /* 10 usecs */
+};
+#undef MARK
+
+char trace_find_mark(unsigned long long d)
+{
+ int i;
+ int size = ARRAY_SIZE(mark);
+
+ for (i = 0; i < size; i++) {
+ if (d >= mark[i].val)
+ break;
+ }
+
+ return (i == size) ? ' ' : mark[i].sym;
+}
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
unsigned long rel_msec = (unsigned long)rel_ts;
- return trace_seq_printf(
- s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
- ns2usecs(iter->ts),
- abs_msec, abs_usec,
- rel_msec, rel_usec);
+ trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+
} else if (verbose && !in_ns) {
- return trace_seq_printf(
- s, "[%016llx] %lld (+%lld): ",
- iter->ts, abs_ts, rel_ts);
+ trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+
} else if (!verbose && in_ns) {
- return trace_seq_printf(
- s, " %4lldus%c: ",
- abs_ts,
- rel_ts > preempt_mark_thresh_us ? '!' :
- rel_ts > 1 ? '+' : ' ');
+ trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ trace_find_mark(rel_ts * NSEC_PER_USEC));
+
} else { /* !verbose && !in_ns */
- return trace_seq_printf(s, " %4lld: ", abs_ts);
+ trace_seq_printf(s, " %4lld: ", abs_ts);
}
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
- int ret;
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (!ret)
- return 0;
- if (trace_flags & TRACE_ITER_IRQ_INFO) {
- ret = trace_print_lat_fmt(s, entry);
- if (!ret)
- return 0;
- }
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
t = ns2usecs(iter->ts);
usec_rem = do_div(t, USEC_PER_SEC);
secs = (unsigned long)t;
- return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
} else
- return trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_seq_printf(s, " %12llu: ", iter->ts);
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
- int ret;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
- comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
} else {
- ret = lat_print_generic(s, entry, iter->cpu);
+ lat_print_generic(s, entry, iter->cpu);
}
- if (ret)
- ret = lat_print_timestamp(iter, next_ts);
+ lat_print_timestamp(iter, next_ts);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
goto out;
} else {
-
+
event->type = next_event_type++;
list = &ftrace_event_list;
}
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- if (!trace_seq_puts(s, " <-"))
- goto partial;
- if (!seq_print_ip_sym(s,
- field->parent_ip,
- flags))
- goto partial;
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, field->parent_ip, flags);
}
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
- field->ip,
- field->parent_ip))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_HEX_FIELD(s, field->ip);
+ SEQ_PUT_HEX_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_FIELD(s, field->ip);
+ SEQ_PUT_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
T = task_state_char(field->next_state);
S = task_state_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
- if (!trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, delim,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
if (!S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
+ SEQ_PUT_HEX_FIELD(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD(s, S);
+ SEQ_PUT_HEX_FIELD(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD(s, T);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
+ SEQ_PUT_FIELD(s, field->prev_pid);
+ SEQ_PUT_FIELD(s, field->prev_prio);
+ SEQ_PUT_FIELD(s, field->prev_state);
+ SEQ_PUT_FIELD(s, field->next_cpu);
+ SEQ_PUT_FIELD(s, field->next_pid);
+ SEQ_PUT_FIELD(s, field->next_prio);
+ SEQ_PUT_FIELD(s, field->next_state);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
- if (!trace_seq_puts(s, "<stack trace>\n"))
- goto partial;
+ trace_seq_puts(s, "<stack trace>\n");
for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
- if (!trace_seq_puts(s, " => "))
- goto partial;
- if (!seq_print_ip_sym(s, *p, flags))
- goto partial;
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- }
+ if (trace_seq_has_overflowed(s))
+ break;
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " => ");
+ seq_print_ip_sym(s, *p, flags);
+ trace_seq_putc(s, '\n');
+ }
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!trace_seq_puts(s, "<user stack trace>\n"))
- goto partial;
-
- if (!seq_print_userip_objs(field, s, flags))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, "<user stack trace>\n");
+ seq_print_userip_objs(field, s, flags);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_puts(s, field->str);
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_printf(s, ": %s", field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_printf(s, ": %s", field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(&iter->seq);
}
static struct trace_event_functions trace_print_funcs = {
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
+#define SEQ_PUT_FIELD(s, x) \
+ trace_seq_putmem(s, &(x), sizeof(x))
+
+#define SEQ_PUT_HEX_FIELD(s, x) \
+ trace_seq_putmem_hex(s, &(x), sizeof(x))
#endif
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
seq_puts(m, "\\t");
break;
case '\\':
- seq_puts(m, "\\");
+ seq_putc(m, '\\');
break;
case '"':
seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
- return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ return !trace_seq_has_overflowed(s); \
} \
const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
int len = *(u32 *)data >> 16;
if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
+ trace_seq_printf(s, " %s=(fault)", name);
else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
+ trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+ return !trace_seq_has_overflowed(s);
}
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
#include "trace.h"
-static struct trace_array *ctx_trace;
-static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
-static int sched_stopped;
-
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
static void
probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
-
- local_irq_restore(flags);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu, pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(current);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_wakeup_trace(ctx_trace, wakee, current,
- flags, pc);
-
- local_irq_restore(flags);
}
static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
{
tracing_stop_sched_switch();
}
-
-/**
- * tracing_start_sched_switch_record - start tracing context switches
- *
- * Turns on context switch tracing for a tracer.
- */
-void tracing_start_sched_switch_record(void)
-{
- if (unlikely(!ctx_trace)) {
- WARN_ON(1);
- return;
- }
-
- tracing_start_sched_switch();
-
- mutex_lock(&sched_register_mutex);
- tracer_enabled++;
- mutex_unlock(&sched_register_mutex);
-}
-
-/**
- * tracing_stop_sched_switch_record - start tracing context switches
- *
- * Turns off context switch tracing for a tracer.
- */
-void tracing_stop_sched_switch_record(void)
-{
- mutex_lock(&sched_register_mutex);
- tracer_enabled--;
- WARN_ON(tracer_enabled < 0);
- mutex_unlock(&sched_register_mutex);
-
- tracing_stop_sched_switch();
-}
-
-/**
- * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
- * @tr: trace array pointer to assign
- *
- * Some tracers might want to record the context switches in their
- * trace. This function lets those tracers assign the trace array
- * to use.
- */
-void tracing_sched_switch_assign_trace(struct trace_array *tr)
-{
- ctx_trace = tr;
-}
-
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
wakeup_current_cpu = cpu;
}
+static void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
static void notrace
probe_wakeup_sched_switch(void *ignore,
struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
#include <linux/trace_seq.h>
/* How much buffer is left on the trace_seq? */
-#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
/* How much buffer is written? */
-#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
+
+/*
+ * trace_seq should work with being initialized with 0s.
+ */
+static inline void __trace_seq_init(struct trace_seq *s)
+{
+ if (unlikely(!s->seq.size))
+ trace_seq_init(s);
+}
/**
* trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
*/
int trace_print_seq(struct seq_file *m, struct trace_seq *s)
{
- unsigned int len = TRACE_SEQ_BUF_USED(s);
int ret;
- ret = seq_write(m, s->buffer, len);
+ __trace_seq_init(s);
+
+ ret = seq_buf_print_seq(m, &s->seq);
/*
* Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
* trace_seq_printf() is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ unsigned int save_len = s->seq.len;
va_list ap;
- int ret;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ seq_buf_vprintf(&s->seq, fmt, ap);
va_end(ap);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
* @nmaskbits: The number of bits that are valid in @maskp
*
* Writes a ASCII representation of a bitmask string into @s.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
int nmaskbits)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
+ __trace_seq_init(s);
- return 1;
+ seq_buf_bitmask(&s->seq, maskp, nmaskbits);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
}
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_printf is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+ __trace_seq_init(s);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_vprintf);
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
*
* This function will take the format and the binary array and finish
* the conversion into the ASCII string within the buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+ seq_buf_bprintf(&s->seq, fmt, binary);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
+ return;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_bprintf);
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
* copy to user routines. This function records a simple string
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_puts(struct trace_seq *s, const char *str)
+void trace_seq_puts(struct trace_seq *s, const char *str)
{
unsigned int len = strlen(str);
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, str, len);
}
EXPORT_SYMBOL_GPL(trace_seq_puts);
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
* copy to user routines. This function records a simple charater
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
+void trace_seq_putc(struct trace_seq *s, unsigned char c)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
- return 0;
+ return;
}
- s->buffer[s->len++] = c;
-
- return 1;
+ seq_buf_putc(&s->seq, c);
}
EXPORT_SYMBOL_GPL(trace_seq_putc);
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
* There may be cases where raw memory needs to be written into the
* buffer and a strcpy() would not work. Using this function allows
* for such cases.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, mem, len);
}
EXPORT_SYMBOL_GPL(trace_seq_putmem);
-#define MAX_MEMHEX_BYTES 8U
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
/**
* trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
* @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
* This is similar to trace_seq_putmem() except instead of just copying the
* raw memory into the buffer it writes its ASCII representation of it
* in hex characters.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
unsigned int len)
{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- unsigned int start_len;
- int i, j;
- int cnt = 0;
+ unsigned int save_len = s->seq.len;
if (s->full)
- return 0;
+ return;
- while (len) {
- start_len = min(len, HEX_CHARS - 1);
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < start_len; i++) {
-#else
- for (i = start_len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- if (WARN_ON_ONCE(j == 0 || j/2 > len))
- break;
-
- /* j increments twice per loop */
- len -= j / 2;
- hex[j++] = ' ';
-
- cnt += trace_seq_putmem(s, hex, j);
+ __trace_seq_init(s);
+
+ /* Each byte is represented by two chars */
+ if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ /* The added spaces can still cause an overflow */
+ seq_buf_putmem_hex(&s->seq, mem, len);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
}
- return cnt;
}
EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
*/
int trace_seq_path(struct trace_seq *s, const struct path *path)
{
- unsigned char *p;
+ unsigned int save_len = s->seq.len;
if (s->full)
return 0;
+ __trace_seq_init(s);
+
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
return 0;
}
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
+ seq_buf_path(&s->seq, path, "\n");
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
}
- s->full = 1;
- return 0;
+ return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_path);
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
+ __trace_seq_init(s);
+ return seq_buf_to_user(&s->seq, ubuf, cnt);
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 29228c4d5696..c6ee36fcbf90 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, ret, syscall;
+ int i, syscall;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
- ret = trace_seq_printf(s, "%s(", entry->name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE) {
- ret = trace_seq_printf(s, "%s ", entry->types[i]);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", entry->types[i]);
+
/* parameter values */
- ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
}
- ret = trace_seq_putc(s, ')');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
+ trace_seq_putc(s, ')');
end:
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
- int ret;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
if (!entry) {
trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ goto out;
}
if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return TRACE_TYPE_UNHANDLED;
}
- ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ out:
+ return trace_handle_return(s);
}
extern char *__bad_type_size(void);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-static int __init init_ftrace_syscalls(void)
+void __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return -ENOMEM;
+ return;
}
for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
meta->syscall_nr = i;
syscalls_metadata[i] = meta;
}
-
- return 0;
}
-early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
return ret;
fail_address_parse:
- if (inode)
- iput(inode);
+ iput(inode);
pr_info("Failed to parse address or file.\n");
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[1], entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
struct probe_arg *parg = &tu->tp.args[i];
if (!parg->type->print(s, parg->name, data + parg->offset, entry))
- goto partial;
+ goto out;
}
- if (trace_seq_puts(s, "\n"))
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!ns_capable(current_user_ns(), CAP_SETGID))
+ if (!may_setgroups())
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
.count = ATOMIC_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .proc_inum = PROC_USER_INIT_INO,
+ .ns.inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_USER_NS
+ .ns.ops = &userns_operations,
+#endif
+ .flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
.persistent_keyring_register_sem =
__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
if (!ns)
return -ENOMEM;
- ret = proc_alloc_inum(&ns->proc_inum);
+ ret = ns_alloc_inum(&ns->ns);
if (ret) {
kmem_cache_free(user_ns_cachep, ns);
return ret;
}
+ ns->ns.ops = &userns_operations;
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
set_cred_user_ns(new, ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
ns = parent;
} while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-
-static DEFINE_MUTEX(id_map_mutex);
-
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
/*
- * The id_map_mutex serializes all writes to any given map.
+ * The userns_state_mutex serializes all writes to any given map.
*
* Any map is only ever written once.
*
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* order and smp_rmb() is guaranteed that we don't have crazy
* architectures returning stale data.
*/
- mutex_lock(&id_map_mutex);
+ mutex_lock(&userns_state_mutex);
ret = -EPERM;
/* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!page)
goto out;
- /* Only allow <= page size writes at the beginning of the file */
+ /* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*ppos = count;
ret = count;
out:
- mutex_unlock(&id_map_mutex);
+ mutex_unlock(&userns_state_mutex);
if (page)
free_page(page);
return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
- /* Allow mapping to your own filesystem ids */
- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+ const struct cred *cred = file->f_cred;
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, file->f_cred->fsuid))
+ if (uid_eq(uid, cred->euid))
return true;
} else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, file->f_cred->fsgid))
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
return true;
}
}
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
return false;
}
-static void *userns_get(struct task_struct *task)
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&userns_state_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
+ mutex_unlock(&userns_state_mutex);
+
+ return allowed;
+}
+
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
+static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
user_ns = get_user_ns(__task_cred(task)->user_ns);
rcu_read_unlock();
- return user_ns;
+ return user_ns ? &user_ns->ns : NULL;
}
-static void userns_put(void *ns)
+static void userns_put(struct ns_common *ns)
{
- put_user_ns(ns);
+ put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, void *ns)
+static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
- struct user_namespace *user_ns = ns;
+ struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
/* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
return commit_creds(cred);
}
-static unsigned int userns_inum(void *ns)
-{
- struct user_namespace *user_ns = ns;
- return user_ns->proc_inum;
-}
-
const struct proc_ns_operations userns_operations = {
.name = "user",
.type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
- .inum = userns_inum,
};
static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
if (!ns)
return ERR_PTR(-ENOMEM);
- err = proc_alloc_inum(&ns->proc_inum);
+ err = ns_alloc_inum(&ns->ns);
if (err) {
kfree(ns);
return ERR_PTR(err);
}
+ ns->ns.ops = &utsns_operations;
+
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
ns = container_of(kref, struct uts_namespace, kref);
put_user_ns(ns->user_ns);
- proc_free_inum(ns->proc_inum);
+ ns_free_inum(&ns->ns);
kfree(ns);
}
-static void *utsns_get(struct task_struct *task)
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static struct ns_common *utsns_get(struct task_struct *task)
{
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
}
task_unlock(task);
- return ns;
+ return ns ? &ns->ns : NULL;
}
-static void utsns_put(void *ns)
+static void utsns_put(struct ns_common *ns)
{
- put_uts_ns(ns);
+ put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, void *new)
+static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
{
- struct uts_namespace *ns = new;
+ struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
return 0;
}
-static unsigned int utsns_inum(void *vp)
-{
- struct uts_namespace *ns = vp;
-
- return ns->proc_inum;
-}
-
const struct proc_ns_operations utsns_operations = {
.name = "uts",
.type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
- .inum = utsns_inum,
};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685daee3d..6202b08f1933 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
+ spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock(&pool->lock);
- spin_unlock_irq(&wq_mayday_lock);
+ spin_unlock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2248,12 +2248,30 @@ repeat:
* Slurp in all works issued via this workqueue and
* process'em.
*/
- WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
+ WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
- process_scheduled_works(rescuer);
+ if (!list_empty(scheduled)) {
+ process_scheduled_works(rescuer);
+
+ /*
+ * The above execution of rescued work items could
+ * have created more to rescue through
+ * pwq_activate_first_delayed() or chained
+ * queueing. Let's put @pwq back on mayday list so
+ * that such back-to-back work items, which may be
+ * being used to relieve memory pressure, don't
+ * incur MAYDAY_INTERVAL delay inbetween.
+ */
+ if (need_to_create_worker(pool)) {
+ spin_lock(&wq_mayday_lock);
+ get_pwq(pwq);
+ list_move_tail(&pwq->mayday_node, &wq->maydays);
+ spin_unlock(&wq_mayday_lock);
+ }
+ }
/*
* Put the reference grabbed by send_mayday(). @pool won't