Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c                      |    1
-rw-r--r--  kernel/audit.c                      |   19
-rw-r--r--  kernel/audit.h                      |    2
-rw-r--r--  kernel/audit_fsnotify.c             |    2
-rw-r--r--  kernel/audit_watch.c                |    2
-rw-r--r--  kernel/auditsc.c                    |    6
-rw-r--r--  kernel/bpf/core.c                   |    4
-rw-r--r--  kernel/bpf/hashtab.c                |   64
-rw-r--r--  kernel/bpf/inode.c                  |   20
-rw-r--r--  kernel/bpf/syscall.c                |   22
-rw-r--r--  kernel/bpf/verifier.c               |   10
-rw-r--r--  kernel/cgroup.c                     |  207
-rw-r--r--  kernel/cgroup_freezer.c             |    2
-rw-r--r--  kernel/cgroup_pids.c                |    6
-rw-r--r--  kernel/cpu.c                        |   64
-rw-r--r--  kernel/cpuset.c                     |   12
-rw-r--r--  kernel/cred.c                       |    4
-rw-r--r--  kernel/debug/kdb/kdb_main.c         |    4
-rw-r--r--  kernel/delayacct.c                  |    2
-rw-r--r--  kernel/events/core.c                |    6
-rw-r--r--  kernel/events/uprobes.c             |   13
-rw-r--r--  kernel/exit.c                       |    5
-rw-r--r--  kernel/fork.c                       |   34
-rw-r--r--  kernel/futex.c                      |   67
-rw-r--r--  kernel/futex_compat.c               |    2
-rw-r--r--  kernel/gcov/base.c                  |    7
-rw-r--r--  kernel/irq/irqdomain.c              |    1
-rw-r--r--  kernel/irq/msi.c                    |    8
-rw-r--r--  kernel/kcmp.c                       |    4
-rw-r--r--  kernel/kexec.c                      |   10
-rw-r--r--  kernel/kexec_core.c                 |    7
-rw-r--r--  kernel/kexec_file.c                 |    2
-rw-r--r--  kernel/kexec_internal.h             |   21
-rw-r--r--  kernel/livepatch/core.c             |  176
-rw-r--r--  kernel/memremap.c                   |  219
-rw-r--r--  kernel/module.c                     |  349
-rw-r--r--  kernel/panic.c                      |    3
-rw-r--r--  kernel/pid.c                        |    2
-rw-r--r--  kernel/power/Kconfig                |    2
-rw-r--r--  kernel/power/main.c                 |   17
-rw-r--r--  kernel/power/power.h                |    9
-rw-r--r--  kernel/printk/printk.c              |   77
-rw-r--r--  kernel/ptrace.c                     |   49
-rw-r--r--  kernel/relay.c                      |    4
-rw-r--r--  kernel/resource.c                   |   11
-rw-r--r--  kernel/sched/clock.c                |    2
-rw-r--r--  kernel/sched/core.c                 |    6
-rw-r--r--  kernel/sched/cputime.c              |    3
-rw-r--r--  kernel/sched/idle.c                 |    9
-rw-r--r--  kernel/seccomp.c                    |   22
-rw-r--r--  kernel/stop_machine.c               |    4
-rw-r--r--  kernel/sys.c                        |   20
-rw-r--r--  kernel/sys_ni.c                     |    1
-rw-r--r--  kernel/sysctl.c                     |  117
-rw-r--r--  kernel/time/tick-sched.c            |    8
-rw-r--r--  kernel/trace/blktrace.c             |   12
-rw-r--r--  kernel/trace/bpf_trace.c            |    2
-rw-r--r--  kernel/trace/ftrace.c               |  451
-rw-r--r--  kernel/trace/ring_buffer.c          |   57
-rw-r--r--  kernel/trace/trace.c                |    2
-rw-r--r--  kernel/trace/trace.h                |    6
-rw-r--r--  kernel/trace/trace_event_perf.c     |    2
-rw-r--r--  kernel/trace/trace_events.c         |   28
-rw-r--r--  kernel/trace/trace_events_trigger.c |   25
-rw-r--r--  kernel/user_namespace.c             |   21
-rw-r--r--  kernel/watchdog.c                   |   18
-rw-r--r--  kernel/workqueue.c                  |  220
67 files changed, 1572 insertions, 1022 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0bf63..d2edd6efec56 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -326,3 +326,4 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
+EXPORT_SYMBOL_GPL(current_is_async);
diff --git a/kernel/audit.c b/kernel/audit.c
index 5ffcbd354a52..3a3e5deeda8d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
-static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
kuid_t audit_sig_uid = INVALID_UID;
@@ -509,8 +508,7 @@ static void flush_hold_queue(void)
* if auditd just disappeared but we
* dequeued an skb we need to drop ref
*/
- if (skb)
- consume_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
@@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy)
skb = skb_dequeue(&audit_skb_queue);
if (skb) {
- if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ if (!audit_backlog_limit ||
+ (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
@@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
if (!ab)
return;
- if (ab->skb)
- kfree_skb(ab->skb);
-
+ kfree_skb(ab->skb);
spin_lock_irqsave(&audit_freelist_lock, flags);
if (audit_freelist_count > AUDIT_MAXFREE)
kfree(ab);
@@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
- if (audit_pid && audit_pid == current->pid)
+ if (audit_pid && audit_pid == current->tgid)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
@@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
- audit_backlog_wait_time = audit_backlog_wait_overflow;
+ audit_backlog_wait_time = 0;
wake_up(&audit_backlog_wait);
return NULL;
}
- if (!reserve)
+ if (!reserve && !audit_backlog_wait_time)
audit_backlog_wait_time = audit_backlog_wait_time_master;
ab = audit_buffer_alloc(ctx, gfp_mask, type);
@@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
/* Copy inode data into an audit_names. */
void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
+ struct inode *inode)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
diff --git a/kernel/audit.h b/kernel/audit.h
index de6cbb7cf547..cbbe6bb6496e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -207,7 +207,7 @@ extern u32 audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
- const struct inode *inode);
+ struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 27c6046c2c3d..f84f8d06e1f6 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
if (IS_ERR(dentry))
return (void *)dentry; /* returning an error */
inode = path.dentry->d_inode;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 656c7e93ac0d..9f194aad0adc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+ inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d_backing_inode(d)->i_sb->s_dev;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..195ffaee50b9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct inode *parent,
+void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..972d9a8e4ac4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3746fa..c5b30fd8a315 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,11 +14,15 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bucket *buckets;
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
@@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
+ if ((u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries >=
U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);
if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
- raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ atomic_set(&htab->count, 0);
return &htab->map;
@@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct hlist_head *head;
+ struct bucket *b;
unsigned long flags;
u32 key_size;
int ret;
@@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
l_new->hash = htab_map_hash(l_new->key, key_size);
+ b = __select_bucket(htab, l_new->hash);
+ head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, l_new->hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
@@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_del_rcu(&l_old->hash_node);
kfree_rcu(l_old, rcu);
} else {
- htab->count++;
+ atomic_inc(&htab->count);
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return 0;
err:
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
kfree(l_new);
return ret;
}
@@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree_rcu(l, rcu);
ret = 0;
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
@@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree(l);
}
}
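
[Editor's note] The hashtab hunks above replace the single table-wide raw_spinlock_t with one lock per bucket, so updates and deletes that hash to different buckets no longer serialize on a shared lock; the element count becomes an atomic_t because no single lock protects it anymore. A minimal userspace sketch of the same pattern, assuming pthread mutexes in place of raw spinlocks and invented names (not kernel code):

#include <pthread.h>

struct node {
	struct node *next;
	unsigned long key;
};

struct bucket {
	struct node *head;
	pthread_mutex_t lock;		/* per-bucket lock, like raw_spinlock_t above */
};

struct table {
	struct bucket *buckets;		/* each lock initialized with pthread_mutex_init() */
	unsigned int n_buckets;		/* power of two, as in htab_map_alloc() */
};

static struct bucket *select_bucket(struct table *t, unsigned long hash)
{
	return &t->buckets[hash & (t->n_buckets - 1)];
}

static void table_insert(struct table *t, unsigned long hash, struct node *n)
{
	struct bucket *b = select_bucket(t, hash);

	pthread_mutex_lock(&b->lock);	/* only this bucket is serialized */
	n->next = b->head;
	b->head = n;
	pthread_mutex_unlock(&b->lock);
}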
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797d50b7..f2ece3c174a5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
+static int bpf_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_link(old_dentry, dir, new_dentry);
+}
+
+static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = simple_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
+ .rename = bpf_rename,
+ .link = bpf_link,
.unlink = simple_unlink,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b39550d8485..637397059f76 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_map *map = filp->private_data;
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries);
+}
+#endif
+
static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
};
int bpf_map_new_fd(struct bpf_map *map)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7945d10b378..d1d3e8f57de9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose("invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
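
[Editor's note] The verifier hunk above rejects constant shifts whose immediate is negative or not smaller than the register width, since such shifts are undefined on some architectures. A hedged illustration using the in-kernel BPF_ALU64_IMM() macro from <linux/filter.h>; the surrounding program is omitted and the function name is invented:

static void shift_examples(void)
{
	/* imm == 64 is >= the 64-bit register width, so check_alu_op() now
	 * fails the program with "invalid shift 64" instead of accepting it. */
	struct bpf_insn rejected = BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 64);

	/* Immediates 0..63 (or 0..31 for 32-bit BPF_ALU) remain valid. */
	struct bpf_insn accepted = BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 63);
}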
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 470f6536b9e8..c03a640ef6da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,8 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-
#include <linux/atomic.h>
+#include <net/sock.h>
/*
* pidlists linger the following amount before being destroyed. The goal
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -440,11 +441,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)
return css_tryget(&cgrp->self);
}
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -465,25 +461,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor. It also returns %true
- * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
- while (cgrp) {
- if (cgrp == ancestor)
- return true;
- cgrp = cgroup_parent(cgrp);
- }
- return false;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -1647,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1717,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -1924,6 +1888,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
if (ret < 0)
goto out;
root_cgrp->id = ret;
+ root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
GFP_KERNEL);
@@ -2004,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
@@ -2020,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
@@ -2027,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2146,9 +2114,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2191,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -4062,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
@@ -4903,11 +4878,11 @@ err_free_css:
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
umode_t mode)
{
- struct cgroup *parent, *cgrp;
+ struct cgroup *parent, *cgrp, *tcgrp;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- int ssid, ret;
+ int level, ssid, ret;
/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
*/
@@ -4918,9 +4893,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (!parent)
return -ENODEV;
root = parent->root;
+ level = parent->level + 1;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+ cgrp = kzalloc(sizeof(*cgrp) +
+ sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
if (!cgrp) {
ret = -ENOMEM;
goto out_unlock;
@@ -4944,6 +4921,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
cgrp->self.parent = &parent->self;
cgrp->root = root;
+ cgrp->level = level;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5188,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
@@ -5346,6 +5327,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5489,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
-static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
-}
-
-static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
-}
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5524,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
}
@@ -5543,7 +5511,7 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
@@ -5556,15 +5524,14 @@ out_revert:
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded.
*/
-void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
@@ -5577,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5622,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ ss->fork(child);
}
/**
@@ -5822,6 +5788,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it. Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
+ * if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+
+ kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ if (kn) {
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ cgrp = kn->priv;
+ cgroup_get(cgrp);
+ } else {
+ cgrp = ERR_PTR(-ENOTDIR);
+ }
+ kernfs_put(kn);
+ } else {
+ cgrp = ERR_PTR(-ENOENT);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+ pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+
+ rcu_read_lock();
+
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ skcd->val = (unsigned long)cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+
+ rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
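
[Editor's note] A hedged sketch of how a caller might use the cgroup_get_from_path() helper added above; the path and function name are invented, and it assumes cgroup_put() is visible to the caller for dropping the reference the helper takes:

static int example_lookup(void)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_path("/my-service");	/* path on the default hierarchy */
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);	/* -ENOENT or -ENOTDIR, as documented above */

	/* ... inspect or compare cgrp, e.g. against a socket's cgroup ... */

	cgroup_put(cgrp);		/* drop the reference taken by the helper */
	return 0;
}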
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82c54f2..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task, void *private)
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167fda..303097b37429 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
- * succeded, otherwise -EAGAIN.
+ * succeeded, otherwise -EAGAIN.
*/
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task, void **priv_p)
+static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
return pids_try_charge(pids, 1);
}
-static void pids_cancel_fork(struct task_struct *task, void *priv)
+static void pids_cancel_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85ff5e26e23b..5b9d39633ce9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
- = CPU_BITS_ALL;
+struct cpumask __cpu_possible_mask __read_mostly
+ = {CPU_BITS_ALL};
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+struct cpumask __cpu_possible_mask __read_mostly;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_mask);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+struct cpumask __cpu_online_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_online_mask);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+struct cpumask __cpu_present_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_present_mask);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
-
-void set_cpu_possible(unsigned int cpu, bool possible)
-{
- if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
-}
-
-void set_cpu_present(unsigned int cpu, bool present)
-{
- if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
-}
-
-void set_cpu_online(unsigned int cpu, bool online)
-{
- if (online) {
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- } else {
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
- }
-}
-
-void set_cpu_active(unsigned int cpu, bool active)
-{
- if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
-}
+struct cpumask __cpu_active_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_active_mask);
void init_cpu_present(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
+ cpumask_copy(&__cpu_present_mask, src);
}
void init_cpu_possible(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+ cpumask_copy(&__cpu_possible_mask, src);
}
void init_cpu_online(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ cpumask_copy(&__cpu_online_mask, src);
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02a8ea5c9963..3e945fcd8179 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
+#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
+ time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
@@ -1374,7 +1375,7 @@ out:
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
if (ticks == 0)
return;
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..0c0cd8a62285 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4121345498e0..2a20c0dfdafc 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv)
continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
- mod->core_size, (void *)mod);
+ mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
#endif
@@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->module_core);
+ kdb_printf(" 0x%p", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..435c14a45118 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bf8244190d0f..06ae52e99ac2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3376,7 +3376,7 @@ find_lively_task_by_vpid(pid_t vpid)
/* Reuse ptrace permission checks for now. */
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto errout;
return task;
@@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval < 0)
return retval;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..0167679182c0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c6020a0..10e088237fed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,8 +59,6 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct *tsk);
-
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p) &&
- !(p->jobctl & JOBCTL_LISTENING))
+ if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/fork.c b/kernel/fork.c
index 291b08cc817b..2e391c754ae7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,9 +300,9 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", arch_task_struct_size,
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ task_struct_cachep = kmem_cache_create("task_struct",
+ arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
mm->total_vm = oldmm->total_vm;
- mm->shared_vm = oldmm->shared_vm;
+ mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
charge = 0;
@@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1527,7 +1525,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
@@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1849,16 +1847,19 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
* whole struct cpumask for the OFFSTACK case. We could change
@@ -1868,8 +1869,9 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 8a310e240cda..0773f2b23b10 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -519,46 +520,9 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ mapping = compound_head(page)->mapping;
+ if (!mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
@@ -588,7 +553,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -603,15 +568,15 @@ again:
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -2919,7 +2884,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
}
ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
head = p->robust_list;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 55c8c9349cfe..4ae3232e7a28 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
}
ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
head = p->compat_robust_list;
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 7080ae1eb6c1..2f9df37940a0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -123,11 +123,6 @@ void gcov_enable_events(void)
}
#ifdef CONFIG_MODULES
-static inline int within(void *addr, void *start, unsigned long size)
-{
- return ((addr >= start) && (addr < start + size));
-}
-
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within(info, mod->module_core, mod->core_size)) {
+ if (within_module((unsigned long)info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d75179735a28..3e56d2f03e24 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1066,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
__irq_set_handler(virq, handler, 0, handler_name);
irq_set_handler_data(virq, handler_data);
}
+EXPORT_SYMBOL(irq_domain_set_info);
/**
* irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 15b249e7c673..38e89ce7b071 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
if (irq_find_mapping(domain, hwirq) > 0)
return -EEXIST;
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
- if (ret < 0)
- return ret;
+ if (domain->parent) {
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+ }
for (i = 0; i < nr_irqs; i++) {
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 0aa69ea1d8fd..3a47fa998fe0 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
&task2->signal->cred_guard_mutex);
if (ret)
goto err;
- if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
- !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) {
ret = -EPERM;
goto err_unlock;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d873b64fbddc..ee70aef5cd81 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (ret)
goto out_free_image;
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_image;
-
- /* Enable the special crash kernel control page allocation policy. */
if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c823f3001e12..8dc659144869 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page)
void kimage_free_page_list(struct list_head *list)
{
- struct list_head *pos, *next;
+ struct page *page, *next;
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
+ list_for_each_entry_safe(page, next, list, lru) {
list_del(&page->lru);
kimage_free_pages(page);
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada0028d2..007b791f676d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
return -EINVAL;
}
+#ifdef CONFIG_KEXEC_VERIFY_SIG
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
return -EKEYREJECTED;
}
+#endif
/* Apply relocations of type RELA */
int __weak
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index e4392a698ad4..0a52315d9c62 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image,
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
+struct kexec_sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+/*
+ * Keeps track of buffer parameters as provided by caller for requesting
+ * memory placement of buffer.
+ */
+struct kexec_buf {
+ struct kimage *image;
+ char *buffer;
+ unsigned long bufsz;
+ unsigned long mem;
+ unsigned long memsz;
+ unsigned long buf_align;
+ unsigned long buf_min;
+ unsigned long buf_max;
+ bool top_down; /* allocate from top of memory hole */
+};
+
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index db545cbcdb89..bc2c85c064c1 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <asm/cacheflush.h>
/**
* struct klp_ops - structure for tracking registered ftrace ops structs
@@ -135,13 +136,8 @@ struct klp_find_arg {
const char *objname;
const char *name;
unsigned long addr;
- /*
- * If count == 0, the symbol was not found. If count == 1, a unique
- * match was found and addr is set. If count > 1, there is
- * unresolvable ambiguity among "count" number of symbols with the same
- * name in the same object.
- */
unsigned long count;
+ unsigned long pos;
};
static int klp_find_callback(void *data, const char *name,
@@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name,
if (args->objname && strcmp(args->objname, mod->name))
return 0;
- /*
- * args->addr might be overwritten if another match is found
- * but klp_find_object_symbol() handles this and only returns the
- * addr if count == 1.
- */
args->addr = addr;
args->count++;
+ /*
+ * Finish the search when the symbol is found for the desired position
+ * or the position is not defined for a non-unique symbol.
+ */
+ if ((args->pos && (args->count == args->pos)) ||
+ (!args->pos && (args->count > 1)))
+ return 1;
+
return 0;
}
static int klp_find_object_symbol(const char *objname, const char *name,
- unsigned long *addr)
+ unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
.objname = objname,
.name = name,
.addr = 0,
- .count = 0
+ .count = 0,
+ .pos = sympos,
};
mutex_lock(&module_mutex);
kallsyms_on_each_symbol(klp_find_callback, &args);
mutex_unlock(&module_mutex);
- if (args.count == 0)
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+ * otherwise ensure the symbol position count matches sympos.
+ */
+ if (args.addr == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
- else if (args.count > 1)
+ else if (args.count > 1 && sympos == 0) {
pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
args.count, name, objname);
- else {
+ } else if (sympos != args.count && sympos > 0) {
+ pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n",
+ sympos, name, objname ? objname : "vmlinux");
+ } else {
*addr = args.addr;
return 0;
}
@@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-struct klp_verify_args {
- const char *name;
- const unsigned long addr;
-};
-
-static int klp_verify_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
-{
- struct klp_verify_args *args = data;
-
- if (!mod &&
- !strcmp(args->name, name) &&
- args->addr == addr)
- return 1;
-
- return 0;
-}
-
-static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
-{
- struct klp_verify_args args = {
- .name = name,
- .addr = addr,
- };
- int ret;
-
- mutex_lock(&module_mutex);
- ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
- mutex_unlock(&module_mutex);
-
- if (!ret) {
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int klp_find_verify_func_addr(struct klp_object *obj,
- struct klp_func *func)
-{
- int ret;
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old_addr accordingly */
- if (kaslr_enabled() && func->old_addr)
- func->old_addr += kaslr_offset();
-#endif
-
- if (!func->old_addr || klp_is_module(obj))
- ret = klp_find_object_symbol(obj->name, func->old_name,
- &func->old_addr);
- else
- ret = klp_verify_vmlinux_symbol(func->old_name,
- func->old_addr);
-
- return ret;
-}
-
/*
* external symbols are located outside the parent object (where the parent
* object is either vmlinux or the kmod being patched).
@@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name,
}
preempt_enable();
- /* otherwise check if it's in another .o within the patch module */
- return klp_find_object_symbol(pmod->name, name, addr);
+ /*
+ * Check if it's in another .o within the patch module. This also
+ * checks that the external symbol is unique.
+ */
+ return klp_find_object_symbol(pmod->name, name, 0, addr);
}
static int klp_write_object_relocations(struct module *pmod,
struct klp_object *obj)
{
- int ret;
+ int ret = 0;
+ unsigned long val;
struct klp_reloc *reloc;
if (WARN_ON(!klp_is_object_loaded(obj)))
@@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod,
if (WARN_ON(!obj->relocs))
return -EINVAL;
+ module_disable_ro(pmod);
+
for (reloc = obj->relocs; reloc->name; reloc++) {
- if (!klp_is_module(obj)) {
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old value accordingly */
- if (kaslr_enabled())
- reloc->val += kaslr_offset();
-#endif
- ret = klp_verify_vmlinux_symbol(reloc->name,
- reloc->val);
- if (ret)
- return ret;
- } else {
- /* module, reloc->val needs to be discovered */
- if (reloc->external)
- ret = klp_find_external_symbol(pmod,
- reloc->name,
- &reloc->val);
- else
- ret = klp_find_object_symbol(obj->mod->name,
- reloc->name,
- &reloc->val);
- if (ret)
- return ret;
- }
+ /* discover the address of the referenced symbol */
+ if (reloc->external) {
+ if (reloc->sympos > 0) {
+ pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
+ reloc->name);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = klp_find_external_symbol(pmod, reloc->name, &val);
+ } else
+ ret = klp_find_object_symbol(obj->name,
+ reloc->name,
+ reloc->sympos,
+ &val);
+ if (ret)
+ goto out;
+
ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- reloc->val + reloc->addend);
+ val + reloc->addend);
if (ret) {
pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, reloc->val, ret);
- return ret;
+ reloc->name, val, ret);
+ goto out;
}
}
- return 0;
+out:
+ module_enable_ro(pmod);
+ return ret;
}
static void notrace klp_ftrace_handler(unsigned long ip,
@@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/<object>
- * /sys/kernel/livepatch/<patch>/<object>/<func>
+ * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
INIT_LIST_HEAD(&func->stack_node);
func->state = KLP_DISABLED;
+ /* The format for the sysfs directory is <function,sympos> where sympos
+ * is the nth occurrence of this symbol in kallsyms for the patched
+ * object. If the user selects 0 for old_sympos, then 1 will be used
+ * since a unique symbol will be the first occurrence.
+ */
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
}
klp_for_each_func(obj, func) {
- ret = klp_find_verify_func_addr(obj, func);
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ func->old_sympos,
+ &func->old_addr);
if (ret)
return ret;
}
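
A minimal sketch of how a patch module uses the new old_sympos field (function names taken from the livepatch sample module; the sympos value here is hypothetical, not part of this patch):

	static struct klp_func funcs[] = {
		{
			.old_name = "cmdline_proc_show",
			.new_func = livepatch_cmdline_proc_show,
			/* patch the 2nd occurrence in kallsyms; 0 requires the symbol to be unique */
			.old_sympos = 2,
		}, { }
	};

With the sysfs naming change below, this function would show up as /sys/kernel/livepatch/<patch>/<object>/cmdline_proc_show,2.
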
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..e517a16cb426 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <linux/radix-tree.h>
+#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
+#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
+pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+{
+ return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
+EXPORT_SYMBOL(phys_to_pfn_t);
+
#ifdef CONFIG_ZONE_DEVICE
+static DEFINE_MUTEX(pgmap_lock);
+static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
+#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
struct page_map {
struct resource res;
+ struct percpu_ref *ref;
+ struct dev_pagemap pgmap;
+ struct vmem_altmap altmap;
};
-static void devm_memremap_pages_release(struct device *dev, void *res)
+void get_zone_device_page(struct page *page)
+{
+ percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
+{
+ put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
+static void pgmap_radix_release(struct resource *res)
+{
+ resource_size_t key;
+
+ mutex_lock(&pgmap_lock);
+ for (key = res->start; key <= res->end; key += SECTION_SIZE)
+ radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&pgmap_lock);
+}
+
+static unsigned long pfn_first(struct page_map *page_map)
+{
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+ const struct resource *res = &page_map->res;
+ struct vmem_altmap *altmap = pgmap->altmap;
+ unsigned long pfn;
+
+ pfn = res->start >> PAGE_SHIFT;
+ if (altmap)
+ pfn += vmem_altmap_offset(altmap);
+ return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
+{
+ const struct resource *res = &page_map->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
+static void devm_memremap_pages_release(struct device *dev, void *data)
{
- struct page_map *page_map = res;
+ struct page_map *page_map = data;
+ struct resource *res = &page_map->res;
+ resource_size_t align_start, align_size;
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+
+ if (percpu_ref_tryget_live(pgmap->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(pgmap->ref);
+ }
+
+ pgmap_radix_release(res);
/* pages are dead and unused, undo the arch mapping */
- arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ arch_remove_memory(align_start, align_size);
+ dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+ "%s: failed to free all reserved pages\n", __func__);
+}
+
+/* assumes rcu_read_lock() held at entry */
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
+{
+ struct page_map *page_map;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ return page_map ? &page_map->pgmap : NULL;
}
-void *devm_memremap_pages(struct device *dev, struct resource *res)
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
+ * @altmap: optional descriptor for allocating the memmap from @res
+ *
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ * (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap)
{
int is_ram = region_intersects(res->start, resource_size(res),
"System RAM");
+ resource_size_t key, align_start, align_size;
+ struct dev_pagemap *pgmap;
struct page_map *page_map;
+ unsigned long pfn;
int error, nid;
if (is_ram == REGION_MIXED) {
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
+ if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
+ dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
+ __func__);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!ref)
+ return ERR_PTR(-EINVAL);
+
page_map = devres_alloc_node(devm_memremap_pages_release,
sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
if (!page_map)
return ERR_PTR(-ENOMEM);
+ pgmap = &page_map->pgmap;
memcpy(&page_map->res, res, sizeof(*res));
+ pgmap->dev = dev;
+ if (altmap) {
+ memcpy(&page_map->altmap, altmap, sizeof(*altmap));
+ pgmap->altmap = &page_map->altmap;
+ }
+ pgmap->ref = ref;
+ pgmap->res = &page_map->res;
+
+ mutex_lock(&pgmap_lock);
+ error = 0;
+ for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+ struct dev_pagemap *dup;
+
+ rcu_read_lock();
+ dup = find_dev_pagemap(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(dev, "%s: %pr collides with mapping for %s\n",
+ __func__, res, dev_name(dup->dev));
+ error = -EBUSY;
+ break;
+ }
+ error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
+ page_map);
+ if (error) {
+ dev_err(dev, "%s: failed: %d\n", __func__, error);
+ break;
+ }
+ }
+ mutex_unlock(&pgmap_lock);
+ if (error)
+ goto err_radix;
+
nid = dev_to_node(dev);
if (nid < 0)
nid = numa_mem_id();
- error = arch_add_memory(nid, res->start, resource_size(res), true);
- if (error) {
- devres_free(page_map);
- return ERR_PTR(error);
- }
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ error = arch_add_memory(nid, align_start, align_size, true);
+ if (error)
+ goto err_add_memory;
+ for_each_device_pfn(pfn, page_map) {
+ struct page *page = pfn_to_page(pfn);
+
+ /* ZONE_DEVICE pages must never appear on a slab lru */
+ list_force_poison(&page->lru);
+ page->pgmap = pgmap;
+ }
devres_add(dev, page_map);
return __va(res->start);
+
+ err_add_memory:
+ err_radix:
+ pgmap_radix_release(res);
+ devres_free(page_map);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ return altmap->reserve + altmap->free;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+ /*
+ * 'memmap_start' is the virtual address for the first "struct
+ * page" in this range of the vmemmap array. In the case of
+	 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
+ * pointer arithmetic, so we can perform this to_vmem_altmap()
+ * conversion without concern for the initialization state of
+ * the struct page fields.
+ */
+ struct page *page = (struct page *) memmap_start;
+ struct dev_pagemap *pgmap;
+
+ /*
+	 * Unconditionally retrieve a dev_pagemap associated with the
+	 * given physical address; this is only for use by
+	 * arch_{add|remove}_memory() when setting up and tearing down
+	 * the memmap.
+ */
+ rcu_read_lock();
+ pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ rcu_read_unlock();
+
+ return pgmap ? pgmap->altmap : NULL;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
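
A minimal caller sketch for the new devm_memremap_pages() signature (the device, resource, and percpu_ref names are illustrative, not from this patch); a NULL altmap keeps the memmap in regular memory:

	void *addr;

	/* drv->ref must already be live; the driver kills it at teardown */
	addr = devm_memremap_pages(dev, &res, &drv->ref, NULL);
	if (IS_ERR(addr))
		return PTR_ERR(addr);
	/*
	 * addr is the kernel virtual address of res->start; the backing
	 * struct pages carry page->pgmap, so get/put_zone_device_page()
	 * take and drop references on drv->ref.
	 */
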
diff --git a/kernel/module.c b/kernel/module.c
index 38c7bd5583ff..8358f4697c0c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -80,15 +80,6 @@
# define debug_align(X) (X)
#endif
-/*
- * Given BASE and SIZE this macro calculates the number of pages the
- * memory regions occupies
- */
-#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
- (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
- PFN_DOWN((unsigned long)BASE) + 1) \
- : (0UL))
-
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
@@ -108,13 +99,6 @@ static LIST_HEAD(modules);
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU-sched lookups of the address from any context.
*
- * Because modules have two address ranges: init and core, we need two
- * latch_tree_nodes entries. Therefore we need the back-pointer from
- * mod_tree_node.
- *
- * Because init ranges are short lived we mark them unlikely and have placed
- * them outside the critical cacheline in struct module.
- *
* This is conditional on PERF_EVENTS || TRACING because those can really hit
* __module_address() hard by doing a lot of stack unwinding; potentially from
* NMI context.
@@ -122,24 +106,16 @@ static LIST_HEAD(modules);
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->module_init;
-
- return (unsigned long)mod->module_core;
+ return (unsigned long)layout->base;
}
static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
-
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->init_size;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- return (unsigned long)mod->core_size;
+ return (unsigned long)layout->size;
}
static __always_inline bool
@@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node)
*/
static void mod_tree_insert(struct module *mod)
{
- mod->mtn_core.mod = mod;
- mod->mtn_init.mod = mod;
+ mod->core_layout.mtn.mod = mod;
+ mod->init_layout.mtn.mod = mod;
- __mod_tree_insert(&mod->mtn_core);
- if (mod->init_size)
- __mod_tree_insert(&mod->mtn_init);
+ __mod_tree_insert(&mod->core_layout.mtn);
+ if (mod->init_layout.size)
+ __mod_tree_insert(&mod->init_layout.mtn);
}
static void mod_tree_remove_init(struct module *mod)
{
- if (mod->init_size)
- __mod_tree_remove(&mod->mtn_init);
+ if (mod->init_layout.size)
+ __mod_tree_remove(&mod->init_layout.mtn);
}
static void mod_tree_remove(struct module *mod)
{
- __mod_tree_remove(&mod->mtn_core);
+ __mod_tree_remove(&mod->core_layout.mtn);
mod_tree_remove_init(mod);
}
@@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size)
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->module_core, mod->core_size);
- if (mod->init_size)
- __mod_update_bounds(mod->module_init, mod->init_size);
+ __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ if (mod->init_layout.size)
+ __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
}
#ifdef CONFIG_KGDB_KDB
@@ -1214,7 +1190,7 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_size);
+ return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
}
static struct module_attribute modinfo_coresize =
@@ -1223,7 +1199,7 @@ static struct module_attribute modinfo_coresize =
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_size);
+ return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
}
static struct module_attribute modinfo_initsize =
@@ -1873,64 +1849,75 @@ static void mod_sysfs_teardown(struct module *mod)
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
+ *
+ * General layout of module is:
+ *          [text] [read-only-data] [writable data]
+ * text_size -----^                ^               ^
+ * ro_size ------------------------|               |
+ * size -------------------------------------------|
+ *
+ * These values are always page-aligned (as is base)
*/
-void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+static void frob_text(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
- unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base,
+ layout->text_size >> PAGE_SHIFT);
+}
- if (end_pfn > begin_pfn)
- set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+static void frob_rodata(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->text_size,
+ (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
-static void set_section_ro_nx(void *base,
- unsigned long text_size,
- unsigned long ro_size,
- unsigned long total_size)
+static void frob_writable_data(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- /* begin and end PFNs of the current subsection */
- unsigned long begin_pfn;
- unsigned long end_pfn;
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->size - layout->ro_size) >> PAGE_SHIFT);
+}
- /*
- * Set RO for module text and RO-data:
- * - Always protect first page.
- * - Do not protect last partial page.
- */
- if (ro_size > 0)
- set_page_attributes(base, base + ro_size, set_memory_ro);
+/* livepatching wants to disable read-only so it can frob module. */
+void module_disable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
+ frob_rodata(&mod->init_layout, set_memory_rw);
+}
- /*
- * Set NX permissions for module data:
- * - Do not protect first partial page.
- * - Always protect last page.
- */
- if (total_size > text_size) {
- begin_pfn = PFN_UP((unsigned long)base + text_size);
- end_pfn = PFN_UP((unsigned long)base + total_size);
- if (end_pfn > begin_pfn)
- set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
- }
+void module_enable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_rodata(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
+ frob_rodata(&mod->init_layout, set_memory_ro);
}
-static void unset_module_core_ro_nx(struct module *mod)
+static void module_enable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_core + mod->core_text_size,
- mod->module_core + mod->core_size,
- set_memory_x);
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_writable_data(&mod->core_layout, set_memory_nx);
+ frob_rodata(&mod->init_layout, set_memory_nx);
+ frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void unset_module_init_ro_nx(struct module *mod)
+static void module_disable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_init + mod->init_text_size,
- mod->module_init + mod->init_size,
- set_memory_x);
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_x);
+ frob_writable_data(&mod->core_layout, set_memory_x);
+ frob_rodata(&mod->init_layout, set_memory_x);
+ frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */
@@ -1942,16 +1929,9 @@ void set_all_modules_text_rw(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_rw);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_rw);
- }
+
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
}
mutex_unlock(&module_mutex);
}
@@ -1965,23 +1945,25 @@ void set_all_modules_text_ro(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_ro);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_ro);
- }
+
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
}
mutex_unlock(&module_mutex);
}
+
+static void disable_ro_nx(const struct module_layout *layout)
+{
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_x);
+ frob_writable_data(layout, set_memory_x);
+}
+
#else
-static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static void unset_module_core_ro_nx(struct module *mod) { }
-static void unset_module_init_ro_nx(struct module *mod) { }
+static void disable_ro_nx(const struct module_layout *layout) { }
+static void module_enable_nx(const struct module *mod) { }
+static void module_disable_nx(const struct module *mod) { }
#endif
void __weak module_memfree(void *module_region)
@@ -2033,19 +2015,19 @@ static void free_module(struct module *mod)
synchronize_sched();
mutex_unlock(&module_mutex);
- /* This may be NULL, but that's OK */
- unset_module_init_ro_nx(mod);
+ /* This may be empty, but that's OK */
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
+ module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
/* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- unset_module_core_ro_nx(mod);
- module_memfree(mod->module_core);
+ disable_ro_nx(&mod->core_layout);
+ module_memfree(mod->core_layout.base);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -2248,20 +2230,20 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| strstarts(sname, ".init"))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
+ s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->core_size = debug_align(mod->core_size);
- mod->core_text_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.text_size = mod->core_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->core_size = debug_align(mod->core_size);
- mod->core_ro_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_size = mod->core_layout.size;
break;
case 3: /* whole core */
- mod->core_size = debug_align(mod->core_size);
+ mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
}
@@ -2277,21 +2259,21 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| !strstarts(sname, ".init"))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
+ s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->init_size = debug_align(mod->init_size);
- mod->init_text_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.text_size = mod->init_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->init_size = debug_align(mod->init_size);
- mod->init_ro_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.ro_size = mod->init_layout.size;
break;
case 3: /* whole init */
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
}
@@ -2401,7 +2383,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
if (sym->st_shndx == SHN_UNDEF)
return 'U';
- if (sym->st_shndx == SHN_ABS)
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
return 'a';
if (sym->st_shndx >= SHN_LORESERVE)
return '?';
@@ -2430,7 +2412,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
@@ -2439,6 +2421,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|| !src->st_name)
return false;
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
sec = sechdrs + src->st_shndx;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
@@ -2466,7 +2453,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+ symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
@@ -2476,23 +2463,24 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
}
/* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_size += strtab_size;
- mod->core_size = debug_align(mod->core_size);
+ info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_layout.size += strtab_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+ strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
@@ -2513,12 +2501,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
for (i = 0; i < mod->num_symtab; i++)
mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
- mod->core_symtab = dst = mod->module_core + info->symoffs;
- mod->core_strtab = s = mod->module_core + info->stroffs;
+ mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_strtab = s = mod->core_layout.base + info->stroffs;
src = mod->symtab;
for (ndst = i = 0; i < mod->num_symtab; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_strtab;
s += strlcpy(s, &mod->strtab[src[i].st_name],
@@ -2964,7 +2953,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc(mod->core_size);
+ ptr = module_alloc(mod->core_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2974,11 +2963,11 @@ static int move_module(struct module *mod, struct load_info *info)
if (!ptr)
return -ENOMEM;
- memset(ptr, 0, mod->core_size);
- mod->module_core = ptr;
+ memset(ptr, 0, mod->core_layout.size);
+ mod->core_layout.base = ptr;
- if (mod->init_size) {
- ptr = module_alloc(mod->init_size);
+ if (mod->init_layout.size) {
+ ptr = module_alloc(mod->init_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -2987,13 +2976,13 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_memfree(mod->module_core);
+ module_memfree(mod->core_layout.base);
return -ENOMEM;
}
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ memset(ptr, 0, mod->init_layout.size);
+ mod->init_layout.base = ptr;
} else
- mod->module_init = NULL;
+ mod->init_layout.base = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -3005,10 +2994,10 @@ static int move_module(struct module *mod, struct load_info *info)
continue;
if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->module_init
+ dest = mod->init_layout.base
+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + shdr->sh_entsize;
+ dest = mod->core_layout.base + shdr->sh_entsize;
if (shdr->sh_type != SHT_NOBITS)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
@@ -3070,12 +3059,12 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
+ if (mod->init_layout.base)
+ flush_icache_range((unsigned long)mod->init_layout.base,
+ (unsigned long)mod->init_layout.base
+ + mod->init_layout.size);
+ flush_icache_range((unsigned long)mod->core_layout.base,
+ (unsigned long)mod->core_layout.base + mod->core_layout.size);
set_fs(old_fs);
}
@@ -3133,8 +3122,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- module_memfree(mod->module_core);
+ module_memfree(mod->init_layout.base);
+ module_memfree(mod->core_layout.base);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3221,7 +3210,7 @@ static noinline int do_init_module(struct module *mod)
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->module_init;
+ freeinit->module_init = mod->init_layout.base;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3279,12 +3268,12 @@ static noinline int do_init_module(struct module *mod)
mod->strtab = mod->core_strtab;
#endif
mod_tree_remove_init(mod);
- unset_module_init_ro_nx(mod);
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
+ mod->init_layout.base = NULL;
+ mod->init_layout.size = 0;
+ mod->init_layout.ro_size = 0;
+ mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
@@ -3373,17 +3362,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+ /* Set RO and NX regions */
+ module_enable_ro(mod);
+ module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
@@ -3548,8 +3529,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
MODULE_STATE_GOING, mod);
/* we can't deallocate the module until we clear memory protection */
- unset_module_init_ro_nx(mod);
- unset_module_core_ro_nx(mod);
+ module_disable_ro(mod);
+ module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(info->debug);
@@ -3578,7 +3559,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
module_deallocate(mod, info);
free_copy:
@@ -3656,9 +3637,9 @@ static const char *get_ksymbol(struct module *mod,
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
else
- nextval = (unsigned long)mod->module_core+mod->core_text_size;
+ nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
@@ -3905,7 +3886,7 @@ static int m_show(struct seq_file *m, void *p)
return 0;
seq_printf(m, "%s %u",
- mod->name, mod->init_size + mod->core_size);
+ mod->name, mod->init_layout.size + mod->core_layout.size);
print_unload_info(m, mod);
/* Informative for users. */
@@ -3914,7 +3895,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->core_layout.base);
/* Taints info */
if (mod->taints)
@@ -4057,8 +4038,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->module_init, mod->init_text_size)
- && !within(addr, mod->module_core, mod->core_text_size))
+ if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+ && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
mod = NULL;
}
return mod;
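
A worked example of the module_layout diagram introduced above, using hypothetical page-aligned sizes (text_size = 12K, ro_size = 16K, size = 24K, 4K pages):

	frob_text():          [base,       base + 12K)  3 pages
	frob_rodata():        [base + 12K, base + 16K)  1 page
	frob_writable_data(): [base + 16K, base + 24K)  2 pages

module_enable_ro() marks the first two regions read-only, module_enable_nx() marks the last two non-executable, and disable_ro_nx() undoes both for a layout that is about to be freed.
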
diff --git a/kernel/panic.c b/kernel/panic.c
index b333380c6bb2..d96469de72dc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...)
* panic() is not being called from OOPS.
*/
debug_locks_off();
- console_trylock();
- console_unlock();
+ console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80d44..f4ad91b746f1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 02e8dfaa1ce2..68d3ebc12601 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -235,7 +235,7 @@ config PM_TRACE_RTC
config APM_EMULATION
tristate "Advanced Power Management Emulation"
- depends on PM && SYS_SUPPORTS_APM_EMULATION
+ depends on SYS_SUPPORTS_APM_EMULATION
help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b2dd4d999900..27946975eff0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index caadb566e82b..efe1b3b17c88 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
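
A usage sketch for the new power_attr_ro() helper (the attribute name is hypothetical): a read-only attribute now needs only a _show handler, with no stub _store returning -EINVAL:

	static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
				char *buf)
	{
		return sprintf(buf, "%d\n", 42);
	}
	power_attr_ro(foo);	/* declares foo_attr: mode S_IRUGO, .show = foo_show */
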
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..c963ba534a78 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
+#include <asm-generic/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -232,7 +233,11 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
-};
+}
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+__packed __aligned(4)
+#endif
+;
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
@@ -273,11 +278,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static int recursion_bug;
+ static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* it can be printed at the next appropriate moment:
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
+ recursion_bug = true;
local_irq_restore(flags);
return 0;
}
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
static const char recursion_msg[] =
"BUG: recent printk recursion!";
- recursion_bug = 0;
+ recursion_bug = false;
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg,
@@ -2233,13 +2234,24 @@ void console_unlock(void)
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
- bool retry;
+ bool do_cond_resched, retry;
if (console_suspended) {
up_console_sem();
return;
}
+ /*
+ * Console drivers are called under logbuf_lock, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system.
+ */
+ do_cond_resched = console_may_schedule;
console_may_schedule = 0;
/* flush buffered message fragment immediately to console */
@@ -2311,6 +2323,9 @@ skip:
call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
+
+ if (do_cond_resched)
+ cond_resched();
}
console_locked = 0;
@@ -2378,6 +2393,25 @@ void console_unblank(void)
console_unlock();
}
+/**
+ * console_flush_on_panic - flush console content on panic
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(void)
+{
+ /*
+ * If someone else is holding the console lock, trylock will fail
+ * and may_schedule may be set. Ignore and proceed to unlock so
+ * that messages are flushed out. As this can be called from any
+ * context and we don't want to get preempted while flushing,
+ * ensure may_schedule is cleared.
+ */
+ console_trylock();
+ console_may_schedule = 0;
+ console_unlock();
+}
+
/*
* Return the console tty driver structure and its associated index
*/
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
static int __init printk_late_init(void)
{
struct console *con;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ /*
+ * Make sure to unregister boot consoles whose data
+ * resides in the init section before the init section
+ * is discarded. Boot consoles whose data will stick
+ * around will automatically be unregistered when the
+ * proper console replaces them.
+ */
+ if (init_section_intersects(con, sizeof(*con)))
+ unregister_console(con);
}
}
hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b760bae64cf1..2341efe7fe02 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
+ int dumpable = 0;
+ kuid_t caller_uid;
+ kgid_t caller_gid;
+
+ if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) {
+ WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n");
+ return -EPERM;
+ }
/* May we inspect the given task?
* This check is used both for attaching with ptrace
@@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* because setting up the necessary parent/child relationship
* or halting the specified task is impossible.
*/
- int dumpable = 0;
+
/* Don't let security modules deny introspection */
if (same_thread_group(task, current))
return 0;
rcu_read_lock();
+ if (mode & PTRACE_MODE_FSCREDS) {
+ caller_uid = cred->fsuid;
+ caller_gid = cred->fsgid;
+ } else {
+ /*
+ * Using the euid would make more sense here, but something
+ * in userland might rely on the old behavior, and this
+ * shouldn't be a security problem since
+ * PTRACE_MODE_REALCREDS implies that the caller explicitly
+ * used a syscall that requests access to another process
+ * (and not a filesystem syscall to procfs).
+ */
+ caller_uid = cred->uid;
+ caller_gid = cred->gid;
+ }
tcred = __task_cred(task);
- if (uid_eq(cred->uid, tcred->euid) &&
- uid_eq(cred->uid, tcred->suid) &&
- uid_eq(cred->uid, tcred->uid) &&
- gid_eq(cred->gid, tcred->egid) &&
- gid_eq(cred->gid, tcred->sgid) &&
- gid_eq(cred->gid, tcred->gid))
+ if (uid_eq(caller_uid, tcred->euid) &&
+ uid_eq(caller_uid, tcred->suid) &&
+ uid_eq(caller_uid, tcred->uid) &&
+ gid_eq(caller_gid, tcred->egid) &&
+ gid_eq(caller_gid, tcred->sgid) &&
+ gid_eq(caller_gid, tcred->gid))
goto ok;
if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
@@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request,
goto out;
task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
task_unlock(task);
if (retval)
goto unlock_creds;
@@ -364,8 +387,14 @@ unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
if (!retval) {
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
+ */
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
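
An illustrative caller sketch (not part of this hunk, which only converts ptrace_attach): filesystem-driven checks such as procfs readers pass the FSCREDS variant, while syscalls acting directly on another task pass REALCREDS, as ptrace_attach() does above:

	/* e.g. in a /proc/<pid> read path: check against fs{uid,gid} */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		return -EACCES;
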
diff --git a/kernel/relay.c b/kernel/relay.c
index 0b4570cfacae..074994bcfa9b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&file_inode(filp)->i_mutex);
+ inode_lock(file_inode(filp));
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&file_inode(filp)->i_mutex);
+ inode_unlock(file_inode(filp));
return desc->written;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index f150dbbe6f62..09c0597840b0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
+ /*
+ * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
+ * or CONFIG_IO_STRICT_DEVMEM is enabled and the
+ * resource is busy.
+ */
+ if ((p->flags & IORESOURCE_BUSY) == 0)
+ continue;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5b0a..bc54e84675da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77d97a6fc715..63d3a24e081a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i = sched_feat_set(cmp);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d5ff5c6bf829..b2ab2ffb1adc 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,9 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..544a7133cbd1 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -97,12 +97,6 @@ void default_idle_call(void)
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0) {
- default_idle_call();
- return next_state;
- }
-
/*
* The idle task must be scheduled, it is pointless to go to idle, just
* update no idle residency and return.
@@ -168,7 +162,7 @@ static void cpuidle_idle_call(void)
*/
if (idle_should_freeze()) {
entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state >= 0) {
+ if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
@@ -219,6 +213,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 580ac2d4024f..15a1795bbba1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void)
put_seccomp_filter(thread);
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
/*
* Opt the other thread into seccomp if needed.
* As threads are considered to be trust-realm
* equivalent (see ptrace_may_access), it is safe to
* allow one thread to transition the other.
*/
- if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
- /*
- * Don't let an unprivileged task work around
- * the no_new_privs restriction by creating
- * a thread that sets it up, enters seccomp,
- * then dies.
- */
- if (task_no_new_privs(caller))
- task_set_no_new_privs(thread);
-
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
- }
}
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index edb6de4f5908..a467e6c28a3b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
-
static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
mutex_unlock(&stop_cpus_mutex);
return ret ?: done.ret;
}
-
-#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sys.c b/kernel/sys.c
index 6af9212ab5aa..78947de6f969 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
}
- if (prctl_map.exe_fd != (u32)-1)
+ if (prctl_map.exe_fd != (u32)-1) {
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
- down_read(&mm->mmap_sem);
- if (error)
- goto out;
+ if (error)
+ return error;
+ }
+
+ down_write(&mm->mmap_sem);
/*
* We don't validate if these members are pointing to
@@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- error = 0;
-out:
- up_read(&mm->mmap_sem);
- return error;
+ up_write(&mm->mmap_sem);
+ return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
- down_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
vma = find_vma(mm, addr);
prctl_map.start_code = mm->start_code;
@@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
- up_read(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
return error;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0623787ec67a..2c5e3a8e00d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..97715fd9e790 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -173,7 +173,7 @@ extern int no_unaligned_warning;
#define SYSCTL_WRITES_WARN 0
#define SYSCTL_WRITES_STRICT 1
-static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
@@ -1735,6 +1757,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -2047,9 +2083,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
int *i, vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2078,15 +2113,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first=0) {
@@ -2094,11 +2123,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
if (!left)
break;
- err = proc_get_long(&kbuf, &left, &lval, &neg,
+ err = proc_get_long(&p, &left, &lval, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2125,10 +2154,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2310,9 +2338,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
{
unsigned long *i, *min, *max;
int vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2340,15 +2367,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first = 0) {
@@ -2357,9 +2378,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (write) {
bool neg;
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
- err = proc_get_long(&kbuf, &left, &val, &neg,
+ err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2385,10 +2406,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2650,34 +2670,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
if (write) {
- unsigned long page = 0;
- char *kbuf;
+ char *kbuf, *p;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- free_page(page);
- return -EFAULT;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
GFP_KERNEL);
if (!tmp_bitmap) {
- free_page(page);
+ kfree(kbuf);
return -ENOMEM;
}
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
- err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
if (err)
break;
@@ -2688,12 +2701,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
val_b = val_a;
if (left) {
- kbuf++;
+ p++;
left--;
}
if (c == '-') {
- err = proc_get_long(&kbuf, &left, &val_b,
+ err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
if (err)
@@ -2704,16 +2717,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
break;
}
if (left) {
- kbuf++;
+ p++;
left--;
}
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
}
- free_page(page);
+ kfree(kbuf);
} else {
unsigned long bit_a, bit_b = 0;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99ef0df12807..9d7a053545f5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
}
@@ -387,7 +387,7 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_enabled __read_mostly = 1;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
@@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
/*
@@ -717,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int
update_cpu_load_nohz(active);
calc_load_exit_idle();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
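The tick/nohz paths now call the scheduler-only variant of the softlockup touch helper. Per the kernel/watchdog.c hunks later in this diff, touch_softlockup_watchdog_sched() only resets the softlockup timestamp, while the plain touch_softlockup_watchdog() additionally feeds the new workqueue watchdog. A hedged sketch of how the two entry points are meant to be used; both callers are hypothetical:

/* Scheduler-internal stall (e.g. the tick was stopped while idle):
 * only the softlockup detector should be pacified. */
static void my_sched_path(void)
{
	touch_softlockup_watchdog_sched();
}

/* Anything else (a long-running driver or console path): use the plain
 * helper, which also calls wq_watchdog_touch() for this CPU. */
static void my_driver_path(void)
{
	touch_softlockup_watchdog();
}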
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a990824c8604..2aeb6ffc0a1e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
- msg = kmalloc(count + 1, GFP_KERNEL);
- if (msg == NULL)
- return -ENOMEM;
-
- if (copy_from_user(msg, buffer, count)) {
- kfree(msg);
- return -EFAULT;
- }
+ msg = memdup_user_nul(buffer, count);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
- msg[count] = '\0';
bt = filp->private_data;
__trace_note_message(bt, "%s", msg);
kfree(msg);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4228fd3682c3..45dd798bcd37 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -316,7 +316,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static struct bpf_verifier_ops kprobe_prog_ops = {
+static const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f743b147247..eca592f977b2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
@@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
-static struct ftrace_ops control_ops;
-
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -203,7 +196,7 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void control_ops_disable_all(struct ftrace_ops *ops)
+static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops)
*per_cpu_ptr(ops->disabled, cpu) = 1;
}
-static int control_ops_alloc(struct ftrace_ops *ops)
+static int per_cpu_ops_alloc(struct ftrace_ops *ops)
{
int __percpu *disabled;
+ if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
+ return -EINVAL;
+
disabled = alloc_percpu(int);
if (!disabled)
return -ENOMEM;
ops->disabled = disabled;
- control_ops_disable_all(ops);
+ per_cpu_ops_disable_all(ops);
return 0;
}
@@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { }
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
+ * If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static void add_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int first = *list == &ftrace_list_end;
- add_ftrace_ops(list, ops);
- if (first)
- add_ftrace_ops(&ftrace_ops_list, main_ops);
-}
-
-static int remove_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int ret = remove_ftrace_ops(list, ops);
- if (!ret && *list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
- return ret;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops);
static int __register_ftrace_function(struct ftrace_ops *ops)
@@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- if (control_ops_alloc(ops))
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
+ if (per_cpu_ops_alloc(ops))
return -ENOMEM;
- add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
- /* The control_ops needs the trampoline update */
- ops = &control_ops;
- } else
- add_ftrace_ops(&ftrace_ops_list, ops);
+ }
+
+ add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
@@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- ret = remove_ftrace_list_ops(&ftrace_control_list,
- &control_ops, ops);
- } else
- ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
if (ret < 0)
return ret;
@@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (all) {
/*
* Only the filter_hash affects all records.
@@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
}
-static void print_ip_ins(const char *fmt, unsigned char *p)
+static void print_ip_ins(const char *fmt, const unsigned char *p)
{
int i;
@@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
+enum ftrace_bug_type ftrace_bug_type;
+const void *ftrace_expected;
+
+static void print_bug_type(void)
+{
+ switch (ftrace_bug_type) {
+ case FTRACE_BUG_UNKNOWN:
+ break;
+ case FTRACE_BUG_INIT:
+ pr_info("Initializing ftrace call sites\n");
+ break;
+ case FTRACE_BUG_NOP:
+ pr_info("Setting ftrace call site to NOP\n");
+ break;
+ case FTRACE_BUG_CALL:
+ pr_info("Setting ftrace call site to call ftrace function\n");
+ break;
+ case FTRACE_BUG_UPDATE:
+ pr_info("Updating ftrace call site to call a different ftrace function\n");
+ break;
+ }
+}
/**
* ftrace_bug - report and shutdown function tracer
@@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
- print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
+ if (ftrace_expected) {
+ print_ip_ins(" expected: ", ftrace_expected);
+ pr_cont("\n");
+ }
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ print_bug_type();
if (rec) {
struct ftrace_ops *ops = NULL;
@@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
rec->flags & FTRACE_FL_REGS ? " R" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- pr_cont("\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ pr_cont("\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
pr_cont("\ttramp: ERROR!");
}
ip = ftrace_get_addr_curr(rec);
- pr_cont(" expected tramp: %lx\n", ip);
+ pr_cont("\n expected tramp: %lx\n", ip);
}
}
@@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ return FTRACE_UPDATE_IGNORE;
+
/*
* If we are updating calls:
*
@@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* from the save regs, to a non-save regs function or
* vice versa, or from a trampoline call.
*/
- if (flag & FTRACE_FL_ENABLED)
+ if (flag & FTRACE_FL_ENABLED) {
+ ftrace_bug_type = FTRACE_BUG_CALL;
return FTRACE_UPDATE_MAKE_CALL;
+ }
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return FTRACE_UPDATE_MODIFY_CALL;
}
@@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
FTRACE_FL_REGS_EN);
}
+ ftrace_bug_type = FTRACE_BUG_NOP;
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
+ struct ftrace_ops *op)
+{
+ unsigned long ip = rec->ip;
+
+ while_for_each_ftrace_op(op) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ }
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
@@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
+ ftrace_bug_type = FTRACE_BUG_CALL;
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
+ ftrace_bug_type = FTRACE_BUG_NOP;
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
+ ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
return 0;
}
@@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void control_ops_free(struct ftrace_ops *ops)
+static void per_cpu_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
}
@@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
+ * If these are per_cpu ops, they still need their
* per_cpu field freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
return 0;
}
@@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the control
+ * The same goes for freeing the per_cpu data of the per_cpu
* ops.
*
* Again, normal synchronize_sched() is not good enough.
@@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
arch_ftrace_trampoline_free(ops);
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
}
return 0;
@@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- /* If ops traces all mods, we already accounted for it */
+ /* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 0;
+ return 1;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
@@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 1;
}
-static int referenced_filters(struct dyn_ftrace *rec)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
- }
-
- return cnt;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
- unsigned long ref = 0;
- bool test = false;
+ unsigned long rec_flags = 0;
int i;
+ start = ftrace_now(raw_smp_processor_id());
+
/*
- * When adding a module, we need to check if tracers are
- * currently enabled and if they are set to trace all functions.
- * If they are, we need to enable the module functions as well
- * as update the reference counts for those function records.
+ * When a module is loaded, this function is called to convert
+ * the calls to mcount in its text to nops, and also to create
+ * an entry in the ftrace data. Now, if ftrace is activated
+ * after this call, but before the module sets its text to
+ * read-only, the modification of enabling ftrace can fail if
+ * the read-only is done while ftrace is converting the calls.
+ * To prevent this, the module's records are set as disabled
+ * and will be enabled after the call to set the module's text
+ * to read-only.
*/
- if (mod) {
- struct ftrace_ops *ops;
-
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- if (ops_traces_mod(ops))
- ref++;
- else
- test = true;
- }
- }
- }
-
- start = ftrace_now(raw_smp_processor_id());
+ if (mod)
+ rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
- int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- if (test)
- cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags = rec_flags;
/*
* Do the initial record conversion from mcount jump
@@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
break;
update_cnt++;
-
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && cnt) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed)
- ftrace_bug(failed, p);
- }
}
}
@@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- struct ftrace_ops *ops = NULL;
+ struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
@@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v)
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- seq_printf(m, "\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ seq_printf(m, "\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ add_trampoline_func(m, ops, rec);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
seq_puts(m, "\ttramp: ERROR!");
-
+ } else {
+ add_trampoline_func(m, NULL, rec);
}
- add_trampoline_func(m, ops, rec);
}
seq_putc(m, '\n');
@@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod,
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
@@ -4940,41 +4961,112 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+static void ftrace_module_enable(struct module *mod)
{
- if (ftrace_disabled || start == end)
- return;
- ftrace_process_locs(mod, start, end);
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+
+ mutex_lock(&ftrace_lock);
+
+ if (ftrace_disabled)
+ goto out_unlock;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediately is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ *
+ * We also delay this to after the module code already set the
+ * text to read-only, as we now need to set it back to read-write
+ * so that we can modify the text.
+ */
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_prepare();
+
+ do_for_each_ftrace_rec(pg, rec) {
+ int cnt;
+ /*
+ * do_for_each_ftrace_rec() is a double loop.
+ * module text shares the pg. If a record is
+ * not part of this module, then skip this pg,
+ * which the "break" will do.
+ */
+ if (!within_module_core(rec->ip, mod))
+ break;
+
+ cnt = 0;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and, if they are and can trace this record,
+ * enable the module functions as well as update the
+ * reference counts for those function records.
+ */
+ if (ftrace_start_up)
+ cnt += referenced_filters(rec);
+
+ /* This clears FTRACE_FL_DISABLED */
+ rec->flags = cnt;
+
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(rec, 1);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ goto out_loop;
+ }
+ }
+
+ } while_for_each_ftrace_rec();
+
+ out_loop:
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_post_process();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
}
void ftrace_module_init(struct module *mod)
{
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
+ if (ftrace_disabled || !mod->num_ftrace_callsites)
+ return;
+
+ ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
}
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- if (val == MODULE_STATE_GOING)
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ftrace_module_enable(mod);
+ break;
+ case MODULE_STATE_GOING:
ftrace_release_mod(mod);
+ break;
+ default:
+ break;
+ }
return 0;
}
#else
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_exit_nb = {
- .notifier_call = ftrace_module_notify_exit,
+struct notifier_block ftrace_module_nb = {
+ .notifier_call = ftrace_module_notify,
.priority = INT_MIN, /* Run after anything that can remove kprobes */
};
@@ -5006,7 +5098,7 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_exit_nb);
+ ret = register_module_notifier(&ftrace_module_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
@@ -5116,44 +5208,6 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
- return;
-
- /*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
- */
- preempt_disable_notrace();
- trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) uses RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
- do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!(op->flags & FTRACE_OPS_FL_STUB) &&
- !ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip, regs))
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
- out:
- trace_recursion_clear(TRACE_CONTROL_BIT);
- preempt_enable_notrace();
-}
-
-static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(control_ops)
-};
-
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
@@ -5170,8 +5224,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs)) {
+ /*
+ * Check the following for each ops before calling their func:
+ * if RCU flag is set, then rcu_is_watching() must be true
+ * if PER_CPU is set, then ftrace_function_local_disabled()
+ * must be false
+ * Otherwise test if the ip matches the ops filter
+ *
+ * If any of the above fails then the op->func() is not executed.
+ */
+ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
+ (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) &&
+ ftrace_ops_test(op, ip, regs)) {
+
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -5195,7 +5263,7 @@ out:
* being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
- * set the ARCH_SUPPORT_FTARCE_OPS.
+ * set the ARCH_SUPPORTS_FTRACE_OPS.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -5212,20 +5280,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
/*
* If there's only one function registered but it does not support
- * recursion, this function will be called by the mcount trampoline.
- * This function will handle recursion protection.
+ * recursion, needs RCU protection and/or requires per cpu handling, then
+ * this function will be called by the mcount trampoline.
*/
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
+ if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
+ return;
+
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
- op->func(ip, parent_ip, op, regs);
+ preempt_disable_notrace();
+ if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) {
+ op->func(ip, parent_ip, op, regs);
+ }
+
+ preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -5243,12 +5320,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the func handles its own recursion, call it directly.
- * Otherwise call the recursion protected function that
- * will call the ftrace ops function.
+ * If the function does not handle recursion, needs to be RCU safe,
+ * or does per cpu logic, then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
- return ftrace_ops_recurs_func;
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
+ ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ return ftrace_ops_assist_func;
return ops->func;
}
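The FTRACE_OPS_FL_CONTROL machinery is gone; its only user (perf, see kernel/trace/trace_event_perf.c below) now asks for the same behaviour with two independent flags, and ftrace_ops_assist_func() performs the RCU and per-CPU checks that the dedicated control list used to do. A hedged sketch of registering such an ops after this change; my_ops, my_trace_func() and the callback body are illustrative only:

static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *regs)
{
	/*
	 * With the flags below, the assist handler only calls this when
	 * rcu_is_watching() is true and the ops is not locally disabled
	 * on this CPU, and it runs with preemption disabled.
	 */
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU,
};

/* register_ftrace_function(&my_ops) now lands on the ordinary ops list;
 * per_cpu_ops_alloc() sets up the per-CPU disabled counter. */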
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a27ba3..95181e36891a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old,
/*
* rb_tail_page_update - move the tail page forward
- *
- * Returns 1 if moved tail page, 0 if someone else did.
*/
-static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
- struct buffer_page *old_tail;
unsigned long old_entries;
unsigned long old_write;
- int ret = 0;
/*
* The tail page now needs to be moved forward.
@@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
* it is, then it is up to us to update the tail
* pointer.
*/
- if (tail_page == cpu_buffer->tail_page) {
+ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
@@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- old_tail = cmpxchg(&cpu_buffer->tail_page,
- tail_page, next_page);
-
- if (old_tail == tail_page)
- ret = 1;
+ /* Again, either we update tail_page or an interrupt does */
+ (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
-
- return ret;
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
+ struct buffer_page *buffer_tail_page;
+
+ buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
/*
* If the tail had moved passed next, then we need
* to reset the pointer.
*/
- if (cpu_buffer->tail_page != tail_page &&
- cpu_buffer->tail_page != next_page)
+ if (buffer_tail_page != tail_page &&
+ buffer_tail_page != next_page)
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
@@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
/*
* This is the slow path, force gcc not to inline it.
*/
@@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
- u64 ts;
next_page = tail_page;
@@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
}
- ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
- if (ret) {
- /*
- * Nested commits always have zero deltas, so
- * just reread the time stamp
- */
- ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = ts;
- }
+ rb_tail_page_update(cpu_buffer, tail_page, next_page);
out_again:
rb_reset_tail(cpu_buffer, tail, info);
+ /* Commit what we have for now. */
+ rb_end_commit(cpu_buffer);
+ /* rb_end_commit() decs committing */
+ local_inc(&cpu_buffer->committing);
+
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
addr = (unsigned long)event;
addr &= PAGE_MASK;
- bpage = cpu_buffer->tail_page;
+ bpage = READ_ONCE(cpu_buffer->tail_page);
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
@@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
again:
max_count = cpu_buffer->nr_pages * 100;
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
@@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ /* Only update the write stamp if the page has an event */
+ if (rb_page_write(cpu_buffer->commit_page))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
@@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp))
info->length += RB_LEN_TIME_EXTEND;
- tail_page = info->tail_page = cpu_buffer->tail_page;
+ /* Don't let the compiler play games with cpu_buffer->tail_page */
+ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
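rb_tail_page_update() no longer reports whether it won the race; callers re-read cpu_buffer->tail_page through READ_ONCE() so the compiler cannot reuse a stale value, and the cmpxchg() result is deliberately ignored because either this context or a nesting interrupt moving the tail forward is acceptable. Reduced to its essentials, the pattern looks roughly like this (a sketch with illustrative names, not the ring buffer code itself):

static void advance_tail(struct buffer_page **tail_slot,
			 struct buffer_page *tail,
			 struct buffer_page *next)
{
	/* Re-read the shared pointer exactly once. */
	if (READ_ONCE(*tail_slot) == tail) {
		/*
		 * Either this context or an interrupting writer updates the
		 * pointer; both outcomes are correct, so the result is dropped.
		 */
		(void)cmpxchg(tail_slot, tail, next);
	}
}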
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 87fb9801bd9e..d9293402ee68 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d07686f..8414fa40bf27 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -363,8 +363,8 @@ struct trace_option_dentry {
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
- * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
- * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @start: called when tracing is unpaused (echo 1 > tracing_on)
+ * @stop: called when tracing is paused (echo 0 > tracing_on)
* @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
@@ -467,8 +467,6 @@ enum {
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
- TRACE_CONTROL_BIT,
-
TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..00df25fd86ef 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
return register_ftrace_function(ops);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4f6ef6912e00..f333e57c4614 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1340,15 +1340,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
mutex_lock(&event_mutex);
file = event_file_data(filp);
@@ -1356,7 +1350,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
@@ -1507,18 +1501,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
err = apply_subsystem_event_filter(dir, buf);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 42a4009fd75a..b38f617b6181 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long)buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
strim(buf);
mutex_lock(&event_mutex);
event_file = event_file_data(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
return -ENODEV;
}
ret = trigger_process_regex(event_file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
if (ret < 0)
goto out;
@@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
@@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob,
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 88fefa68c516..9bafc211930c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent *extent = NULL;
- unsigned long page = 0;
- char *kbuf, *pos, *next_line;
+ char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
/*
@@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
- /* Get a buffer */
- ret = -ENOMEM;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!page)
- goto out;
-
/* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
/* Slurp in the user data */
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, count))
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf)) {
+ ret = PTR_ERR(kbuf);
+ kbuf = NULL;
goto out;
- kbuf[count] = '\0';
+ }
/* Parse the user data */
ret = -EINVAL;
@@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = count;
out:
mutex_unlock(&userns_state_mutex);
- if (page)
- free_page(page);
+ kfree(kbuf);
return ret;
}
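map_write() keeps a single exit path, so the conversion initialises kbuf to NULL and resets it when memdup_user_nul() fails; the shared cleanup can then call kfree() unconditionally, since kfree(NULL) is a no-op while kfree() on an ERR_PTR would crash. A minimal sketch of that shape (my_map_write() and the parsing step are hypothetical):

static ssize_t my_map_write(const char __user *buf, size_t count)
{
	char *kbuf = NULL;
	ssize_t ret = -EINVAL;

	if (count >= PAGE_SIZE)
		goto out;

	kbuf = memdup_user_nul(buf, count);
	if (IS_ERR(kbuf)) {
		ret = PTR_ERR(kbuf);
		kbuf = NULL;		/* never kfree() an ERR_PTR below */
		goto out;
	}

	/* ... parse kbuf ... */
	ret = count;
out:
	kfree(kbuf);			/* safe even when kbuf is NULL */
	return ret;
}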
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 84b5035cb6a5..b3ace6ebbba3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>
+#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -225,7 +226,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp());
}
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state. This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void)
*/
raw_cpu_write(watchdog_touch_ts, 0);
}
+
+void touch_softlockup_watchdog(void)
+{
+ touch_softlockup_watchdog_sched();
+ wq_watchdog_touch(raw_smp_processor_id());
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
@@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void)
*/
for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
+ wq_watchdog_touch(-1);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c579dbab2e36..61a0264e28f9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -148,6 +148,8 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
@@ -1385,6 +1389,8 @@ retry:
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
@@ -2157,6 +2163,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
@@ -2240,6 +2248,7 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2256,9 +2265,14 @@ repeat:
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
@@ -2316,6 +2330,37 @@ repeat:
goto repeat;
}
+/**
+ * check_flush_dependency - check for flush dependency sanity
+ * @target_wq: workqueue being flushed
+ * @target_work: work item being flushed (NULL for workqueue flushes)
+ *
+ * %current is trying to flush the whole @target_wq or @target_work on it.
+ * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
+ * reclaiming memory or running on a workqueue which doesn't have
+ * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
+ * a deadlock.
+ */
+static void check_flush_dependency(struct workqueue_struct *target_wq,
+ struct work_struct *target_work)
+{
+ work_func_t target_func = target_work ? target_work->func : NULL;
+ struct worker *worker;
+
+ if (target_wq->flags & WQ_MEM_RECLAIM)
+ return;
+
+ worker = current_wq_worker();
+
+ WARN_ONCE(current->flags & PF_MEMALLOC,
+ "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
+ current->pid, current->comm, target_wq->name, target_func);
+ WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+ "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
+ worker->current_pwq->wq->name, worker->current_func,
+ target_wq->name, target_func);
+}
+
struct wq_barrier {
struct work_struct work;
struct completion done;
@@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
+ check_flush_dependency(wq, NULL);
+
mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
@@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
pwq = worker->current_pwq;
}
+ check_flush_dependency(pwq->wq, work);
+
insert_wq_barrier(pwq, barr, work, worker);
spin_unlock_irq(&pool->lock);
@@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct apply_wqattrs_ctx *ctx;
- int ret = -ENOMEM;
/* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
@@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
ctx = apply_wqattrs_prepare(wq, attrs);
+ if (!ctx)
+ return -ENOMEM;
/* the ctx has been prepared successfully, let's commit it */
- if (ctx) {
- apply_wqattrs_commit(ctx);
- ret = 0;
- }
-
+ apply_wqattrs_commit(ctx);
apply_wqattrs_cleanup(ctx);
- return ret;
+ return 0;
}
/**
@@ -4308,7 +4355,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -5290,6 +5487,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);
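Two workqueue additions in this section are aimed at debugging stalls: check_flush_dependency() warns when a memory-reclaim context flushes a workqueue that has no rescuer, and the CONFIG_WQ_WATCHDOG timer dumps state when a pool's first pending item stops changing for workqueue.watchdog_thresh seconds (settable at boot or, per the comment above, through the corresponding sysfs module parameter). A hedged sketch of the flush rule the new check enforces; the workqueues, names and callback are illustrative:

#include <linux/workqueue.h>

static struct workqueue_struct *reclaim_wq;	/* has a rescuer */
static struct workqueue_struct *plain_wq;	/* no WQ_MEM_RECLAIM */

static void reclaim_work_fn(struct work_struct *work)
{
	/*
	 * This runs on reclaim_wq. Flushing plain_wq from here now trips
	 * the WARN_ONCE() in check_flush_dependency(): plain_wq has no
	 * rescuer, so it may never make progress under memory pressure,
	 * breaking the forward-progress guarantee reclaim_wq relies on.
	 */
	flush_workqueue(plain_wq);		/* don't do this */
}

static int __init my_wq_init(void)
{
	reclaim_wq = alloc_workqueue("my_reclaim", WQ_MEM_RECLAIM, 0);
	plain_wq   = alloc_workqueue("my_plain", 0, 0);
	if (!reclaim_wq || !plain_wq)
		return -ENOMEM;
	return 0;
}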