aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/bpf_iter.c539
-rw-r--r--kernel/bpf/bpf_lsm.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c2
-rw-r--r--kernel/bpf/btf.c49
-rw-r--r--kernel/bpf/cgroup.c146
-rw-r--r--kernel/bpf/core.c10
-rw-r--r--kernel/bpf/cpumap.c25
-rw-r--r--kernel/bpf/devmap.c133
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c125
-rw-r--r--kernel/bpf/inode.c5
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/map_in_map.c2
-rw-r--r--kernel/bpf/map_iter.c102
-rw-r--r--kernel/bpf/net_namespace.c373
-rw-r--r--kernel/bpf/queue_stack_maps.c4
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c501
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c597
-rw-r--r--kernel/bpf/task_iter.c353
-rw-r--r--kernel/bpf/verifier.c387
-rw-r--r--kernel/bpf/xskmap.c265
-rw-r--r--kernel/cgroup/cgroup.c27
-rw-r--r--kernel/cgroup/namespace.c5
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/debug/debug_core.c57
-rw-r--r--kernel/debug/kdb/kdb_main.c11
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/exit.c34
-rw-r--r--kernel/irq/Kconfig1
-rw-r--r--kernel/irq/irq_sim.c267
-rw-r--r--kernel/irq/irqdomain.c53
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq_work.c53
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/locking/lockdep.c86
-rw-r--r--kernel/module.c3
-rw-r--r--kernel/nsproxy.c305
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/relay.c7
-rw-r--r--kernel/sched/core.c257
-rw-r--r--kernel/sched/cpuacct.c7
-rw-r--r--kernel/sched/debug.c9
-rw-r--r--kernel/sched/fair.c262
-rw-r--r--kernel/sched/idle.c6
-rw-r--r--kernel/sched/pelt.c24
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h36
-rw-r--r--kernel/sched/smp.h9
-rw-r--r--kernel/sched/topology.c33
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/smp.c175
-rw-r--r--kernel/sysctl.c3077
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/bpf_trace.c241
-rw-r--r--kernel/trace/trace.c13
-rw-r--r--kernel/trace/trace_preemptirq.c39
-rw-r--r--kernel/umh.c2
-rw-r--r--kernel/user_namespace.c8
-rw-r--r--kernel/utsname.c5
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/watchdog.c12
69 files changed, 6026 insertions, 2825 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f2d7be596966..1131a921e1a6 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,9 +2,9 @@
obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -12,10 +12,8 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
+obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1d6120fd5ba6..11584618e861 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
+ bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
struct bpf_array *array;
@@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
mask64 -= 1;
index_mask = mask64;
- if (unpriv) {
+ if (!bypass_spec_v1) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
@@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
}
array->index_mask = index_mask;
- array->map.unpriv_array = unpriv;
+ array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
@@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
@@ -1058,7 +1058,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..dd612b80b9fe
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,539 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+struct bpf_iter_target_info {
+ struct list_head list;
+ const struct bpf_iter_reg *reg_info;
+ u32 btf_id; /* cached value */
+};
+
+struct bpf_iter_link {
+ struct bpf_link link;
+ struct bpf_iter_target_info *tinfo;
+};
+
+struct bpf_iter_priv_data {
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u64 session_id;
+ u64 seq_num;
+ bool done_stop;
+ u8 target_private[] __aligned(8);
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num--;
+}
+
+static void bpf_iter_done_stop(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->done_stop = true;
+}
+
+/* bpf_seq_read, a customized and simpler version for bpf iterator.
+ * no_llseek is assumed for this file.
+ * The following are differences from seq_read():
+ * . fixed buffer size (PAGE_SIZE)
+ * . assuming no_llseek
+ * . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ size_t n, offs, copied = 0;
+ int err = 0;
+ void *p;
+
+ mutex_lock(&seq->lock);
+
+ if (!seq->buf) {
+ seq->size = PAGE_SIZE;
+ seq->buf = kmalloc(seq->size, GFP_KERNEL);
+ if (!seq->buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+ }
+
+ if (seq->count) {
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf + seq->from, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ seq->count -= n;
+ seq->from += n;
+ copied = n;
+ goto done;
+ }
+
+ seq->from = 0;
+ p = seq->op->start(seq, &seq->index);
+ if (!p)
+ goto stop;
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ /* object is skipped, decrease seq_num, so next
+ * valid object can reuse the same seq_num.
+ */
+ bpf_iter_dec_seq_num(seq);
+ seq->count = 0;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ while (1) {
+ loff_t pos = seq->index;
+
+ offs = seq->count;
+ p = seq->op->next(seq, p, &seq->index);
+ if (pos == seq->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ seq->op->next);
+ seq->index++;
+ }
+
+ if (IS_ERR_OR_NULL(p))
+ break;
+
+ /* got a valid next object, increase seq_num */
+ bpf_iter_inc_seq_num(seq);
+
+ if (seq->count >= size)
+ break;
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ bpf_iter_dec_seq_num(seq);
+ seq->count = offs;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ seq->count = offs;
+ if (offs == 0) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+ }
+stop:
+ offs = seq->count;
+ /* bpf program called if !p */
+ seq->op->stop(seq, p);
+ if (!p) {
+ if (!seq_has_overflowed(seq)) {
+ bpf_iter_done_stop(seq);
+ } else {
+ seq->count = offs;
+ if (offs == 0) {
+ err = -E2BIG;
+ goto done;
+ }
+ }
+ }
+
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ copied = n;
+ seq->count -= n;
+ seq->from = n;
+done:
+ if (!copied)
+ copied = err;
+ else
+ *ppos += copied;
+ mutex_unlock(&seq->lock);
+ return copied;
+}
+
+static int iter_open(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_link *link = inode->i_private;
+
+ return prepare_seq_file(file, link);
+}
+
+static int iter_release(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ if (!seq)
+ return 0;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+
+ if (iter_priv->tinfo->reg_info->fini_seq_private)
+ iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
+
+ bpf_prog_put(iter_priv->prog);
+ seq->private = iter_priv;
+
+ return seq_release_private(inode, file);
+}
+
+const struct file_operations bpf_iter_fops = {
+ .open = iter_open,
+ .llseek = no_llseek,
+ .read = bpf_seq_read,
+ .release = iter_release,
+};
+
+/* The argument reg_info will be cached in bpf_iter_target_info.
+ * The common practice is to declare target reg_info as
+ * a const static variable and passed as an argument to
+ * bpf_iter_reg_target().
+ */
+int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ return -ENOMEM;
+
+ tinfo->reg_info = reg_info;
+ INIT_LIST_HEAD(&tinfo->list);
+
+ mutex_lock(&targets_mutex);
+ list_add(&tinfo->list, &targets);
+ mutex_unlock(&targets_mutex);
+
+ return 0;
+}
+
+void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+ bool found = false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (reg_info == tinfo->reg_info) {
+ list_del(&tinfo->list);
+ kfree(tinfo);
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ WARN_ON(found == false);
+}
+
+static void cache_btf_id(struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ tinfo->btf_id = prog->aux->attach_btf_id;
+}
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+ const char *attach_fname = prog->aux->attach_func_name;
+ u32 prog_btf_id = prog->aux->attach_btf_id;
+ const char *prefix = BPF_ITER_FUNC_PREFIX;
+ struct bpf_iter_target_info *tinfo;
+ int prefix_len = strlen(prefix);
+ bool supported = false;
+
+ if (strncmp(attach_fname, prefix, prefix_len))
+ return false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+ supported = true;
+ break;
+ }
+ if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
+ cache_btf_id(tinfo, prog);
+ supported = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ if (supported) {
+ prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
+ prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
+ }
+
+ return supported;
+}
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ kfree(iter_link);
+}
+
+static int bpf_iter_link_replace(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ int ret = 0;
+
+ mutex_lock(&link_mutex);
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (link->prog->type != new_prog->type ||
+ link->prog->expected_attach_type != new_prog->expected_attach_type ||
+ link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&link_mutex);
+ return ret;
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+ .release = bpf_iter_link_release,
+ .dealloc = bpf_iter_link_dealloc,
+ .update_prog = bpf_iter_link_replace,
+};
+
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+ return link->ops == &bpf_iter_link_lops;
+}
+
+int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_iter_link *link;
+ bool existed = false;
+ u32 prog_btf_id;
+ int err;
+
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ prog_btf_id = prog->aux->attach_btf_id;
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog_btf_id) {
+ existed = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+ if (!existed)
+ return -ENOENT;
+
+ link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+ link->tinfo = tinfo;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+}
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+ struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ priv_data->tinfo = tinfo;
+ priv_data->prog = prog;
+ priv_data->session_id = atomic64_inc_return(&session_id);
+ priv_data->seq_num = 0;
+ priv_data->done_stop = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+ struct bpf_iter_priv_data *priv_data;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u32 total_priv_dsize;
+ struct seq_file *seq;
+ int err = 0;
+
+ mutex_lock(&link_mutex);
+ prog = link->link.prog;
+ bpf_prog_inc(prog);
+ mutex_unlock(&link_mutex);
+
+ tinfo = link->tinfo;
+ total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+ tinfo->reg_info->seq_priv_size;
+ priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
+ total_priv_dsize);
+ if (!priv_data) {
+ err = -ENOMEM;
+ goto release_prog;
+ }
+
+ if (tinfo->reg_info->init_seq_private) {
+ err = tinfo->reg_info->init_seq_private(priv_data->target_private);
+ if (err)
+ goto release_seq_file;
+ }
+
+ init_seq_meta(priv_data, tinfo, prog);
+ seq = file->private_data;
+ seq->private = priv_data->target_private;
+
+ return 0;
+
+release_seq_file:
+ seq_release_private(file->f_inode, file);
+ file->private_data = NULL;
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+ struct file *file;
+ unsigned int flags;
+ int err, fd;
+
+ if (link->ops != &bpf_iter_link_lops)
+ return -EINVAL;
+
+ flags = O_RDONLY | O_CLOEXEC;
+ fd = get_unused_fd_flags(flags);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto free_fd;
+ }
+
+ err = prepare_seq_file(file,
+ container_of(link, struct bpf_iter_link, link));
+ if (err)
+ goto free_file;
+
+ fd_install(fd, file);
+ return fd;
+
+free_file:
+ fput(file);
+free_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+ void *seq_priv;
+
+ seq = meta->seq;
+ if (seq->file->f_op != &bpf_iter_fops)
+ return NULL;
+
+ seq_priv = seq->private;
+ iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+ target_private);
+
+ if (in_stop && iter_priv->done_stop)
+ return NULL;
+
+ meta->session_id = iter_priv->session_id;
+ meta->seq_num = iter_priv->seq_num;
+
+ return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+ int ret;
+
+ rcu_read_lock();
+ migrate_disable();
+ ret = BPF_PROG_RUN(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+
+ /* bpf program can only return 0 or 1:
+ * 0 : okay
+ * 1 : retry the same object
+ * The bpf_iter_run_prog() return value
+ * will be seq_ops->show() return value.
+ */
+ return ret == 0 ? 0 : -EAGAIN;
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 19636703b24e..fb278144e9fd 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -49,6 +49,6 @@ const struct bpf_prog_ops lsm_prog_ops = {
};
const struct bpf_verifier_ops lsm_verifier_ops = {
- .get_func_proto = bpf_tracing_func_proto,
+ .get_func_proto = tracing_prog_func_proto,
.is_valid_access = btf_ctx_access,
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 26cb51f2db72..c6b0decaa46a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
struct bpf_map *map;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d65c6912bdaf..58c9af1d4808 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3482,6 +3482,7 @@ extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
@@ -3508,6 +3509,7 @@ static u8 bpf_ctx_convert_map[] = {
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
@@ -3692,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
u32 nr_args, arg;
- int ret;
+ int i, ret;
if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
@@ -3789,6 +3791,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+
+ if (ctx_arg_info->offset == off) {
+ info->reg_type = ctx_arg_info->reg_type;
+ break;
+ }
+ }
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
@@ -3828,6 +3838,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
+ u32 vlen;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3836,7 +3847,43 @@ again:
return -EINVAL;
}
+ vlen = btf_type_vlen(t);
if (off + size > t->size) {
+ /* If the last element is a variable size array, we may
+ * need to relax the rule.
+ */
+ struct btf_array *array_elem;
+
+ if (vlen == 0)
+ goto error;
+
+ member = btf_type_member(t) + vlen - 1;
+ mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
+ NULL);
+ if (!btf_type_is_array(mtype))
+ goto error;
+
+ array_elem = (struct btf_array *)(mtype + 1);
+ if (array_elem->nelems != 0)
+ goto error;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ if (off < moff)
+ goto error;
+
+ /* Only allow structure for now, can be relaxed for
+ * other types later.
+ */
+ elem_type = btf_type_skip_modifiers(btf_vmlinux,
+ array_elem->type, NULL);
+ if (!btf_type_is_struct(elem_type))
+ goto error;
+
+ off = (off - moff) % elem_type->size;
+ return btf_struct_access(log, elem_type, off, size, atype,
+ next_btf_id);
+
+error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index cb305e71e7de..fdf7836750a3 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -557,8 +557,9 @@ found:
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
- struct bpf_prog *new_prog)
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
{
struct list_head *progs = &cgrp->bpf.progs[link->type];
struct bpf_prog *old_prog;
@@ -583,6 +584,30 @@ int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link,
return 0;
}
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ mutex_lock(&cgroup_mutex);
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
@@ -808,17 +833,56 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
kfree(cg_link);
}
-const struct bpf_link_ops bpf_cgroup_link_lops = {
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ cg_link->type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = cg_link->type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
.release = bpf_cgroup_link_release,
.dealloc = bpf_cgroup_link_dealloc,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
};
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_cgroup_link *link;
- struct file *link_file;
struct cgroup *cgrp;
- int err, link_fd;
+ int err;
if (attr->link_create.flags)
return -EINVAL;
@@ -832,26 +896,25 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_cgroup;
}
- bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog);
link->cgroup = cgrp;
link->type = attr->link_create.attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_cgroup;
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
BPF_F_ALLOW_MULTI);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_cgroup:
cgroup_put(cgrp);
@@ -1054,36 +1117,21 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return !allow;
}
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* fall through */
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -1137,16 +1185,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @head: sysctl table header
* @table: sysctl table
* @write: sysctl is being read (= 0) or written (= 1)
- * @buf: pointer to buffer passed by user space
+ * @buf: pointer to buffer (in and out)
* @pcount: value-result argument: value is size of buffer pointed to by @buf,
* result is size of @new_buf if program set new value, initial value
* otherwise
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @new_buf: pointer to pointer to new buffer that will be allocated if program
- * overrides new value provided by user space on sysctl write
- * NOTE: it's caller responsibility to free *new_buf if it was set
* @type: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
@@ -1157,8 +1202,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
+ void **buf, size_t *pcount, loff_t *ppos,
enum bpf_attach_type type)
{
struct bpf_sysctl_kern ctx = {
@@ -1173,36 +1217,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
.new_updated = 0,
};
struct cgroup *cgrp;
+ loff_t pos = 0;
int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
- if (ctx.cur_val) {
- mm_segment_t old_fs;
- loff_t pos = 0;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
- &ctx.cur_len, &pos)) {
- /* Let BPF program decide how to proceed. */
- ctx.cur_len = 0;
- }
- set_fs(old_fs);
- } else {
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
/* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
- if (write && buf && *pcount) {
+ if (write && *buf && *pcount) {
/* BPF program should be able to override new value with a
* buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
- if (!ctx.new_val ||
- copy_from_user(ctx.new_val, buf, ctx.new_len))
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
/* Let BPF program decide how to proceed. */
ctx.new_len = 0;
+ }
}
rcu_read_lock();
@@ -1213,7 +1249,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.cur_val);
if (ret == 1 && ctx.new_updated) {
- *new_buf = ctx.new_val;
+ kfree(*buf);
+ *buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
kfree(ctx.new_val);
@@ -1221,7 +1258,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
@@ -1326,7 +1362,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
@@ -1413,7 +1448,6 @@ out:
sockopt_free_buf(&ctx);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cf6fe9107f5c..9df4cc9a2907 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return;
bpf_prog_ksym_set_addr(fp);
@@ -1543,7 +1543,7 @@ select_insn:
/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
- * handeled like a helper, that is, bpf_tail_call_proto,
+ * handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
@@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32)
return res;
}
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
@@ -2151,6 +2156,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3fe0b006d2d2..27595fc6da56 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u64 cost;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
@@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
/* Part of headroom was reserved to xdpf */
hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
- /* build_skb need to place skb_shared_info after SKB end, and
- * also want to know the memory "truesize". Thus, need to
- * know the memory frame size backing xdp_buff.
- *
- * XDP was designed to have PAGE_SIZE frames, but this
- * assumption is not longer true with ixgbe and i40e. It
- * would be preferred to set frame_size to 2048 or 4096
- * depending on the driver.
- * frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_frame);
- *
- * Instead, with info avail, skb_shared_info in placed after
- * packet len. This, unfortunately fakes the truesize.
- * Another disadvantage of this approach, the skb_shared_info
- * is not at a fixed memory location, with mixed length
- * packets, which is bad for cache-line hotness.
+ /* Memory size backing xdp_frame data already have reserved
+ * room for build_skb to place skb_shared_info in tailroom.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ frame_size = xdpf->frame_sz;
pkt_data_start = xdpf->data - hard_start_headroom;
skb = build_skb_around(skb, pkt_data_start, frame_size);
@@ -636,7 +621,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
{
struct xdp_frame *xdpf;
- xdpf = convert_to_xdp_frame(xdp);
+ xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 58bdca5d978a..854b09beb16b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -52,7 +52,6 @@
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-#define DEV_MAP_BULK_SIZE 16
struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
@@ -61,12 +60,23 @@ struct xdp_dev_bulk_queue {
unsigned int count;
};
+/* DEVMAP values */
+struct bpf_devmap_val {
+ u32 ifindex; /* device index */
+ union {
+ int fd; /* prog fd on map write */
+ u32 id; /* prog id on map read */
+ } bpf_prog;
+};
+
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
+ struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
+ struct bpf_devmap_val val;
};
struct bpf_dtab {
@@ -106,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
+ u32 valsize = attr->value_size;
u64 cost = 0;
int err;
- /* check sanity of attributes */
+ /* check sanity of attributes. 2 value sizes supported:
+ * 4 bytes: ifindex
+ * 8 bytes: ifindex + prog fd
+ */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
+ valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
@@ -218,6 +234,8 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -232,6 +250,8 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -318,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
+bool dev_map_can_have_prog(struct bpf_map *map)
+{
+ if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
+ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
+ map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
+ return true;
+
+ return false;
+}
+
static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
@@ -435,13 +465,40 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(err))
return err;
- xdpf = convert_to_xdp_frame(xdp);
+ xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
return bq_enqueue(dev, xdpf, dev_rx);
}
+static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ u32 act;
+
+ xdp->txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ return xdp;
+ case XDP_DROP:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ break;
+ }
+
+ xdp_return_buff(xdp);
+ return NULL;
+}
+
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
@@ -453,6 +510,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
{
struct net_device *dev = dst->dev;
+ if (dst->xdp_prog) {
+ xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
+ if (!xdp)
+ return 0;
+ }
return __xdp_enqueue(dev, xdp, dev_rx);
}
@@ -473,18 +535,15 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
*(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
-
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void __dev_map_entry_free(struct rcu_head *rcu)
@@ -492,6 +551,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -542,9 +603,10 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab *dtab,
- u32 ifindex,
+ struct bpf_devmap_val *val,
unsigned int idx)
{
+ struct bpf_prog *prog = NULL;
struct bpf_dtab_netdev *dev;
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
@@ -552,24 +614,46 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- kfree(dev);
- return ERR_PTR(-EINVAL);
+ dev->dev = dev_get_by_index(net, val->ifindex);
+ if (!dev->dev)
+ goto err_out;
+
+ if (val->bpf_prog.fd >= 0) {
+ prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+ BPF_PROG_TYPE_XDP, false);
+ if (IS_ERR(prog))
+ goto err_put_dev;
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ goto err_put_prog;
}
dev->idx = idx;
dev->dtab = dtab;
+ if (prog) {
+ dev->xdp_prog = prog;
+ dev->val.bpf_prog.id = prog->aux->id;
+ } else {
+ dev->xdp_prog = NULL;
+ dev->val.bpf_prog.id = 0;
+ }
+ dev->val.ifindex = val->ifindex;
return dev;
+err_put_prog:
+ bpf_prog_put(prog);
+err_put_dev:
+ dev_put(dev->dev);
+err_out:
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
}
static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
u32 i = *(u32 *)key;
if (unlikely(map_flags > BPF_EXIST))
@@ -579,10 +663,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (!ifindex) {
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (!val.ifindex) {
dev = NULL;
+ /* can not specify fd if ifindex is 0 */
+ if (val.bpf_prog.fd != -1)
+ return -EINVAL;
} else {
- dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ dev = __dev_map_alloc_node(net, dtab, &val, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
@@ -609,13 +699,16 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
u32 idx = *(u32 *)key;
unsigned long flags;
int err = -EEXIST;
- if (unlikely(map_flags > BPF_EXIST || !ifindex))
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
return -EINVAL;
spin_lock_irqsave(&dtab->index_lock, flags);
@@ -624,7 +717,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
if (old_dev && (map_flags & BPF_NOEXIST))
goto out_err;
- dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+ dev = __dev_map_alloc_node(net, dtab, &val, idx);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out_err;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d541c8486c95..b4b288a3c3c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !bpf_capable())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bafc53ddd350..be43ab3e619f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -151,7 +151,19 @@ BPF_CALL_0(bpf_ktime_get_ns)
const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.func = bpf_ktime_get_ns,
- .gpl_only = true,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_boot_ns)
+{
+ /* NMI safe access to clock boottime */
+ return ktime_get_boot_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
+ .func = bpf_ktime_get_boot_ns,
+ .gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -562,3 +574,114 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+
+const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
+ default:
+ break;
+ }
+
+ if (!bpf_capable())
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_trace_printk:
+ if (!perfmon_capable())
+ return NULL;
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ default:
+ break;
+ }
+
+ if (!perfmon_capable())
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
+ case BPF_FUNC_probe_read_user:
+ return &bpf_probe_read_user_proto;
+ case BPF_FUNC_probe_read_kernel:
+ return &bpf_probe_read_kernel_proto;
+ case BPF_FUNC_probe_read_user_str:
+ return &bpf_probe_read_user_str_proto;
+ case BPF_FUNC_probe_read_kernel_str:
+ return &bpf_probe_read_kernel_str_proto;
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 95087d9f4ed3..fb878ba3f22f 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
{
+ struct bpf_link *link = arg;
+
return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
- &bpffs_obj_fops);
+ bpf_link_is_iter(link) ?
+ &bpf_iter_fops : &bpffs_obj_fops);
}
static struct dentry *
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 65c236cf341e..c8cc4e4cf98d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
u64 cost = sizeof(*trie), cost_per_node;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index b3c48d1533cb..17738c93bec8 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
if (inner_map->ops == &array_map_ops) {
- inner_map_meta->unpriv_array = inner_map->unpriv_array;
+ inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
container_of(inner_map_meta, struct bpf_array, map)->index_mask =
container_of(inner_map, struct bpf_array, map)->index_mask;
}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..c69071e334bf
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+ u32 mid;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+ struct bpf_map *map;
+
+ map = bpf_map_get_curr_or_next(&info->mid);
+ if (!map)
+ return NULL;
+
+ ++*pos;
+ return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+ struct bpf_map *map;
+
+ ++*pos;
+ ++info->mid;
+ bpf_map_put((struct bpf_map *)v);
+ map = bpf_map_get_curr_or_next(&info->mid);
+ if (!map)
+ return NULL;
+
+ return map;
+}
+
+struct bpf_iter__bpf_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
+
+static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_map ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.map = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_map_seq_show(seq, v, false);
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_map_seq_show(seq, v, true);
+ else
+ bpf_map_put((struct bpf_map *)v);
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+ .start = bpf_map_seq_start,
+ .next = bpf_map_seq_next,
+ .stop = bpf_map_seq_stop,
+ .show = bpf_map_seq_show,
+};
+
+static const struct bpf_iter_reg bpf_map_reg_info = {
+ .target = "bpf_map",
+ .seq_ops = &bpf_map_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map, map),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_map_iter_init(void)
+{
+ return bpf_iter_reg_target(&bpf_map_reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
new file mode 100644
index 000000000000..78cf061f8179
--- /dev/null
+++ b/kernel/bpf/net_namespace.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/net_namespace.h>
+
+/*
+ * Functions to manage BPF programs attached to netns
+ */
+
+struct bpf_netns_link {
+ struct bpf_link link;
+ enum bpf_attach_type type;
+ enum netns_bpf_attach_type netns_type;
+
+ /* We don't hold a ref to net in order to auto-detach the link
+ * when netns is going away. Instead we rely on pernet
+ * pre_exit callback to clear this pointer. Must be accessed
+ * with netns_bpf_mutex held.
+ */
+ struct net *net;
+};
+
+/* Protects updates to netns_bpf */
+DEFINE_MUTEX(netns_bpf_mutex);
+
+/* Must be called with netns_bpf_mutex held. */
+static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+
+ net_link->net = NULL;
+}
+
+static void bpf_netns_link_release(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct net *net;
+
+ /* Link auto-detached by dying netns. */
+ if (!net_link->net)
+ return;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Recheck after potential sleep. We can race with cleanup_net
+ * here, but if we see a non-NULL struct net pointer pre_exit
+ * has not happened yet and will block on netns_bpf_mutex.
+ */
+ net = net_link->net;
+ if (!net)
+ goto out_unlock;
+
+ net->bpf.links[type] = NULL;
+ RCU_INIT_POINTER(net->bpf.progs[type], NULL);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static void bpf_netns_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+
+ kfree(net_link);
+}
+
+static int bpf_netns_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct net *net;
+ int ret = 0;
+
+ if (old_prog && old_prog != link->prog)
+ return -EPERM;
+ if (new_prog->type != link->prog->type)
+ return -EINVAL;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ net = net_link->net;
+ if (!net || !check_net(net)) {
+ /* Link auto-detached or netns dying */
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ rcu_assign_pointer(net->bpf.progs[type], new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return ret;
+}
+
+static int bpf_netns_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ unsigned int inum = 0;
+ struct net *net;
+
+ mutex_lock(&netns_bpf_mutex);
+ net = net_link->net;
+ if (net && check_net(net))
+ inum = net->ns.inum;
+ mutex_unlock(&netns_bpf_mutex);
+
+ info->netns.netns_ino = inum;
+ info->netns.attach_type = net_link->type;
+ return 0;
+}
+
+static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_link_info info = {};
+
+ bpf_netns_link_fill_info(link, &info);
+ seq_printf(seq,
+ "netns_ino:\t%u\n"
+ "attach_type:\t%u\n",
+ info.netns.netns_ino,
+ info.netns.attach_type);
+}
+
+static const struct bpf_link_ops bpf_netns_link_ops = {
+ .release = bpf_netns_link_release,
+ .dealloc = bpf_netns_link_dealloc,
+ .update_prog = bpf_netns_link_update_prog,
+ .fill_link_info = bpf_netns_link_fill_info,
+ .show_fdinfo = bpf_netns_link_show_fdinfo,
+};
+
+int netns_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_id, prog_cnt = 0, flags = 0;
+ enum netns_bpf_attach_type type;
+ struct bpf_prog *attached;
+ struct net *net;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ type = to_netns_bpf_attach_type(attr->query.attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->query.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ rcu_read_lock();
+ attached = rcu_dereference(net->bpf.progs[type]);
+ if (attached) {
+ prog_cnt = 1;
+ prog_id = attached->aux->id;
+ }
+ rcu_read_unlock();
+
+ put_net(net);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ return -EFAULT;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ return 0;
+
+ if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type type;
+ struct net *net;
+ int ret;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Attaching prog directly is not compatible with links */
+ if (net->bpf.links[type]) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ ret = flow_dissector_bpf_prog_attach(net, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+
+ return ret;
+}
+
+/* Must be called with netns_bpf_mutex held. */
+static int __netns_bpf_prog_detach(struct net *net,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_prog *attached;
+
+ /* Progs attached via links cannot be detached */
+ if (net->bpf.links[type])
+ return -EINVAL;
+
+ attached = rcu_dereference_protected(net->bpf.progs[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (!attached)
+ return -ENOENT;
+ RCU_INIT_POINTER(net->bpf.progs[type], NULL);
+ bpf_prog_put(attached);
+ return 0;
+}
+
+int netns_bpf_prog_detach(const union bpf_attr *attr)
+{
+ enum netns_bpf_attach_type type;
+ int ret;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ mutex_lock(&netns_bpf_mutex);
+ ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type);
+ mutex_unlock(&netns_bpf_mutex);
+
+ return ret;
+}
+
+static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_prog *prog;
+ int err;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Allow attaching only one prog or link for now */
+ if (net->bpf.links[type]) {
+ err = -E2BIG;
+ goto out_unlock;
+ }
+ /* Links are not compatible with attaching prog directly */
+ prog = rcu_dereference_protected(net->bpf.progs[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (prog) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ err = flow_dissector_bpf_prog_attach(net, link->prog);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ goto out_unlock;
+
+ net->bpf.links[type] = link;
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return err;
+}
+
+int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type netns_type;
+ struct bpf_link_primer link_primer;
+ struct bpf_netns_link *net_link;
+ enum bpf_attach_type type;
+ struct net *net;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ type = attr->link_create.attach_type;
+ netns_type = to_netns_bpf_attach_type(type);
+ if (netns_type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->link_create.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ net_link = kzalloc(sizeof(*net_link), GFP_USER);
+ if (!net_link) {
+ err = -ENOMEM;
+ goto out_put_net;
+ }
+ bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
+ &bpf_netns_link_ops, prog);
+ net_link->net = net;
+ net_link->type = type;
+ net_link->netns_type = netns_type;
+
+ err = bpf_link_prime(&net_link->link, &link_primer);
+ if (err) {
+ kfree(net_link);
+ goto out_put_net;
+ }
+
+ err = netns_bpf_link_attach(net, &net_link->link, netns_type);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_net;
+ }
+
+ put_net(net);
+ return bpf_link_settle(&link_primer);
+
+out_put_net:
+ put_net(net);
+ return err;
+}
+
+static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
+{
+ enum netns_bpf_attach_type type;
+ struct bpf_link *link;
+
+ mutex_lock(&netns_bpf_mutex);
+ for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
+ link = net->bpf.links[type];
+ if (link)
+ bpf_netns_link_auto_detach(link);
+ else
+ __netns_bpf_prog_detach(net, type);
+ }
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
+ .pre_exit = netns_bpf_pernet_pre_exit,
+};
+
+static int __init netns_bpf_init(void)
+{
+ return register_pernet_subsys(&netns_bpf_pernet_ops);
+}
+
+subsys_initcall(netns_bpf_init);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..05c8e043b9d2 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -19,7 +19,7 @@ struct bpf_queue_stack {
u32 head, tail;
u32 size; /* max_entries + 1 */
- char elements[0] __aligned(8);
+ char elements[] __aligned(8);
};
static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 01badd3eda7a..21cde24386db 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
u64 array_size;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
new file mode 100644
index 000000000000..180414bb0d3e
--- /dev/null
+++ b/kernel/bpf/ringbuf.c
@@ -0,0 +1,501 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <uapi/linux/btf.h>
+
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+
+/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
+#define RINGBUF_PGOFF \
+ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
+/* consumer page and producer page */
+#define RINGBUF_POS_PAGES 2
+
+#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
+
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and take
+ * into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
+ * ring buffer.
+ */
+#define RINGBUF_MAX_DATA_SZ \
+ (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+
+struct bpf_ringbuf {
+ wait_queue_head_t waitq;
+ struct irq_work work;
+ u64 mask;
+ struct page **pages;
+ int nr_pages;
+ spinlock_t spinlock ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to allow
+ * mapping consumer page as r/w, but restrict producer page to r/o.
+ * This protects producer position from being modified by user-space
+ * application and ruining in-kernel position tracking.
+ */
+ unsigned long consumer_pos __aligned(PAGE_SIZE);
+ unsigned long producer_pos __aligned(PAGE_SIZE);
+ char data[] __aligned(PAGE_SIZE);
+};
+
+struct bpf_ringbuf_map {
+ struct bpf_map map;
+ struct bpf_map_memory memory;
+ struct bpf_ringbuf *rb;
+};
+
+/* 8-byte ring buffer record header structure */
+struct bpf_ringbuf_hdr {
+ u32 len;
+ u32 pg_off;
+};
+
+static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
+{
+ const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO;
+ int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_data_pages = data_sz >> PAGE_SHIFT;
+ int nr_pages = nr_meta_pages + nr_data_pages;
+ struct page **pages, *page;
+ struct bpf_ringbuf *rb;
+ size_t array_size;
+ int i;
+
+ /* Each data page is mapped twice to allow "virtual"
+ * continuous read of samples wrapping around the end of ring
+ * buffer area:
+ * ------------------------------------------------------
+ * | meta pages | real data pages | same data pages |
+ * ------------------------------------------------------
+ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
+ * ------------------------------------------------------
+ * | | TA DA | TA DA |
+ * ------------------------------------------------------
+ * ^^^^^^^
+ * |
+ * Here, no need to worry about special handling of wrapped-around
+ * data due to double-mapped data pages. This works both in kernel and
+ * when mmap()'ed in user-space, simplifying both kernel and
+ * user-space implementations significantly.
+ */
+ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
+ if (array_size > PAGE_SIZE)
+ pages = vmalloc_node(array_size, numa_node);
+ else
+ pages = kmalloc_node(array_size, flags, numa_node);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = alloc_pages_node(numa_node, flags, 0);
+ if (!page) {
+ nr_pages = i;
+ goto err_free_pages;
+ }
+ pages[i] = page;
+ if (i >= nr_meta_pages)
+ pages[nr_data_pages + i] = page;
+ }
+
+ rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
+ VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ if (rb) {
+ rb->pages = pages;
+ rb->nr_pages = nr_pages;
+ return rb;
+ }
+
+err_free_pages:
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kvfree(pages);
+ return NULL;
+}
+
+static void bpf_ringbuf_notify(struct irq_work *work)
+{
+ struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);
+
+ wake_up_all(&rb->waitq);
+}
+
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+{
+ struct bpf_ringbuf *rb;
+
+ if (!data_sz || !PAGE_ALIGNED(data_sz))
+ return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_64BIT
+ /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
+ if (data_sz > RINGBUF_MAX_DATA_SZ)
+ return ERR_PTR(-E2BIG);
+#endif
+
+ rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
+ if (!rb)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&rb->spinlock);
+ init_waitqueue_head(&rb->waitq);
+ init_irq_work(&rb->work, bpf_ringbuf_notify);
+
+ rb->mask = data_sz - 1;
+ rb->consumer_pos = 0;
+ rb->producer_pos = 0;
+
+ return rb;
+}
+
+static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ u64 cost;
+ int err;
+
+ if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->key_size || attr->value_size ||
+ attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries))
+ return ERR_PTR(-EINVAL);
+
+ rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
+ if (!rb_map)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&rb_map->map, attr);
+
+ cost = sizeof(struct bpf_ringbuf_map) +
+ sizeof(struct bpf_ringbuf) +
+ attr->max_entries;
+ err = bpf_map_charge_init(&rb_map->map.memory, cost);
+ if (err)
+ goto err_free_map;
+
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+ if (IS_ERR(rb_map->rb)) {
+ err = PTR_ERR(rb_map->rb);
+ goto err_uncharge;
+ }
+
+ return &rb_map->map;
+
+err_uncharge:
+ bpf_map_charge_finish(&rb_map->map.memory);
+err_free_map:
+ kfree(rb_map);
+ return ERR_PTR(err);
+}
+
+static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
+{
+ /* copy pages pointer and nr_pages to local variable, as we are going
+ * to unmap rb itself with vunmap() below
+ */
+ struct page **pages = rb->pages;
+ int i, nr_pages = rb->nr_pages;
+
+ vunmap(rb);
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kvfree(pages);
+}
+
+static void ringbuf_map_free(struct bpf_map *map)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ bpf_ringbuf_free(rb_map->rb);
+ kfree(rb_map);
+}
+
+static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
+{
+ size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
+
+ /* consumer page + producer page + 2 x data pages */
+ return RINGBUF_POS_PAGES + 2 * data_pages;
+}
+
+static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+ size_t mmap_sz;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
+
+ if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, rb_map->rb,
+ vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
+{
+ unsigned long cons_pos, prod_pos;
+
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+}
+
+static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb))
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+const struct bpf_map_ops ringbuf_map_ops = {
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap,
+ .map_poll = ringbuf_map_poll,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+};
+
+/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
+ * calculate offset from record metadata to ring buffer in pages, rounded
+ * down. This page offset is stored as part of record metadata and allows to
+ * restore struct bpf_ringbuf * from record pointer. This page offset is
+ * stored at offset 4 of record metadata header.
+ */
+static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
+ struct bpf_ringbuf_hdr *hdr)
+{
+ return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
+}
+
+/* Given pointer to ring buffer record header, restore pointer to struct
+ * bpf_ringbuf itself by using page offset stored at offset 4
+ */
+static struct bpf_ringbuf *
+bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
+{
+ unsigned long addr = (unsigned long)(void *)hdr;
+ unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
+
+ return (void*)((addr & PAGE_MASK) - off);
+}
+
+static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
+{
+ unsigned long cons_pos, prod_pos, new_prod_pos, flags;
+ u32 len, pg_off;
+ struct bpf_ringbuf_hdr *hdr;
+
+ if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
+ return NULL;
+
+ len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+
+ if (in_nmi()) {
+ if (!spin_trylock_irqsave(&rb->spinlock, flags))
+ return NULL;
+ } else {
+ spin_lock_irqsave(&rb->spinlock, flags);
+ }
+
+ prod_pos = rb->producer_pos;
+ new_prod_pos = prod_pos + len;
+
+ /* check for out of ringbuf space by ensuring producer position
+ * doesn't advance more than (ringbuf_size - 1) ahead
+ */
+ if (new_prod_pos - cons_pos > rb->mask) {
+ spin_unlock_irqrestore(&rb->spinlock, flags);
+ return NULL;
+ }
+
+ hdr = (void *)rb->data + (prod_pos & rb->mask);
+ pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
+ hdr->len = size | BPF_RINGBUF_BUSY_BIT;
+ hdr->pg_off = pg_off;
+
+ /* pairs with consumer's smp_load_acquire() */
+ smp_store_release(&rb->producer_pos, new_prod_pos);
+
+ spin_unlock_irqrestore(&rb->spinlock, flags);
+
+ return (void *)hdr + BPF_RINGBUF_HDR_SZ;
+}
+
+BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ if (unlikely(flags))
+ return 0;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
+ .func = bpf_ringbuf_reserve,
+ .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
+{
+ unsigned long rec_pos, cons_pos;
+ struct bpf_ringbuf_hdr *hdr;
+ struct bpf_ringbuf *rb;
+ u32 new_len;
+
+ hdr = sample - BPF_RINGBUF_HDR_SZ;
+ rb = bpf_ringbuf_restore_from_rec(hdr);
+ new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
+ if (discard)
+ new_len |= BPF_RINGBUF_DISCARD_BIT;
+
+ /* update record header with correct final size prefix */
+ xchg(&hdr->len, new_len);
+
+ /* if consumer caught up and is waiting for our record, notify about
+ * new data availability
+ */
+ rec_pos = (void *)hdr - (void *)rb->data;
+ cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
+ irq_work_queue(&rb->work);
+}
+
+BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_proto = {
+ .func = bpf_ringbuf_submit,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, true /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_proto = {
+ .func = bpf_ringbuf_discard,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_ALLOC_MEM,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
+ u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *rec;
+
+ if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
+ return -EINVAL;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ rec = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!rec)
+ return -EAGAIN;
+
+ memcpy(rec, data, size);
+ bpf_ringbuf_commit(rec, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_output_proto = {
+ .func = bpf_ringbuf_output,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ switch (flags) {
+ case BPF_RB_AVAIL_DATA:
+ return ringbuf_avail_data_sz(rb);
+ case BPF_RB_RING_SIZE:
+ return rb->mask + 1;
+ case BPF_RB_CONS_POS:
+ return smp_load_acquire(&rb->consumer_pos);
+ case BPF_RB_PROD_POS:
+ return smp_load_acquire(&rb->producer_pos);
+ default:
+ return 0;
+ }
+}
+
+const struct bpf_func_proto bpf_ringbuf_query_proto = {
+ .func = bpf_ringbuf_query,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index db76339fe358..7b8381ce40a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 42c7a42fc9c8..4d530b1d5683 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,8 @@
#include <uapi/linux/btf.h>
#include <asm/pgtable.h>
#include <linux/bpf_lsm.h>
+#include <linux/poll.h>
+#include <linux/bpf-netns.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -43,6 +45,8 @@ static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -50,9 +54,11 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/*
@@ -661,6 +667,16 @@ out:
return err;
}
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_poll)
+ return map->ops->map_poll(map, filp, pts);
+
+ return EPOLLERR;
+}
+
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
@@ -669,6 +685,7 @@ const struct file_operations bpf_map_fops = {
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
+ .poll = bpf_map_poll,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1386,7 +1403,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
- kvfree(buf_prevkey);
+ kfree(buf_prevkey);
return -ENOMEM;
}
@@ -1471,7 +1488,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1546,7 +1564,7 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
err = -EPERM;
goto err_put;
}
@@ -1562,9 +1580,11 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
@@ -1986,6 +2006,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2019,6 +2043,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ /* always unpriv */
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+ default:
+ return false;
+ }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ default:
+ return false;
+ }
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
@@ -2041,7 +2114,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return -EPERM;
/* copy eBPF program license from user space */
@@ -2054,11 +2127,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
+ return -EPERM;
+
+ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
@@ -2197,25 +2275,39 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
-void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops,
- struct bpf_prog *prog)
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
+ link->type = type;
+ link->id = 0;
link->ops = ops;
link->prog = prog;
}
+static void bpf_link_free_id(int id)
+{
+ if (!id)
+ return;
+
+ spin_lock_bh(&link_idr_lock);
+ idr_remove(&link_idr, id);
+ spin_unlock_bh(&link_idr_lock);
+}
+
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper manages marking bpf_link as
- * defunct, releases anon_inode file and puts reserved FD.
+ * anon_inode's release() call. This helper marksbpf_link as
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
+ * is not decremented, it's the responsibility of a calling code that failed
+ * to complete bpf_link initialization.
*/
-void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
- int link_fd)
+void bpf_link_cleanup(struct bpf_link_primer *primer)
{
- link->prog = NULL;
- fput(link_file);
- put_unused_fd(link_fd);
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->id);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
@@ -2226,6 +2318,7 @@ void bpf_link_inc(struct bpf_link *link)
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bpf_link_free_id(link->id);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
@@ -2267,35 +2360,35 @@ static int bpf_link_release(struct inode *inode, struct file *filp)
}
#ifdef CONFIG_PROC_FS
-static const struct bpf_link_ops bpf_raw_tp_lops;
-static const struct bpf_link_ops bpf_tracing_link_lops;
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- const char *link_type;
-
- if (link->ops == &bpf_raw_tp_lops)
- link_type = "raw_tracepoint";
- else if (link->ops == &bpf_tracing_link_lops)
- link_type = "tracing";
-#ifdef CONFIG_CGROUP_BPF
- else if (link->ops == &bpf_cgroup_link_lops)
- link_type = "cgroup";
-#endif
- else
- link_type = "unknown";
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
+ "link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
- link_type,
+ bpf_link_type_strs[link->type],
+ link->id,
prog_tag,
prog->aux->id);
+ if (link->ops->show_fdinfo)
+ link->ops->show_fdinfo(link, m);
}
#endif
@@ -2308,36 +2401,77 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
-int bpf_link_new_fd(struct bpf_link *link)
+static int bpf_link_alloc_id(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
-}
+ int id;
-/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but
- * instead of immediately installing fd in fdtable, just reserve it and
- * return. Caller then need to either install it with fd_install(fd, file) or
- * release with put_unused_fd(fd).
- * This is useful for cases when bpf_link attachment/detachment are
- * complicated and expensive operations and should be delayed until all the fd
- * reservation and anon_inode creation succeeds.
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
+
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations inbetween creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
*/
-struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd)
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
- int fd;
+ int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
- return ERR_PTR(fd);
+ return fd;
+
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ return id;
+ }
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
if (IS_ERR(file)) {
+ bpf_link_free_id(id);
put_unused_fd(fd);
- return file;
+ return PTR_ERR(file);
}
- *reserved_fd = fd;
- return file;
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ spin_lock_bh(&link_idr_lock);
+ primer->link->id = primer->id;
+ spin_unlock_bh(&link_idr_lock);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -2361,6 +2495,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
struct bpf_tracing_link {
struct bpf_link link;
+ enum bpf_attach_type attach_type;
};
static void bpf_tracing_link_release(struct bpf_link *link)
@@ -2376,16 +2511,40 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
kfree(tr_link);
}
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ seq_printf(seq,
+ "attach_type:\t%d\n",
+ tr_link->attach_type);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ info->tracing.attach_type = tr_link->attach_type;
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_tracing_link_lops = {
.release = bpf_tracing_link_release,
.dealloc = bpf_tracing_link_dealloc,
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
+ .fill_link_info = bpf_tracing_link_fill_link_info,
};
static int bpf_tracing_prog_attach(struct bpf_prog *prog)
{
+ struct bpf_link_primer link_primer;
struct bpf_tracing_link *link;
- struct file *link_file;
- int link_fd, err;
+ int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
@@ -2418,24 +2577,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
err = -ENOMEM;
goto out_put_prog;
}
- bpf_link_init(&link->link, &bpf_tracing_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_prog;
}
err = bpf_trampoline_link_prog(prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_prog;
}
- fd_install(link_fd, link_file);
- return link_fd;
-
+ return bpf_link_settle(&link_primer);
out_put_prog:
bpf_prog_put(prog);
return err;
@@ -2463,22 +2621,69 @@ static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
kfree(raw_tp);
}
-static const struct bpf_link_ops bpf_raw_tp_lops = {
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ seq_printf(seq,
+ "tp_name:\t%s\n",
+ raw_tp_link->btp->tp->name);
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+ const char *tp_name = raw_tp_link->btp->tp->name;
+ u32 ulen = info->raw_tracepoint.tp_name_len;
+ size_t tp_len = strlen(tp_name);
+
+ if (ulen && !ubuf)
+ return -EINVAL;
+
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
+
+ if (!ubuf)
+ return 0;
+
+ if (ulen >= tp_len + 1) {
+ if (copy_to_user(ubuf, tp_name, tp_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, tp_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
};
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
+ struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- struct file *link_file;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
- int link_fd, err;
+ int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
@@ -2531,24 +2736,23 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
err = -ENOMEM;
goto out_put_btp;
}
- bpf_link_init(&link->link, &bpf_raw_tp_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog);
link->btp = btp;
- link_file = bpf_link_new_file(&link->link, &link_fd);
- if (IS_ERR(link_file)) {
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
kfree(link);
- err = PTR_ERR(link_file);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
- bpf_link_cleanup(&link->link, link_file, link_fd);
+ bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
- fd_install(link_fd, link_file);
- return link_fd;
+ return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
@@ -2566,6 +2770,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -2590,6 +2799,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2613,6 +2826,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ case BPF_TRACE_ITER:
+ return BPF_PROG_TYPE_TRACING;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -2629,9 +2844,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
@@ -2660,7 +2872,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ret = lirc_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
+ ret = netns_bpf_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -2686,9 +2898,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
@@ -2701,7 +2910,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LIRC_MODE2:
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return skb_flow_dissector_bpf_prog_detach(attr);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return netns_bpf_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2737,6 +2948,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@@ -2750,7 +2965,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
- return skb_flow_dissector_prog_query(attr, uattr);
+ return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -2764,8 +2979,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -2816,6 +3029,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
return err;
}
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+ struct bpf_map *map;
+
+ spin_lock_bh(&map_idr_lock);
+again:
+ map = idr_get_next(&map_idr, id);
+ if (map) {
+ map = __bpf_map_inc_not_zero(map, false);
+ if (IS_ERR(map)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
@@ -3047,7 +3279,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
@@ -3329,6 +3561,42 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
return btf_get_info_by_fd(btf, attr, uattr);
}
+static int bpf_link_get_info_by_fd(struct bpf_link *link,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_link_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = link->type;
+ info.id = link->id;
+ info.prog_id = link->prog->aux->id;
+
+ if (link->ops->fill_link_info) {
+ err = link->ops->fill_link_info(link, &info);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -3353,6 +3621,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
+ else if (f.file->f_op == &bpf_link_fops)
+ err = bpf_link_get_info_by_fd(f.file->private_data,
+ attr, uattr);
else
err = -EINVAL;
@@ -3367,7 +3638,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
return btf_new_fd(attr);
@@ -3480,7 +3751,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
- if (link->ops == &bpf_raw_tp_lops) {
+ if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
@@ -3574,6 +3845,15 @@ err_put:
return err;
}
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ if (attr->link_create.attach_type == BPF_TRACE_ITER &&
+ prog->expected_attach_type == BPF_TRACE_ITER)
+ return bpf_iter_link_attach(attr, prog);
+
+ return -EINVAL;
+}
+
#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
static int link_create(union bpf_attr *attr)
{
@@ -3581,9 +3861,6 @@ static int link_create(union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
@@ -3610,6 +3887,12 @@ static int link_create(union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_TRACING:
+ ret = tracing_bpf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ ret = netns_bpf_link_create(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -3629,9 +3912,6 @@ static int link_update(union bpf_attr *attr)
u32 flags;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
@@ -3661,13 +3941,10 @@ static int link_update(union bpf_attr *attr)
goto out_put_progs;
}
-#ifdef CONFIG_CGROUP_BPF
- if (link->ops == &bpf_cgroup_link_lops) {
- ret = cgroup_bpf_replace(link, old_prog, new_prog);
- goto out_put_progs;
- }
-#endif
- ret = -EINVAL;
+ if (link->ops->update_prog)
+ ret = link->ops->update_prog(link, new_prog, old_prog);
+ else
+ ret = -EINVAL;
out_put_progs:
if (old_prog)
@@ -3679,12 +3956,131 @@ out_put_link:
return ret;
}
+static int bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd, err;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&link_idr_lock);
+ link = idr_find(&link_idr, id);
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ if (link) {
+ if (link->id)
+ err = bpf_link_inc_not_zero(link);
+ else
+ err = -EAGAIN;
+ } else {
+ err = -ENOENT;
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ if (err)
+ return err;
+
+ fd = bpf_link_new_fd(link);
+ if (fd < 0)
+ bpf_link_put(link);
+
+ return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&bpf_stats_enabled_mutex);
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+ .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+ int fd;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+
+ /* Set a very high limit to avoid overflow */
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return -EBUSY;
+ }
+
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+ if (fd >= 0)
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (attr->enable_stats.type) {
+ case BPF_STATS_RUN_TIME:
+ return bpf_enable_runtime_stats();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ int err;
+
+ if (CHECK_ATTR(BPF_ITER_CREATE))
+ return -EINVAL;
+
+ if (attr->iter_create.flags)
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ err = bpf_iter_new_fd(link);
+ bpf_link_put(link);
+
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -3796,6 +4192,19 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_LINK_UPDATE:
err = link_update(&attr);
break;
+ case BPF_LINK_GET_FD_BY_ID:
+ err = bpf_link_get_fd_by_id(&attr);
+ break;
+ case BPF_LINK_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &link_idr, &link_idr_lock);
+ break;
+ case BPF_ENABLE_STATS:
+ err = bpf_enable_stats(&attr);
+ break;
+ case BPF_ITER_CREATE:
+ err = bpf_iter_create(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..4dbf2b6035f8
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+ struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ u32 tid;
+};
+
+static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+ u32 *tid)
+{
+ struct task_struct *task = NULL;
+ struct pid *pid;
+
+ rcu_read_lock();
+retry:
+ pid = idr_get_next(&ns->idr, tid);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ++*tid;
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+
+ return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ task = task_seq_get_next(info->common.ns, &info->tid);
+ if (!task)
+ return NULL;
+
+ ++*pos;
+ return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ ++*pos;
+ ++info->tid;
+ put_task_struct((struct task_struct *)v);
+ task = task_seq_get_next(info->common.ns, &info->tid);
+ if (!task)
+ return NULL;
+
+ return task;
+}
+
+struct bpf_iter__task {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
+ bool in_stop)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__task ctx;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ meta.seq = seq;
+ ctx.meta = &meta;
+ ctx.task = task;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__task_seq_show(seq, v, true);
+ else
+ put_task_struct((struct task_struct *)v);
+}
+
+static const struct seq_operations task_seq_ops = {
+ .start = task_seq_start,
+ .next = task_seq_next,
+ .stop = task_seq_stop,
+ .show = task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ struct files_struct *files;
+ u32 tid;
+ u32 fd;
+};
+
+static struct file *
+task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
+ struct task_struct **task, struct files_struct **fstruct)
+{
+ struct pid_namespace *ns = info->common.ns;
+ u32 curr_tid = info->tid, max_fds;
+ struct files_struct *curr_files;
+ struct task_struct *curr_task;
+ int curr_fd = info->fd;
+
+ /* If this function returns a non-NULL file object,
+ * it held a reference to the task/files_struct/file.
+ * Otherwise, it does not hold any reference.
+ */
+again:
+ if (*task) {
+ curr_task = *task;
+ curr_files = *fstruct;
+ curr_fd = info->fd;
+ } else {
+ curr_task = task_seq_get_next(ns, &curr_tid);
+ if (!curr_task)
+ return NULL;
+
+ curr_files = get_files_struct(curr_task);
+ if (!curr_files) {
+ put_task_struct(curr_task);
+ curr_tid = ++(info->tid);
+ info->fd = 0;
+ goto again;
+ }
+
+ /* set *fstruct, *task and info->tid */
+ *fstruct = curr_files;
+ *task = curr_task;
+ if (curr_tid == info->tid) {
+ curr_fd = info->fd;
+ } else {
+ info->tid = curr_tid;
+ curr_fd = 0;
+ }
+ }
+
+ rcu_read_lock();
+ max_fds = files_fdtable(curr_files)->max_fds;
+ for (; curr_fd < max_fds; curr_fd++) {
+ struct file *f;
+
+ f = fcheck_files(curr_files, curr_fd);
+ if (!f)
+ continue;
+
+ /* set info->fd */
+ info->fd = curr_fd;
+ get_file(f);
+ rcu_read_unlock();
+ return f;
+ }
+
+ /* the current task is done, go to the next task */
+ rcu_read_unlock();
+ put_files_struct(curr_files);
+ put_task_struct(curr_task);
+ *task = NULL;
+ *fstruct = NULL;
+ info->fd = 0;
+ curr_tid = ++(info->tid);
+ goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct files_struct *files = NULL;
+ struct task_struct *task = NULL;
+ struct file *file;
+
+ file = task_file_seq_get_next(info, &task, &files);
+ if (!file) {
+ info->files = NULL;
+ info->task = NULL;
+ return NULL;
+ }
+
+ ++*pos;
+ info->task = task;
+ info->files = files;
+
+ return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct files_struct *files = info->files;
+ struct task_struct *task = info->task;
+ struct file *file;
+
+ ++*pos;
+ ++info->fd;
+ fput((struct file *)v);
+ file = task_file_seq_get_next(info, &task, &files);
+ if (!file) {
+ info->files = NULL;
+ info->task = NULL;
+ return NULL;
+ }
+
+ info->task = task;
+ info->files = files;
+
+ return file;
+}
+
+struct bpf_iter__task_file {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ u32 fd __aligned(8);
+ __bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+ struct task_struct *task, u32 fd,
+ struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, struct file *file,
+ bool in_stop)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct bpf_iter__task_file ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.fd = info->fd;
+ ctx.file = file;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_file_seq_show(seq, v, true);
+ } else {
+ fput((struct file *)v);
+ put_files_struct(info->files);
+ put_task_struct(info->task);
+ info->files = NULL;
+ info->task = NULL;
+ }
+}
+
+static int init_seq_pidns(void *priv_data)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ common->ns = get_pid_ns(task_active_pid_ns(current));
+ return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+ .start = task_file_seq_start,
+ .next = task_file_seq_next,
+ .stop = task_file_seq_stop,
+ .show = task_file_seq_show,
+};
+
+static const struct bpf_iter_reg task_reg_info = {
+ .target = "task",
+ .seq_ops = &task_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static const struct bpf_iter_reg task_file_reg_info = {
+ .target = "task_file",
+ .seq_ops = &task_file_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_file, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_file, file),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init task_iter_init(void)
+{
+ int ret;
+
+ ret = bpf_iter_reg_target(&task_reg_info);
+ if (ret)
+ return ret;
+
+ return bpf_iter_reg_target(&task_file_reg_info);
+}
+late_initcall(task_iter_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index efe14cf24bc6..5c7bbaac81ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -28,9 +28,11 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
};
/* bpf_check() is a static code analyzer that walks eBPF program
@@ -168,6 +170,8 @@ struct bpf_verifier_stack_elem {
int insn_idx;
int prev_insn_idx;
struct bpf_verifier_stack_elem *next;
+ /* length of verifier log at the time this state was pushed on stack */
+ u32 log_pos;
};
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
@@ -229,6 +233,7 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
+ int mem_size;
u64 msize_max_value;
int ref_obj_id;
int func_id;
@@ -283,6 +288,18 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
log->ubuf = NULL;
}
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+ char zero = 0;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ log->len_used = new_pos;
+ if (put_user(zero, log->ubuf + new_pos))
+ log->ubuf = NULL;
+}
+
/* log_level controls verbosity level of eBPF verifier.
* bpf_verifier_log_write() is used to dump the verification trace to the log,
* so the user can figure out what's wrong with the program
@@ -377,12 +394,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool reg_type_not_null(enum bpf_reg_type type)
+{
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_BTF_ID;
+}
+
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL;
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
+ type == PTR_TO_BTF_ID_OR_NULL ||
+ type == PTR_TO_MEM_OR_NULL;
}
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -396,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
return type == PTR_TO_SOCKET ||
type == PTR_TO_SOCKET_OR_NULL ||
type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL;
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
+ type == PTR_TO_MEM ||
+ type == PTR_TO_MEM_OR_NULL;
}
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -410,14 +440,37 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
*/
static bool is_release_function(enum bpf_func_id func_id)
{
- return func_id == BPF_FUNC_sk_release;
+ return func_id == BPF_FUNC_sk_release ||
+ func_id == BPF_FUNC_ringbuf_submit ||
+ func_id == BPF_FUNC_ringbuf_discard;
}
-static bool is_acquire_function(enum bpf_func_id func_id)
+static bool may_be_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp;
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_map_lookup_elem ||
+ func_id == BPF_FUNC_ringbuf_reserve;
+}
+
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_ringbuf_reserve)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -448,6 +501,9 @@ static const char * const reg_type_str[] = {
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_MEM_OR_NULL] = "mem_or_null",
};
static char slot_type_char[] = {
@@ -508,7 +564,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID)
+ if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -846,7 +902,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
- int *insn_idx)
+ int *insn_idx, bool pop_log)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem, *head = env->head;
@@ -860,6 +916,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
if (err)
return err;
}
+ if (pop_log)
+ bpf_vlog_reset(&env->log, head->log_pos);
if (insn_idx)
*insn_idx = head->insn_idx;
if (prev_insn_idx)
@@ -887,6 +945,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->log_pos = env->log.len_used;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -915,7 +974,7 @@ err:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
/* pop all elements and return */
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
return NULL;
}
@@ -1255,7 +1314,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks;
+ reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1387,8 +1446,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
continue;
if (insn[i].src_reg != BPF_PSEUDO_CALL)
continue;
- if (!env->allow_ptr_leaks) {
- verbose(env, "function calls to other bpf functions are allowed for root only\n");
+ if (!env->bpf_capable) {
+ verbose(env,
+ "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
ret = add_subprog(env, i + insn[i].imm + 1);
@@ -1922,8 +1982,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bool new_marks = false;
int i, err;
- if (!env->allow_ptr_leaks)
- /* backtracking is root only for now */
+ if (!env->bpf_capable)
return 0;
func = st->frame[st->curframe];
@@ -2101,6 +2160,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID_OR_NULL:
return true;
default:
return false;
@@ -2170,7 +2230,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
reg = &cur->regs[value_regno];
if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
- !register_is_null(reg) && env->allow_ptr_leaks) {
+ !register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
@@ -2196,7 +2256,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v4) {
bool sanitize = false;
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
@@ -2418,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
return 0;
}
-/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size, bool zero_size_allowed)
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_map *map = regs[regno].map_ptr;
+ bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
+ struct bpf_reg_state *reg;
+
+ if (off >= 0 && size_ok && (u64)off + size <= mem_size)
+ return 0;
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
- off + size > map->value_size) {
+ reg = &cur_regs(env)[regno];
+ switch (reg->type) {
+ case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
- return -EACCES;
+ mem_size, off, size);
+ break;
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET_END:
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ off, size, regno, reg->id, off, mem_size);
+ break;
+ case PTR_TO_MEM:
+ default:
+ verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
+ mem_size, off, size);
}
- return 0;
+
+ return -EACCES;
}
-/* check read/write into a map element with possible variable offset */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
+/* check read/write into a memory region with possible variable offset */
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
int err;
- /* We may have adjusted the register to this map value, so we
+ /* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
@@ -2464,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->smin_value + off, size,
- zero_size_allowed);
+ err = __check_mem_access(env, regno, reg->smin_value + off, size,
+ mem_size, zero_size_allowed);
if (err) {
- verbose(env, "R%d min value is outside of the array range\n",
+ verbose(env, "R%d min value is outside of the allowed memory range\n",
regno);
return err;
}
@@ -2477,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->umax_value + off, size,
- zero_size_allowed);
- if (err)
- verbose(env, "R%d max value is outside of the array range\n",
+ err = __check_mem_access(env, regno, reg->umax_value + off, size,
+ mem_size, zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d max value is outside of the allowed memory range\n",
regno);
+ return err;
+ }
+
+ return 0;
+}
+
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, bool zero_size_allowed)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
- if (map_value_has_spin_lock(reg->map_ptr)) {
- u32 lock = reg->map_ptr->spin_lock_off;
+ err = check_mem_region_access(env, regno, off, size, map->value_size,
+ zero_size_allowed);
+ if (err)
+ return err;
+
+ if (map_value_has_spin_lock(map)) {
+ u32 lock = map->spin_lock_off;
/* if any part of struct bpf_spin_lock can be touched by
* load/store reject this program.
@@ -2546,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
}
-static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size, bool zero_size_allowed)
-{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
-
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
- (u64)off + size > reg->range) {
- verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
- off, size, regno, reg->id, reg->off, reg->range);
- return -EACCES;
- }
- return 0;
-}
-
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
{
@@ -2581,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
regno);
return -EACCES;
}
- err = __check_packet_access(env, regno, off, size, zero_size_allowed);
+ err = __check_mem_access(env, regno, off, size, reg->range,
+ zero_size_allowed);
if (err) {
verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
- /* __check_packet_access has made sure "off + size - 1" is within u16.
+ /* __check_mem_access has made sure "off + size - 1" is within u16.
* reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
* otherwise find_good_pkt_pointers would have refused to set range info
- * that __check_packet_access would have rejected this pkt access.
+ * that __check_mem_access would have rejected this pkt access.
* Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
*/
env->prog->aux->max_pkt_offset =
@@ -2621,7 +2704,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID)
+ if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
*btf_id = info.btf_id;
else
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
@@ -3170,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
+ } else if (reg->type == PTR_TO_MEM) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into mem\n", value_regno);
+ return -EACCES;
+ }
+ err = check_mem_region_access(env, regno, off, size,
+ reg->mem_size, false);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
u32 btf_id = 0;
@@ -3205,7 +3298,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID)
+ if (reg_type == PTR_TO_BTF_ID ||
+ reg_type == PTR_TO_BTF_ID_OR_NULL)
regs[value_regno].btf_id = btf_id;
}
regs[value_regno].type = reg_type;
@@ -3390,7 +3484,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
* Spectre masking for stack ALU.
* See also retrieve_ptr_limit().
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -3452,6 +3546,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
+
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+ goto mark;
+
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
@@ -3501,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed);
+ case PTR_TO_MEM:
+ return check_mem_region_access(env, regno, reg->off,
+ access_size, reg->mem_size,
+ zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -3605,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_ALLOC_MEM ||
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+}
+
+static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+{
+ return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+}
+
static bool arg_type_is_int_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_INT ||
@@ -3664,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
- arg_type == ARG_CONST_SIZE_OR_ZERO) {
+ arg_type == ARG_CONST_SIZE_OR_ZERO ||
+ arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) {
expected_type = SCALAR_VALUE;
if (type != expected_type)
goto err_type;
@@ -3735,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* happens during stack boundary checking.
*/
if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_MEM_OR_NULL)
+ (arg_type == ARG_PTR_TO_MEM_OR_NULL ||
+ arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL))
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
+ type != PTR_TO_MEM &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+ } else if (arg_type_is_alloc_mem_ptr(arg_type)) {
+ expected_type = PTR_TO_MEM;
+ if (register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)
+ /* final test in check_stack_boundary() */;
+ else if (type != expected_type)
+ goto err_type;
+ if (meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
} else if (arg_type_is_int_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) &&
@@ -3837,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
zero_size_allowed, meta);
if (!err)
err = mark_chain_precision(env, regno);
+ } else if (arg_type_is_alloc_size(arg_type)) {
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ meta->mem_size = reg->var_off.value;
} else if (arg_type_is_int_ptr(arg_type)) {
int size = int_ptr_type_to_size(arg_type);
@@ -3873,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_xdp_output)
goto error;
break;
+ case BPF_MAP_TYPE_RINGBUF:
+ if (func_id != BPF_FUNC_ringbuf_output &&
+ func_id != BPF_FUNC_ringbuf_reserve &&
+ func_id != BPF_FUNC_ringbuf_submit &&
+ func_id != BPF_FUNC_ringbuf_discard &&
+ func_id != BPF_FUNC_ringbuf_query)
+ goto error;
+ break;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -3915,7 +4061,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_map_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -3923,7 +4070,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sock_hash_update &&
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
- func_id != BPF_FUNC_sk_select_reuseport)
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4093,7 +4241,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
/* A reference acquiring function cannot acquire
* another refcounted ptr.
*/
- if (is_acquire_function(func_id) && count)
+ if (may_be_acquire_function(func_id) && count)
return false;
/* We only support one arg being unreferenced at the moment,
@@ -4388,10 +4536,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
if (!BPF_MAP_PTR(aux->map_ptr_state))
bpf_map_ptr_store(aux, meta->map_ptr,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
return 0;
}
@@ -4597,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
+ regs[BPF_REG_0].mem_size = meta.mem_size;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -4606,7 +4759,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id)) {
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
@@ -4760,7 +4913,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
- return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
+ return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
}
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -5070,7 +5223,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* For unprivileged we require that resulting offset must be in bounds
* in order to be able to sanitize access later on.
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
if (dst_reg->type == PTR_TO_MAP_VALUE &&
check_map_access(env, dst, dst_reg->off, 1, false)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
@@ -5611,7 +5764,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
{
struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
- bool src_known, dst_known;
+ bool src_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
s32 s32_min_val, s32_max_val;
@@ -5633,7 +5786,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (alu32) {
src_known = tnum_subreg_is_const(src_reg.var_off);
- dst_known = tnum_subreg_is_const(dst_reg->var_off);
if ((src_known &&
(s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
@@ -5645,7 +5797,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
}
} else {
src_known = tnum_is_const(src_reg.var_off);
- dst_known = tnum_is_const(dst_reg->var_off);
if ((src_known &&
(smin_val != smax_val || umin_val != umax_val)) ||
smin_val > smax_val || umin_val > umax_val) {
@@ -6261,8 +6412,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
- if (__is_pointer_value(false, reg))
- return -1;
+ if (__is_pointer_value(false, reg)) {
+ if (!reg_type_not_null(reg->type))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
if (is_jmp32)
return is_branch32_taken(reg, val, opcode);
@@ -6517,12 +6685,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (is_null) {
reg->type = SCALAR_VALUE;
} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (reg->map_ptr->inner_map_meta) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
- reg->map_ptr = reg->map_ptr->inner_map_meta;
- } else if (reg->map_ptr->map_type ==
- BPF_MAP_TYPE_XSKMAP) {
+ reg->map_ptr = map->inner_map_meta;
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
@@ -6532,6 +6704,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_SOCK_COMMON;
} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
reg->type = PTR_TO_TCP_SOCK;
+ } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
+ reg->type = PTR_TO_BTF_ID;
+ } else if (reg->type == PTR_TO_MEM_OR_NULL) {
+ reg->type = PTR_TO_MEM;
}
if (is_null) {
/* We don't need id and ref_obj_id from this point
@@ -6755,7 +6931,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (pred >= 0) {
- err = mark_chain_precision(env, insn->dst_reg);
+ /* If we get here with a dst_reg pointer type it is because
+ * above is_branch_taken() special cased the 0 comparison.
+ */
+ if (!__is_pointer_value(false, dst_reg))
+ err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
if (err)
@@ -7041,7 +7221,11 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -7070,6 +7254,8 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
return 0;
+ case BPF_TRACE_ITER:
+ break;
default:
return -ENOTSUPP;
}
@@ -7206,7 +7392,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->allow_ptr_leaks)
+ if (loop_ok && env->bpf_capable)
return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
@@ -8315,7 +8501,7 @@ next:
if (env->max_states_per_insn < states_cnt)
env->max_states_per_insn = states_cnt;
- if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
return push_jmp_history(env, cur);
if (!add_new_state)
@@ -8402,6 +8588,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -8428,6 +8615,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
static int do_check(struct bpf_verifier_env *env)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
@@ -8704,7 +8892,7 @@ static int do_check(struct bpf_verifier_env *env)
process_bpf_exit:
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx);
+ &env->insn_idx, pop_log);
if (err < 0) {
if (err != -ENOENT)
return err;
@@ -9974,7 +10162,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->allow_ptr_leaks && !expect_blinding &&
+ if (env->bpf_capable && !expect_blinding &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -10227,6 +10415,7 @@ static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -10289,7 +10478,9 @@ out:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
}
- while (!pop_stack(env, NULL, NULL));
+ while (!pop_stack(env, NULL, NULL, false));
+ if (!ret && pop_log)
+ bpf_vlog_reset(&env->log, 0);
free_states(env);
if (ret)
/* clean aux data in case subprog was rejected */
@@ -10445,6 +10636,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
struct bpf_prog *tgt_prog = prog->aux->linked_prog;
u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
+ struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
struct bpf_trampoline *tr;
const struct btf_type *t;
@@ -10586,6 +10778,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_func_proto = t;
prog->aux->attach_btf_trace = true;
return 0;
+ case BPF_TRACE_ITER:
+ if (!btf_type_is_func(t)) {
+ verbose(env, "attach_btf_id %u is not a function\n",
+ btf_id);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+ prog->aux->attach_func_name = tname;
+ prog->aux->attach_func_proto = t;
+ if (!bpf_iter_prog_supported(prog))
+ return -EINVAL;
+ ret = btf_distill_func_proto(&env->log, btf, t,
+ tname, &fmodel);
+ return ret;
default:
if (!prog_extension)
return -EINVAL;
@@ -10696,7 +10904,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- is_priv = capable(CAP_SYS_ADMIN);
+ is_priv = bpf_capable();
if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
mutex_lock(&bpf_verifier_lock);
@@ -10737,7 +10945,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = is_priv;
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
+ env->bpf_capable = bpf_capable();
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
deleted file mode 100644
index 2cc5c8f4c800..000000000000
--- a/kernel/bpf/xskmap.c
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* XSKMAP used for AF_XDP sockets
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#include <linux/bpf.h>
-#include <linux/capability.h>
-#include <net/xdp_sock.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-int xsk_map_inc(struct xsk_map *map)
-{
- bpf_map_inc(&map->map);
- return 0;
-}
-
-void xsk_map_put(struct xsk_map *map)
-{
- bpf_map_put(&map->map);
-}
-
-static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *node;
- int err;
-
- node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
- if (!node)
- return ERR_PTR(-ENOMEM);
-
- err = xsk_map_inc(map);
- if (err) {
- kfree(node);
- return ERR_PTR(err);
- }
-
- node->map = map;
- node->map_entry = map_entry;
- return node;
-}
-
-static void xsk_map_node_free(struct xsk_map_node *node)
-{
- xsk_map_put(node->map);
- kfree(node);
-}
-
-static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
-{
- spin_lock_bh(&xs->map_list_lock);
- list_add_tail(&node->node, &xs->map_list);
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static void xsk_map_sock_delete(struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- struct xsk_map_node *n, *tmp;
-
- spin_lock_bh(&xs->map_list_lock);
- list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
- if (map_entry == n->map_entry) {
- list_del(&n->node);
- xsk_map_node_free(n);
- }
- }
- spin_unlock_bh(&xs->map_list_lock);
-}
-
-static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
-{
- struct bpf_map_memory mem;
- int err, numa_node;
- struct xsk_map *m;
- u64 size;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 ||
- attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
- return ERR_PTR(-EINVAL);
-
- numa_node = bpf_map_attr_numa_node(attr);
- size = struct_size(m, xsk_map, attr->max_entries);
-
- err = bpf_map_charge_init(&mem, size);
- if (err < 0)
- return ERR_PTR(err);
-
- m = bpf_map_area_alloc(size, numa_node);
- if (!m) {
- bpf_map_charge_finish(&mem);
- return ERR_PTR(-ENOMEM);
- }
-
- bpf_map_init_from_attr(&m->map, attr);
- bpf_map_charge_move(&m->map.memory, &mem);
- spin_lock_init(&m->lock);
-
- return &m->map;
-}
-
-static void xsk_map_free(struct bpf_map *map)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
-
- bpf_clear_redirect_map(map);
- synchronize_net();
- bpf_map_area_free(m);
-}
-
-static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- u32 index = key ? *(u32 *)key : U32_MAX;
- u32 *next = next_key;
-
- if (index >= m->map.max_entries) {
- *next = 0;
- return 0;
- }
-
- if (index == m->map.max_entries - 1)
- return -ENOENT;
- *next = index + 1;
- return 0;
-}
-
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
-{
- const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
- struct bpf_insn *insn = insn_buf;
-
- *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
- *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *)));
- *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map));
- *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0);
- *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
- *insn++ = BPF_MOV64_IMM(ret, 0);
- return insn - insn_buf;
-}
-
-static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return __xsk_map_lookup_elem(map, *(u32 *)key);
-}
-
-static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
-{
- return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *xs, *old_xs, **map_entry;
- u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xsk_map_node *node;
- struct socket *sock;
- int err;
-
- if (unlikely(map_flags > BPF_EXIST))
- return -EINVAL;
- if (unlikely(i >= m->map.max_entries))
- return -E2BIG;
-
- sock = sockfd_lookup(fd, &err);
- if (!sock)
- return err;
-
- if (sock->sk->sk_family != PF_XDP) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- xs = (struct xdp_sock *)sock->sk;
-
- if (!xsk_is_setup_for_bpf_map(xs)) {
- sockfd_put(sock);
- return -EOPNOTSUPP;
- }
-
- map_entry = &m->xsk_map[i];
- node = xsk_map_node_alloc(m, map_entry);
- if (IS_ERR(node)) {
- sockfd_put(sock);
- return PTR_ERR(node);
- }
-
- spin_lock_bh(&m->lock);
- old_xs = READ_ONCE(*map_entry);
- if (old_xs == xs) {
- err = 0;
- goto out;
- } else if (old_xs && map_flags == BPF_NOEXIST) {
- err = -EEXIST;
- goto out;
- } else if (!old_xs && map_flags == BPF_EXIST) {
- err = -ENOENT;
- goto out;
- }
- xsk_map_sock_add(xs, node);
- WRITE_ONCE(*map_entry, xs);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- return 0;
-
-out:
- spin_unlock_bh(&m->lock);
- sockfd_put(sock);
- xsk_map_node_free(node);
- return err;
-}
-
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
-{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs, **map_entry;
- int k = *(u32 *)key;
-
- if (k >= map->max_entries)
- return -EINVAL;
-
- spin_lock_bh(&m->lock);
- map_entry = &m->xsk_map[k];
- old_xs = xchg(map_entry, NULL);
- if (old_xs)
- xsk_map_sock_delete(old_xs, map_entry);
- spin_unlock_bh(&m->lock);
-
- return 0;
-}
-
-void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
- struct xdp_sock **map_entry)
-{
- spin_lock_bh(&map->lock);
- if (READ_ONCE(*map_entry) == xs) {
- WRITE_ONCE(*map_entry, NULL);
- xsk_map_sock_delete(xs, map_entry);
- }
- spin_unlock_bh(&map->lock);
-}
-
-const struct bpf_map_ops xsk_map_ops = {
- .map_alloc = xsk_map_alloc,
- .map_free = xsk_map_free,
- .map_get_next_key = xsk_map_get_next_key,
- .map_lookup_elem = xsk_map_lookup_elem,
- .map_gen_lookup = xsk_map_gen_lookup,
- .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
- .map_update_elem = xsk_map_update_elem,
- .map_delete_elem = xsk_map_delete_elem,
- .map_check_btf = map_check_no_btf,
-};
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 06b5ea9d899d..557a9b9d2244 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6508,33 +6508,6 @@ int cgroup_bpf_attach(struct cgroup *cgrp,
return ret;
}
-int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog,
- struct bpf_prog *new_prog)
-{
- struct bpf_cgroup_link *cg_link;
- int ret;
-
- if (link->ops != &bpf_cgroup_link_lops)
- return -EINVAL;
-
- cg_link = container_of(link, struct bpf_cgroup_link, link);
-
- mutex_lock(&cgroup_mutex);
- /* link might have been auto-released by dying cgroup, so fail */
- if (!cg_link->cgroup) {
- ret = -EINVAL;
- goto out_unlock;
- }
- if (old_prog && link->prog != old_prog) {
- ret = -EPERM;
- goto out_unlock;
- }
- ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..812a61afd538 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
return container_of(ns, struct cgroup_namespace, ns);
}
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9f892144db6b..6ff2578ecf17 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
}
+static int finish_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * idle_task_exit() will have switched to &init_mm, now
+ * clean up any remaining active_mm state.
+ */
+ if (mm != &init_mm)
+ idle->active_mm = &init_mm;
+ mmdrop(mm);
+ return 0;
+}
+
/*
* Hotplug state machine related functions
*/
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
- .teardown.single = NULL,
+ .teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2b7c9b67931d..ef94e906f05a 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -67,9 +67,7 @@ static int kgdb_break_asap;
struct debuggerinfo_struct kgdb_info[NR_CPUS];
-/**
- * kgdb_connected - Is a host GDB connected to us?
- */
+/* kgdb_connected - Is a host GDB connected to us? */
int kgdb_connected;
EXPORT_SYMBOL_GPL(kgdb_connected);
@@ -532,6 +530,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -668,6 +667,8 @@ return_normal:
if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
goto kgdb_restore;
+ atomic_inc(&ignore_console_lock_warning);
+
/* Call the I/O driver's pre_exception routine */
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
@@ -740,6 +741,8 @@ cpu_master_loop:
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
+ atomic_dec(&ignore_console_lock_warning);
+
if (!kgdb_single_step) {
raw_spin_unlock(&dbg_slave_lock);
/* Wait till all the CPUs have quit from the debugger. */
@@ -946,6 +949,14 @@ void kgdb_panic(const char *msg)
kgdb_breakpoint();
}
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ pr_crit("Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
void __weak kgdb_arch_late(void)
{
}
@@ -956,6 +967,9 @@ void __init dbg_late_init(void)
if (kgdb_io_module_registered)
kgdb_arch_late();
kdb_init(KDB_INIT_FULL);
+
+ if (kgdb_io_module_registered && kgdb_break_asap)
+ kgdb_initial_breakpoint();
}
static int
@@ -1051,14 +1065,6 @@ void kgdb_schedule_breakpoint(void)
}
EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-static void kgdb_initial_breakpoint(void)
-{
- kgdb_break_asap = 0;
-
- pr_crit("Waiting for connection from remote gdb...\n");
- kgdb_breakpoint();
-}
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -1067,15 +1073,22 @@ static void kgdb_initial_breakpoint(void)
*/
int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
{
+ struct kgdb_io *old_dbg_io_ops;
int err;
spin_lock(&kgdb_registration_lock);
- if (dbg_io_ops) {
- spin_unlock(&kgdb_registration_lock);
+ old_dbg_io_ops = dbg_io_ops;
+ if (old_dbg_io_ops) {
+ if (!old_dbg_io_ops->deinit) {
+ spin_unlock(&kgdb_registration_lock);
- pr_err("Another I/O driver is already registered with KGDB\n");
- return -EBUSY;
+ pr_err("KGDB I/O driver %s can't replace %s.\n",
+ new_dbg_io_ops->name, old_dbg_io_ops->name);
+ return -EBUSY;
+ }
+ pr_info("Replacing I/O driver %s with %s\n",
+ old_dbg_io_ops->name, new_dbg_io_ops->name);
}
if (new_dbg_io_ops->init) {
@@ -1090,12 +1103,18 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops) {
+ old_dbg_io_ops->deinit();
+ return 0;
+ }
+
pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
- if (kgdb_break_asap)
+ if (kgdb_break_asap &&
+ (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)))
kgdb_initial_breakpoint();
return 0;
@@ -1125,6 +1144,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops->deinit)
+ old_dbg_io_ops->deinit();
+
pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
@@ -1165,7 +1187,8 @@ static int __init opt_kgdb_wait(char *str)
kgdb_break_asap = 1;
kdb_init(KDB_INIT_EARLY);
- if (kgdb_io_module_registered)
+ if (kgdb_io_module_registered &&
+ IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))
kgdb_initial_breakpoint();
return 0;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 515379cbf209..ec190569f690 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -62,7 +62,7 @@ int kdb_grep_trailing;
/*
* Kernel debugger state flags
*/
-int kdb_flags;
+unsigned int kdb_flags;
/*
* kdb_lock protects updates to kdb_initial_cpu. Used to
@@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv)
argv[2]);
return 0;
}
- kdb_flags = (kdb_flags &
- ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK))
| (debugflags << KDB_DEBUG_FLAG_SHIFT);
return 0;
@@ -1108,7 +1107,8 @@ static int handle_ctrl_cmd(char *cmd)
switch (*cmd) {
case CTRL_P:
if (cmdptr != cmd_tail)
- cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) %
+ KDB_CMD_HISTORY_COUNT;
strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
case CTRL_N:
@@ -2081,7 +2081,8 @@ static int kdb_env(int argc, const char **argv)
}
if (KDB_DEBUG(MASK))
- kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+ kdb_printf("KDBDEBUG=0x%x\n",
+ (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT);
return 0;
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b1991043b7d8..334d48b16c36 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -236,7 +236,7 @@ exit_put:
* sysctl_perf_event_max_contexts_per_stack.
*/
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e296c5c59c6f..2e330f330303 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -442,8 +442,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -467,8 +466,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b772f2c671b..c300253a7b8e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -228,8 +228,9 @@ repeat:
goto repeat;
}
-void rcuwait_wake_up(struct rcuwait *w)
+int rcuwait_wake_up(struct rcuwait *w)
{
+ int ret = 0;
struct task_struct *task;
rcu_read_lock();
@@ -237,7 +238,7 @@ void rcuwait_wake_up(struct rcuwait *w)
/*
* Order condition vs @task, such that everything prior to the load
* of @task is visible. This is the condition as to why the user called
- * rcuwait_trywake() in the first place. Pairs with set_current_state()
+ * rcuwait_wake() in the first place. Pairs with set_current_state()
* barrier (A) in rcuwait_wait_event().
*
* WAIT WAKE
@@ -249,8 +250,10 @@ void rcuwait_wake_up(struct rcuwait *w)
task = rcu_dereference(w->task);
if (task)
- wake_up_process(task);
+ ret = wake_up_process(task);
rcu_read_unlock();
+
+ return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);
@@ -708,8 +711,12 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- profile_task_exit(tsk);
- kcov_task_exit(tsk);
+ /*
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
WARN_ON(blk_needs_flush_plug(tsk));
@@ -727,6 +734,16 @@ void __noreturn do_exit(long code)
*/
set_fs(USER_DS);
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ profile_task_exit(tsk);
+ kcov_task_exit(tsk);
+
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -744,13 +761,6 @@ void __noreturn do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20d501af4f2e..d63c324895ea 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -72,6 +72,7 @@ config IRQ_DOMAIN
config IRQ_SIM
bool
select IRQ_WORK
+ select IRQ_DOMAIN
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index b992f88c5613..48006608baf0 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,14 +1,31 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
+ * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com>
*/
-#include <linux/slab.h>
-#include <linux/irq_sim.h>
#include <linux/irq.h>
+#include <linux/irq_sim.h>
+#include <linux/irq_work.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct irq_sim_work_ctx {
+ struct irq_work work;
+ int irq_base;
+ unsigned int irq_count;
+ unsigned long *pending;
+ struct irq_domain *domain;
+};
+
+struct irq_sim_irq_ctx {
+ int irqnum;
+ bool enabled;
+ struct irq_sim_work_ctx *work_ctx;
+};
struct irq_sim_devres {
- struct irq_sim *sim;
+ struct irq_domain *domain;
};
static void irq_sim_irqmask(struct irq_data *data)
@@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type)
return 0;
}
+static int irq_sim_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool *state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled)
+ *state = test_bit(hwirq, irq_ctx->work_ctx->pending);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int irq_sim_set_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled) {
+ assign_bit(hwirq, irq_ctx->work_ctx->pending, state);
+ if (state)
+ irq_work_queue(&irq_ctx->work_ctx->work);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct irq_chip irq_sim_irqchip = {
- .name = "irq_sim",
- .irq_mask = irq_sim_irqmask,
- .irq_unmask = irq_sim_irqunmask,
- .irq_set_type = irq_sim_set_type,
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
+ .irq_get_irqchip_state = irq_sim_get_irqchip_state,
+ .irq_set_irqchip_state = irq_sim_set_irqchip_state,
};
static void irq_sim_handle_irq(struct irq_work *work)
{
struct irq_sim_work_ctx *work_ctx;
unsigned int offset = 0;
- struct irq_sim *sim;
int irqnum;
work_ctx = container_of(work, struct irq_sim_work_ctx, work);
- sim = container_of(work_ctx, struct irq_sim, work_ctx);
- while (!bitmap_empty(work_ctx->pending, sim->irq_count)) {
+ while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) {
offset = find_next_bit(work_ctx->pending,
- sim->irq_count, offset);
+ work_ctx->irq_count, offset);
clear_bit(offset, work_ctx->pending);
- irqnum = irq_sim_irqnum(sim, offset);
+ irqnum = irq_find_mapping(work_ctx->domain, offset);
handle_simple_irq(irq_to_desc(irqnum));
}
}
+static int irq_sim_domain_map(struct irq_domain *domain,
+ unsigned int virq, irq_hw_number_t hw)
+{
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+ struct irq_sim_irq_ctx *irq_ctx;
+
+ irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL);
+ if (!irq_ctx)
+ return -ENOMEM;
+
+ irq_set_chip(virq, &irq_sim_irqchip);
+ irq_set_chip_data(virq, irq_ctx);
+ irq_set_handler(virq, handle_simple_irq);
+ irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ irq_ctx->work_ctx = work_ctx;
+
+ return 0;
+}
+
+static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+ struct irq_sim_irq_ctx *irq_ctx;
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ irq_ctx = irq_data_get_irq_chip_data(irqd);
+
+ irq_set_handler(virq, NULL);
+ irq_domain_reset_irq_data(irqd);
+ kfree(irq_ctx);
+}
+
+static const struct irq_domain_ops irq_sim_domain_ops = {
+ .map = irq_sim_domain_map,
+ .unmap = irq_sim_domain_unmap,
+};
+
/**
- * irq_sim_init - Initialize the interrupt simulator: allocate a range of
- * dummy interrupts.
+ * irq_domain_create_sim - Create a new interrupt simulator irq_domain and
+ * allocate a range of dummy interrupts.
*
- * @sim: The interrupt simulator object to initialize.
- * @num_irqs: Number of interrupts to allocate
+ * @fnode: struct fwnode_handle to be associated with this domain.
+ * @num_irqs: Number of interrupts to allocate.
*
- * On success: return the base of the allocated interrupt range.
- * On failure: a negative errno.
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
*/
-int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
+struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
{
- int i;
+ struct irq_sim_work_ctx *work_ctx;
- sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL);
- if (!sim->irqs)
- return -ENOMEM;
+ work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL);
+ if (!work_ctx)
+ goto err_out;
- sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0);
- if (sim->irq_base < 0) {
- kfree(sim->irqs);
- return sim->irq_base;
- }
+ work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!work_ctx->pending)
+ goto err_free_work_ctx;
- sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL);
- if (!sim->work_ctx.pending) {
- kfree(sim->irqs);
- irq_free_descs(sim->irq_base, num_irqs);
- return -ENOMEM;
- }
+ work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs,
+ &irq_sim_domain_ops,
+ work_ctx);
+ if (!work_ctx->domain)
+ goto err_free_bitmap;
- for (i = 0; i < num_irqs; i++) {
- sim->irqs[i].irqnum = sim->irq_base + i;
- sim->irqs[i].enabled = false;
- irq_set_chip(sim->irq_base + i, &irq_sim_irqchip);
- irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]);
- irq_set_handler(sim->irq_base + i, &handle_simple_irq);
- irq_modify_status(sim->irq_base + i,
- IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
- }
+ work_ctx->irq_count = num_irqs;
+ init_irq_work(&work_ctx->work, irq_sim_handle_irq);
- init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
- sim->irq_count = num_irqs;
+ return work_ctx->domain;
- return sim->irq_base;
+err_free_bitmap:
+ bitmap_free(work_ctx->pending);
+err_free_work_ctx:
+ kfree(work_ctx);
+err_out:
+ return ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL_GPL(irq_sim_init);
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
/**
- * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt
- * descriptors and allocated memory.
+ * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
+ * the interrupt descriptors and allocated memory.
*
- * @sim: The interrupt simulator to tear down.
+ * @domain: The interrupt simulator domain to tear down.
*/
-void irq_sim_fini(struct irq_sim *sim)
+void irq_domain_remove_sim(struct irq_domain *domain)
{
- irq_work_sync(&sim->work_ctx.work);
- bitmap_free(sim->work_ctx.pending);
- irq_free_descs(sim->irq_base, sim->irq_count);
- kfree(sim->irqs);
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+
+ irq_work_sync(&work_ctx->work);
+ bitmap_free(work_ctx->pending);
+ kfree(work_ctx);
+
+ irq_domain_remove(domain);
}
-EXPORT_SYMBOL_GPL(irq_sim_fini);
+EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
-static void devm_irq_sim_release(struct device *dev, void *res)
+static void devm_irq_domain_release_sim(struct device *dev, void *res)
{
struct irq_sim_devres *this = res;
- irq_sim_fini(this->sim);
+ irq_domain_remove_sim(this->domain);
}
/**
- * irq_sim_init - Initialize the interrupt simulator for a managed device.
+ * devm_irq_domain_create_sim - Create a new interrupt simulator for
+ * a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @sim: The interrupt simulator object to initialize.
+ * @fnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
- * On success: return the base of the allocated interrupt range.
- * On failure: a negative errno.
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
*/
-int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
- unsigned int num_irqs)
+struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
{
struct irq_sim_devres *dr;
- int rv;
- dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL);
+ dr = devres_alloc(devm_irq_domain_release_sim,
+ sizeof(*dr), GFP_KERNEL);
if (!dr)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- rv = irq_sim_init(sim, num_irqs);
- if (rv < 0) {
+ dr->domain = irq_domain_create_sim(fwnode, num_irqs);
+ if (IS_ERR(dr->domain)) {
devres_free(dr);
- return rv;
+ return dr->domain;
}
- dr->sim = sim;
devres_add(dev, dr);
-
- return rv;
-}
-EXPORT_SYMBOL_GPL(devm_irq_sim_init);
-
-/**
- * irq_sim_fire - Enqueue an interrupt.
- *
- * @sim: The interrupt simulator object.
- * @offset: Offset of the simulated interrupt which should be fired.
- */
-void irq_sim_fire(struct irq_sim *sim, unsigned int offset)
-{
- if (sim->irqs[offset].enabled) {
- set_bit(offset, sim->work_ctx.pending);
- irq_work_queue(&sim->work_ctx.work);
- }
-}
-EXPORT_SYMBOL_GPL(irq_sim_fire);
-
-/**
- * irq_sim_irqnum - Get the allocated number of a dummy interrupt.
- *
- * @sim: The interrupt simulator object.
- * @offset: Offset of the simulated interrupt for which to retrieve
- * the number.
- */
-int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset)
-{
- return sim->irqs[offset].irqnum;
+ return dr->domain;
}
-EXPORT_SYMBOL_GPL(irq_sim_irqnum);
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 35b8d97c3a1d..a4c2c915511d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct device_node *of_node = to_of_node(fwnode);
struct irqchip_fwid *fwid;
struct irq_domain *domain;
static atomic_t unknown_domains;
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
- GFP_KERNEL, of_node_to_nid(of_node));
+ GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
if (!domain)
return NULL;
@@ -162,30 +161,16 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->name = fwid->name;
break;
}
-#ifdef CONFIG_ACPI
- } else if (is_acpi_device_node(fwnode)) {
- struct acpi_buffer buf = {
- .length = ACPI_ALLOCATE_BUFFER,
- };
- acpi_handle handle;
-
- handle = acpi_device_handle(to_acpi_device_node(fwnode));
- if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
- domain->name = buf.pointer;
- domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
- }
-
- domain->fwnode = fwnode;
-#endif
- } else if (of_node) {
+ } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) ||
+ is_software_node(fwnode)) {
char *name;
/*
- * DT paths contain '/', which debugfs is legitimately
+ * fwnode paths contain '/', which debugfs is legitimately
* unhappy about. Replace them with ':', which does
* the trick and is not as offensive as '\'...
*/
- name = kasprintf(GFP_KERNEL, "%pOF", of_node);
+ name = kasprintf(GFP_KERNEL, "%pfw", fwnode);
if (!name) {
kfree(domain);
return NULL;
@@ -210,7 +195,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
}
- of_node_get(of_node);
+ fwnode_handle_get(fwnode);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
@@ -259,7 +244,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
- of_node_put(irq_domain_get_of_node(domain));
+ fwnode_handle_put(domain->fwnode);
if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
kfree(domain->name);
kfree(domain);
@@ -1047,6 +1032,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
return virq;
}
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
+
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
* irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
@@ -1248,18 +1245,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
EXPORT_SYMBOL(irq_domain_set_info);
/**
- * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
- * @irq_data: The pointer to irq_data
- */
-void irq_domain_reset_irq_data(struct irq_data *irq_data)
-{
- irq_data->hwirq = 0;
- irq_data->chip = &no_irq_chip;
- irq_data->chip_data = NULL;
-}
-EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
-
-/**
* irq_domain_free_irqs_common - Clear irq_data and free the parent
* @domain: Interrupt domain to match
* @virq: IRQ number to start with
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 453a8a0f4804..761911168438 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2619,6 +2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
if (chip->irq_get_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
if (chip->irq_set_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 48b5d1b6af4d..eca83965b631 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
- oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &work->llnode);
} else {
__irq_work_queue_local(work);
}
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
return true;
}
+void irq_work_single(void *arg)
+{
+ struct irq_work *work = arg;
+ int flags;
+
+ /*
+ * Clear the PENDING bit, after this point the @work
+ * can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
+ */
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+ lockdep_irq_work_enter(work);
+ work->func(work);
+ lockdep_irq_work_exit(work);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ flags &= ~IRQ_WORK_PENDING;
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- llist_for_each_entry_safe(work, tmp, llnode, llnode) {
- int flags;
- /*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
- */
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
- lockdep_irq_work_enter(work);
- work->func(work);
- lockdep_irq_work_exit(work);
- /*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
- */
- flags &= ~IRQ_WORK_PENDING;
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
- }
+ llist_for_each_entry_safe(work, tmp, llnode, llnode)
+ irq_work_single(work);
}
/*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3f310df4a693..0fbdee78266b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void)
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
+ void *buffer, size_t *length,
loff_t *ppos)
{
int ret;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 8d1c15832e55..166d7bf49666 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void)
return 0;
}
-int sysctl_latencytop(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int err;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd3cc0854c32..4c057dd8e93b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3616,13 +3616,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
/*
* Hardirqs will be enabled:
*/
-static void __trace_hardirqs_on_caller(unsigned long ip)
+static void __trace_hardirqs_on_caller(void)
{
struct task_struct *curr = current;
- /* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
-
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
@@ -3635,15 +3632,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))
- return;
-
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(hardirqs_on_events);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
}
-void lockdep_hardirqs_on(unsigned long ip)
+/**
+ * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
+ * @ip: Caller address
+ *
+ * Invoked before a possible transition to RCU idle from exit to user or
+ * guest mode. This ensures that all RCU operations are done before RCU
+ * stops watching. After the RCU transition lockdep_hardirqs_on() has to be
+ * invoked to set the final state.
+ */
+void lockdep_hardirqs_on_prepare(unsigned long ip)
{
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -3679,20 +3680,62 @@ void lockdep_hardirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
return;
+ current->hardirq_chain_key = current->curr_chain_key;
+
current->lockdep_recursion++;
- __trace_hardirqs_on_caller(ip);
+ __trace_hardirqs_on_caller();
lockdep_recursion_finish();
}
-NOKPROBE_SYMBOL(lockdep_hardirqs_on);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
+
+void noinstr lockdep_hardirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || curr->lockdep_recursion))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * Ensure the lock stack remained unchanged between
+ * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on().
+ */
+ DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
+ current->curr_chain_key);
+
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
+}
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void lockdep_hardirqs_off(unsigned long ip)
+void noinstr lockdep_hardirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!debug_locks || curr->lockdep_recursion))
return;
/*
@@ -3710,10 +3753,11 @@ void lockdep_hardirqs_off(unsigned long ip)
curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(hardirqs_off_events);
- } else
+ } else {
debug_atomic_inc(redundant_hardirqs_off);
+ }
}
-NOKPROBE_SYMBOL(lockdep_hardirqs_off);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -4389,8 +4433,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
dump_stack();
}
-static int match_held_lock(const struct held_lock *hlock,
- const struct lockdep_map *lock)
+static noinstr int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
@@ -4677,7 +4721,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip)
return 0;
}
-static nokprobe_inline
+static __always_inline
int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
@@ -4937,7 +4981,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held_type(const struct lockdep_map *lock, int read)
+noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
diff --git a/kernel/module.c b/kernel/module.c
index a0f201d2e184..be5413903d20 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4,6 +4,9 @@
Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
*/
+
+#define INCLUDE_VERMAGIC
+
#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ed9882108cd2..b03df67621d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,8 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -257,37 +259,296 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
-SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+static int check_setns_flags(unsigned long flags)
{
- struct task_struct *tsk = current;
- struct nsproxy *new_nsproxy;
- struct file *file;
- struct ns_common *ns;
- int err;
+ if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
+ CLONE_NEWCGROUP)))
+ return -EINVAL;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
+#ifndef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET)
+ return -EINVAL;
+#endif
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ return 0;
+}
+
+static void put_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+
+ if (flags & CLONE_NEWUSER)
+ put_cred(nsset_cred(nsset));
+ /*
+ * We only created a temporary copy if we attached to more than just
+ * the mount namespace.
+ */
+ if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
+ free_fs_struct(nsset->fs);
+ if (nsset->nsproxy)
+ free_nsproxy(nsset->nsproxy);
+}
+
+static int prepare_nsset(unsigned flags, struct nsset *nsset)
+{
+ struct task_struct *me = current;
+
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ if (IS_ERR(nsset->nsproxy))
+ return PTR_ERR(nsset->nsproxy);
- new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
- if (IS_ERR(new_nsproxy)) {
- err = PTR_ERR(new_nsproxy);
+ if (flags & CLONE_NEWUSER)
+ nsset->cred = prepare_creds();
+ else
+ nsset->cred = current_cred();
+ if (!nsset->cred)
goto out;
+
+ /* Only create a temporary copy of fs_struct if we really need to. */
+ if (flags == CLONE_NEWNS) {
+ nsset->fs = me->fs;
+ } else if (flags & CLONE_NEWNS) {
+ nsset->fs = copy_fs_struct(me->fs);
+ if (!nsset->fs)
+ goto out;
}
- err = ns->ops->install(new_nsproxy, ns);
- if (err) {
- free_nsproxy(new_nsproxy);
- goto out;
+ nsset->flags = flags;
+ return 0;
+
+out:
+ put_nsset(nsset);
+ return -ENOMEM;
+}
+
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
+/*
+ * This is the inverse operation to unshare().
+ * Ordering is equivalent to the standard ordering used everywhere else
+ * during unshare and process creation. The switch to the new set of
+ * namespaces occurs at the point of no return after installation of
+ * all requested namespaces was successful in commit_nsset().
+ */
+static int validate_nsset(struct nsset *nsset, struct pid *pid)
+{
+ int ret = 0;
+ unsigned flags = nsset->flags;
+ struct user_namespace *user_ns = NULL;
+ struct pid_namespace *pid_ns = NULL;
+ struct nsproxy *nsp;
+ struct task_struct *tsk;
+
+ /* Take a "snapshot" of the target task's namespaces. */
+ rcu_read_lock();
+ tsk = pid_task(pid, PIDTYPE_PID);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+
+ task_lock(tsk);
+ nsp = tsk->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ task_unlock(tsk);
+ if (!nsp) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ pid_ns = task_active_pid_ns(tsk);
+ if (unlikely(!pid_ns)) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_pid_ns(pid_ns);
+ }
+#endif
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ user_ns = get_user_ns(__task_cred(tsk)->user_ns);
+#endif
+ rcu_read_unlock();
+
+ /*
+ * Install requested namespaces. The caller will have
+ * verified earlier that the requested namespaces are
+ * supported on this kernel. We don't report errors here
+ * if a namespace is requested that isn't supported.
+ */
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ ret = validate_ns(nsset, &user_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+ if (flags & CLONE_NEWNS) {
+ ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS) {
+ ret = validate_ns(nsset, &nsp->uts_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC) {
+ ret = validate_ns(nsset, &nsp->ipc_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ ret = validate_ns(nsset, &pid_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP) {
+ ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET) {
+ ret = validate_ns(nsset, &nsp->net_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+out:
+ if (pid_ns)
+ put_pid_ns(pid_ns);
+ if (nsp)
+ put_nsproxy(nsp);
+ put_user_ns(user_ns);
+
+ return ret;
+}
+
+/*
+ * This is the point of no return. There are just a few namespaces
+ * that do some actual work here and it's sufficiently minimal that
+ * a separate ns_common operation seems unnecessary for now.
+ * Unshare is doing the same thing. If we'll end up needing to do
+ * more in a given namespace or a helper here is ultimately not
+ * exported anymore a simple commit handler for each namespace
+ * should be added to ns_common.
+ */
+static void commit_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+ struct task_struct *me = current;
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ /* transfer ownership */
+ commit_creds(nsset_cred(nsset));
+ nsset->cred = NULL;
+ }
+#endif
+
+ /* We only need to commit if we have used a temporary fs_struct. */
+ if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
+ set_fs_root(me->fs, &nsset->fs->root);
+ set_fs_pwd(me->fs, &nsset->fs->pwd);
}
- switch_task_namespaces(tsk, new_nsproxy);
- perf_event_namespaces(tsk);
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ exit_sem(me);
+#endif
+
+ /* transfer ownership */
+ switch_task_namespaces(me, nsset->nsproxy);
+ nsset->nsproxy = NULL;
+}
+
+SYSCALL_DEFINE2(setns, int, fd, int, flags)
+{
+ struct file *file;
+ struct ns_common *ns = NULL;
+ struct nsset nsset = {};
+ int err = 0;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ if (proc_ns_file(file)) {
+ ns = get_proc_ns(file_inode(file));
+ if (flags && (ns->ops->type != flags))
+ err = -EINVAL;
+ flags = ns->ops->type;
+ } else if (!IS_ERR(pidfd_pid(file))) {
+ err = check_setns_flags(flags);
+ } else {
+ err = -EBADF;
+ }
+ if (err)
+ goto out;
+
+ err = prepare_nsset(flags, &nsset);
+ if (err)
+ goto out;
+
+ if (proc_ns_file(file))
+ err = validate_ns(&nsset, ns);
+ else
+ err = validate_nsset(&nsset, file->private_data);
+ if (!err) {
+ commit_nsset(&nsset);
+ perf_event_namespaces(current);
+ }
+ put_nsset(&nsset);
out:
fput(file);
return err;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 01f8ba32cc0c..0e5ac162c3a8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
@@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9fdd6a42ad6a..3132d6f860a8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
unsigned int old;
diff --git a/kernel/relay.c b/kernel/relay.c
index d0c9c287680a..90c7a002436d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1177,10 +1177,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = relay_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
+ .release = relay_pipe_buf_release,
+ .try_steal = generic_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
};
static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0ae29fd57817..8298b2c240ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -21,6 +21,7 @@
#include "../smpboot.h"
#include "pelt.h"
+#include "smp.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -220,6 +221,13 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -315,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
+
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
#endif
-
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
@@ -633,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1111,8 +1111,7 @@ static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
@@ -1540,7 +1539,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2274,16 +2273,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
+ /*
+ * rq::ttwu_pending racy indication of out-standing wakeups.
+ * Races such that false-negatives are possible, since they
+ * are shorter lived that false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -2293,56 +2299,30 @@ void sched_ttwu_pending(void)
rq_unlock_irqrestore(rq, &rf);
}
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
-
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
-
- /*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
- *
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
- */
- irq_enter();
- sched_ttwu_pending();
+ struct rq *rq = cpu_rq(cpu);
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
- irq_exit();
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry);
}
void wake_up_if_idle(int cpu)
@@ -2373,6 +2353,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2381,11 +2393,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq_flags rf;
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
- }
#endif
rq_lock(rq, &rf);
@@ -2569,7 +2578,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#ifdef CONFIG_SMP
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -2593,6 +2610,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ */
+ if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+ goto unlock;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
@@ -2602,28 +2629,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2751,6 +2762,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2767,7 +2781,7 @@ void set_numabalancing_state(bool enabled)
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2841,8 +2855,8 @@ static void __init init_schedstats(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -3951,6 +3965,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ const struct sched_class *class;
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
@@ -3984,22 +4020,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
restart:
-#ifdef CONFIG_SMP
- /*
- * We must do the balancing pass before put_next_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
- break;
- }
-#endif
-
- put_prev_task(rq, prev);
+ put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
@@ -4689,7 +4710,7 @@ int idle_cpu(int cpu)
return 0;
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -6243,13 +6264,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
@@ -6539,7 +6561,6 @@ int sched_cpu_dying(unsigned int cpu)
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6642,6 +6663,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6694,7 +6717,6 @@ void __init sched_init(void)
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
@@ -6716,7 +6738,6 @@ void __init sched_init(void)
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -6744,6 +6765,8 @@ void __init sched_init(void)
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
+
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -7438,6 +7461,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -7466,6 +7491,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..941c28cf9738 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <asm/irq_regs.h>
#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = task_pt_regs(tsk);
+ struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;
@@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+ __this_cpu_add(ca->cpuusage->usages[index], cputime);
rcu_read_unlock();
}
@@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
- this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+ __this_cpu_add(ca->cpustat->cpustat[index], val);
rcu_read_unlock();
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 239970b991c0..36c54265bb2b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
@@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
(long long)(p->nvcsw + p->nivcsw),
@@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
+ SEQ_printf(m, " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n");
SEQ_printf(m, "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ "------------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -638,7 +638,6 @@ do { \
P(nr_running);
P(nr_switches);
- P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3e5b54715b..35f4cc024dcf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -191,7 +191,7 @@ static void update_sysctl(void)
#undef SET_SYSCTL
}
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
{
update_sysctl();
}
@@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
@@ -1094,7 +1093,7 @@ struct numa_group {
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
- unsigned long faults[0];
+ unsigned long faults[];
};
/*
@@ -3441,52 +3440,46 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+ se->avg.util_sum = se->avg.util_avg * divider;
/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/* Nothing to update */
if (!delta)
return;
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
/* Update parent cfs_rq runnable */
add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
}
static inline void
@@ -3496,19 +3489,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
unsigned long load_avg;
u64 load_sum = 0;
s64 delta_sum;
+ u32 divider;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
if (runnable_sum >= 0) {
/*
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
* the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ runnable_sum = min_t(long, runnable_sum, divider);
} else {
/*
* Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3533,7 +3533,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+ load_avg = div_s64(load_sum, divider);
delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
delta_avg = load_avg - se->avg.load_avg;
@@ -3697,6 +3697,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
/*
@@ -3873,6 +3877,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -4054,7 +4060,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -4588,16 +4594,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4609,13 +4615,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4722,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4744,29 +4782,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);
- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
/*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
- /*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
- */
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -4933,14 +4955,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/*
* This check is repeated as we release cfs_b->lock while we unthrottle.
*/
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
- cfs_b->distribute_running = 1;
+ while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
@@ -5054,10 +5074,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
- if (cfs_b->distribute_running) {
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
- return;
- }
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
@@ -5067,9 +5083,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- if (runtime)
- cfs_b->distribute_running = 1;
-
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
@@ -5078,7 +5091,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -5139,8 +5151,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -5170,6 +5181,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -5199,8 +5212,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
@@ -5221,7 +5232,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
- cfs_b->distribute_running = 0;
cfs_b->slack_started = false;
}
@@ -5506,28 +5516,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
list_add_leaf_cfs_rq(cfs_rq);
}
-enqueue_throttle:
- if (!se) {
- add_nr_running(rq, 1);
- /*
- * Since new tasks are assigned an initial util_avg equal to
- * half of the spare capacity of their CPU, tiny tasks have the
- * ability to cross the overutilized threshold, which will
- * result in the load balancer ruining all the task placement
- * done by EAS. As a way to mitigate that effect, do not account
- * for the first enqueue operation of new tasks during the
- * overutilized flag detection.
- *
- * A better way of solving this problem would be to wait for
- * the PELT signals of tasks to converge before taking them
- * into account, but that is not straightforward to implement,
- * and the following generally works well enough in practice.
- */
- if (flags & ENQUEUE_WAKEUP)
- update_overutilized_status(rq);
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, 1);
- }
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled; the cfs_rq_throttled()
@@ -5737,7 +5746,7 @@ static int wake_wide(struct task_struct *p)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int factor = __this_cpu_read(sd_llc_size);
if (master < slave)
swap(master, slave);
@@ -5846,8 +5855,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag);
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5930,7 +5938,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
+ group = find_idlest_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
@@ -6671,9 +6679,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
rcu_read_lock();
for_each_domain(cpu, tmp) {
- if (!(tmp->flags & SD_LOAD_BALANCE))
- break;
-
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
@@ -8584,7 +8589,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
*/
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -8702,8 +8707,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -9434,7 +9438,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu, balance_cpu = -1;
+ int cpu;
/*
* Ensure the balancing environment is consistent; can happen
@@ -9455,18 +9459,12 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
- balance_cpu = cpu;
- break;
+ /* Are we the first idle CPU? */
+ return cpu == env->dst_cpu;
}
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
-
- /*
- * First idle CPU or the first CPU(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
+ /* Are we the first CPU of this group ? */
+ return group_balance_cpu(sg) == env->dst_cpu;
}
/*
@@ -9819,9 +9817,8 @@ static int active_load_balance_cpu_stop(void *data)
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
}
if (likely(sd)) {
@@ -9910,9 +9907,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
max_cost += sd->max_newidle_lb_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
@@ -10029,17 +10023,20 @@ static void kick_ilb(unsigned int flags)
if (ilb_cpu >= nr_cpu_ids)
return;
+ /*
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+ * the first flag owns it; cleared by nohz_csd_func().
+ */
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
/*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
+ * This way we generate an IPI on the target CPU which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
- smp_send_reschedule(ilb_cpu);
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/*
@@ -10377,20 +10374,14 @@ abort:
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- int this_cpu = this_rq->cpu;
- unsigned int flags;
+ unsigned int flags = this_rq->nohz_idle_balance;
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ if (!flags)
return false;
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
+ this_rq->nohz_idle_balance = 0;
- /* could be _relaxed() */
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- if (!(flags & NOHZ_KICK_MASK))
+ if (idle != CPU_IDLE)
return false;
_nohz_idle_balance(this_rq, flags, idle);
@@ -10450,7 +10441,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -10501,9 +10492,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
int continue_balancing = 1;
u64 t0, domain_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b743bf38f08f..05deb81bb3e3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -289,7 +289,11 @@ static void do_idle(void)
*/
smp_mb__after_atomic();
- sched_ttwu_pending();
+ /*
+ * RCU relies on this call to be done outside of an RCU read-side
+ * critical section.
+ */
+ flush_smp_call_function_from_idle();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b647d04d9c8b..b4b1ff96642f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
return 1;
}
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment otherwise the remaining part of the segment
+ * will be considered as idle time whereas it's not yet elapsed and this will
+ * generate unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and is
+ * equals to :
+ *
+ * LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
+ *
+ * The same care must be taken when a sched entity is added, updated or
+ * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
+ * and the cfs rq, to which they are attached, have the same position in the
+ * time segment because they use the same clock. This means that we can use
+ * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
+ * if it's more convenient.
+ */
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..f395ddb75f38 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;
return 0;
@@ -2714,9 +2724,8 @@ static void sched_rt_do_global(void)
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
@@ -2754,9 +2763,8 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..1d4e94c1e5fe 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -349,7 +349,6 @@ struct cfs_bandwidth {
u8 idle;
u8 period_active;
- u8 distribute_running;
u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
@@ -890,12 +889,15 @@ struct rq {
#ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
- atomic_t nohz_flags;
+ atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned long nr_load_updates;
+#ifdef CONFIG_SMP
+ unsigned int ttwu_pending;
+#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -951,6 +953,7 @@ struct rq {
struct callback_head *balance_callback;
+ unsigned char nohz_idle_balance;
unsigned char idle_balance;
unsigned long misfit_task_load;
@@ -979,7 +982,7 @@ struct rq {
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -1020,10 +1023,6 @@ struct rq {
unsigned int ttwu_local;
#endif
-#ifdef CONFIG_SMP
- struct llist_head wake_list;
-#endif
-
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
@@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-extern void sched_ttwu_pending(void);
-
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -1461,7 +1458,7 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
- unsigned long cpumask[0];
+ unsigned long cpumask[];
};
static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
-extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
-
-#else
-
-static inline void sched_ttwu_pending(void) { }
+extern void flush_smp_call_function_from_idle(void);
-static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
-
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
#include "stats.h"
#include "autogroup.h"
@@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
*/
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
-#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
+#define WF_ON_RQ 0x08 /* Wakee is on_rq */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644
index 000000000000..9620e323162c
--- /dev/null
+++ b/kernel/sched/smp.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler internal SMP callback types and methods between the scheduler
+ * and other internal parts of the core kernel:
+ */
+
+extern void sched_ttwu_pending(void *arg);
+
+extern void send_call_function_single_ipi(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8344757bba6e..ba81187bb7af 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
- return -1;
- }
-
printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
@@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
+ if (sd->flags & (SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
@@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
/* Flags needing groups don't count if only 1 group in parent */
if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ pflags &= ~(SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -209,7 +199,7 @@ bool sched_energy_update;
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl,
.cache_nice_tries = 0,
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
+ .flags = 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..d653d8426de9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
}
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
diff --git a/kernel/smp.c b/kernel/smp.c
index 84303197caf9..472c2b274c65 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -22,11 +22,9 @@
#include <linux/hypervisor.h>
#include "smpboot.h"
+#include "sched/smp.h"
-enum {
- CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_SYNCHRONOUS = 0x02,
-};
+#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
call_single_data_t __percpu *csd;
@@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* still pending.
*/
flush_smp_call_function_queue(false);
+ irq_work_run();
return 0;
}
@@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+ /*
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
+ */
+ if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+ send_call_function_single_ipi(cpu);
+}
+
/*
* Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, call_single_data_t *csd,
- smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
{
if (cpu == smp_processor_id()) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
unsigned long flags;
/*
@@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
return 0;
}
-
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd);
return -ENXIO;
}
- csd->func = func;
- csd->info = info;
-
- /*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
- *
- * If IPIs can go out of order to the cache coherency protocol
- * in an architecture, sufficient synchronisation should be added
- * to arch code to make it appear to obey cache coherency WRT
- * locking and barrier primitives. Generic code isn't really
- * equipped to do the right thing...
- */
- if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ __smp_call_single_queue(cpu, &csd->llist);
return 0;
}
@@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void)
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
- struct llist_head *head;
- struct llist_node *entry;
call_single_data_t *csd, *csd_next;
+ struct llist_node *entry, *prev;
+ struct llist_head *head;
static bool warned;
lockdep_assert_irqs_disabled();
@@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
- llist_for_each_entry(csd, entry, llist)
- pr_warn("IPI callback %pS sent to offline CPU\n",
- csd->func);
+ llist_for_each_entry(csd, entry, llist) {
+ switch (CSD_TYPE(csd)) {
+ case CSD_TYPE_ASYNC:
+ case CSD_TYPE_SYNC:
+ case CSD_TYPE_IRQ_WORK:
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ break;
+
+ case CSD_TYPE_TTWU:
+ pr_warn("IPI task-wakeup sent to offline CPU\n");
+ break;
+
+ default:
+ pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+ CSD_TYPE(csd));
+ break;
+ }
+ }
}
+ /*
+ * First; run all SYNC callbacks, people are waiting for us.
+ */
+ prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
- smp_call_func_t func = csd->func;
- void *info = csd->info;
-
/* Do we wait until *after* callback? */
- if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
func(info);
csd_unlock(csd);
} else {
- csd_unlock(csd);
- func(info);
+ prev = &csd->llist;
}
}
+ if (!entry)
+ return;
+
/*
- * Handle irq works queued remotely by irq_work_queue_on().
- * Smp functions above are typically synchronous so they
- * better run first since some other CPUs may be busy waiting
- * for them.
+ * Second; run all !SYNC callbacks.
*/
- irq_work_run();
+ prev = NULL;
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ int type = CSD_TYPE(csd);
+
+ if (type != CSD_TYPE_TTWU) {
+ if (prev) {
+ prev->next = &csd_next->llist;
+ } else {
+ entry = &csd_next->llist;
+ }
+
+ if (type == CSD_TYPE_ASYNC) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ csd_unlock(csd);
+ func(info);
+ } else if (type == CSD_TYPE_IRQ_WORK) {
+ irq_work_single(csd);
+ }
+
+ } else {
+ prev = &csd->llist;
+ }
+ }
+
+ /*
+ * Third; only CSD_TYPE_TTWU is left, issue those.
+ */
+ if (entry)
+ sched_ttwu_pending(entry);
+}
+
+void flush_smp_call_function_from_idle(void)
+{
+ unsigned long flags;
+
+ if (llist_empty(this_cpu_ptr(&call_single_queue)))
+ return;
+
+ local_irq_save(flags);
+ flush_smp_call_function_queue(true);
+ local_irq_restore(flags);
}
/*
@@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
- .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+ .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
};
int this_cpu;
int err;
@@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
csd_lock(csd);
}
- err = generic_exec_single(cpu, csd, func, info);
+ csd->func = func;
+ csd->info = info;
+
+ err = generic_exec_single(cpu, csd);
if (wait)
csd_lock_wait(csd);
@@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
csd->flags = CSD_FLAG_LOCK;
smp_wmb();
- err = generic_exec_single(cpu, csd, csd->func, csd->info);
+ err = generic_exec_single(cpu, csd);
out:
preempt_enable();
@@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd_lock(csd);
if (wait)
- csd->flags |= CSD_FLAG_SYNCHRONOUS;
+ csd->flags |= CSD_TYPE_SYNC;
csd->func = func;
csd->info = info;
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -598,6 +669,24 @@ void __init smp_init(void)
{
int num_nodes, num_cpus;
+ /*
+ * Ensure struct irq_work layout matches so that
+ * flush_smp_call_function_queue() can do horrible things.
+ */
+ BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+ offsetof(struct __call_single_data, llist));
+ BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+ offsetof(struct __call_single_data, func));
+ BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+ offsetof(struct __call_single_data, flags));
+
+ /*
+ * Assert the CSD_TYPE_TTWU layout is similar enough
+ * for task_struct to be on the @call_single_queue.
+ */
+ BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
+ offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
+
idle_threads_init();
cpuhp_threads_init();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f15d292e44c..715774d8c55f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,9 @@
#include <linux/bpf.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
+#include <linux/coredump.h>
+#include <linux/latencytop.h>
+#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -103,22 +106,6 @@
#if defined(CONFIG_SYSCTL)
-/* External variables not in a header file. */
-extern int suid_dumpable;
-#ifdef CONFIG_COREDUMP
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-#endif
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-extern int percpu_pagelist_fraction;
-extern int latencytop_enabled;
-extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
-#ifndef CONFIG_MMU
-extern int sysctl_nr_trim_pages;
-#endif
-
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
@@ -161,24 +148,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
-#ifdef CONFIG_SPARC
-#endif
-
-#ifdef CONFIG_PARISC
-extern int pwrsw_enabled;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
-extern int unaligned_enabled;
-#endif
-
-#ifdef CONFIG_IA64
-extern int unaligned_dump_stack;
-#endif
-
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
-extern int no_unaligned_warning;
-#endif
#ifdef CONFIG_PROC_SYSCTL
@@ -208,102 +177,1472 @@ enum sysctl_writes_mode {
};
static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos);
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
#endif
+
+#endif /* CONFIG_SYSCTL */
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
+static int bpf_stats_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
#endif
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(char *data, int maxlen, int write,
+ char *buffer, size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char c, *p;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
+ p = buffer;
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
+ c = *(p++);
+ if (c == 0 || c == '\n')
+ break;
+ data[len++] = c;
+ }
+ data[len] = 0;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, data, len);
+ if (len < *lenp) {
+ buffer[len] = '\n';
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
+/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
+
+ return _proc_do_string(table->data, table->maxlen, write, buffer, lenp,
+ ppos);
+}
+
+static size_t proc_skip_spaces(char **buf)
+{
+ size_t ret;
+ char *tmp = skip_spaces(*buf);
+ ret = tmp - *buf;
+ *buf = tmp;
+ return ret;
+}
+
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+ while (*size) {
+ if (**buf != v)
+ break;
+ (*size)--;
+ (*buf)++;
+ }
+}
+
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
+ *
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+ unsigned long *val, bool *neg,
+ const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+ int len;
+ char *p, tmp[TMPBUFLEN];
+
+ if (!*size)
+ return -EINVAL;
+
+ len = *size;
+ if (len > TMPBUFLEN - 1)
+ len = TMPBUFLEN - 1;
+
+ memcpy(tmp, *buf, len);
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = true;
+ p++;
+ } else
+ *neg = false;
+ if (!isdigit(*p))
+ return -EINVAL;
+
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
+
+ len = p - tmp;
+
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ if (len == TMPBUFLEN - 1)
+ return -EINVAL;
+
+ if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ return -EINVAL;
+
+ if (tr && (len < *size))
+ *tr = *p;
+
+ *buf += len;
+ *size -= len;
+
+ return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success @buf and @size are updated with the amount of bytes
+ * written.
+ */
+static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg)
+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ memcpy(*buf, tmp, len);
+ *size -= len;
+ *buf += len;
+}
+#undef TMPBUFLEN
+
+static void proc_put_char(void **buf, size_t *size, char c)
+{
+ if (*size) {
+ char **buffer = (char **)buf;
+ **buffer = c;
+
+ (*size)--;
+ (*buffer)++;
+ *buf = *buffer;
+ }
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = -(unsigned long)val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ int *i, vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first=0) {
+ unsigned long lval;
+ bool neg;
+
+ if (write) {
+ left -= proc_skip_spaces(&p);
+
+ if (!left)
+ break;
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
+ break;
+ }
+ } else {
+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;
+ break;
+ }
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, lval, neg);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err && left)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *p = buffer;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ proc_put_long(&buffer, &left, lval, false);
+ if (!left)
+ goto out;
+
+ proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+#endif
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
+}
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ int i;
+ for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ if ((tmptaint >> i) & 1)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+ }
+
+ return err;
+}
+
#ifdef CONFIG_PRINTK
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or -EINVAL on write when the range check fails.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
+
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ int ret;
+ unsigned int tmp;
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
+
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -ERANGE;
+
+ *valp = tmp;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success or -ERANGE on write when the range check fails.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned int val;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, NULL);
+}
+
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING
+"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+"Pipe handler or fully qualified core dump path required.\n"
+"Set kernel.core_pattern before fs.suid_dumpable.\n"
+ );
+ }
+#endif
+}
+
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
#endif
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
-#endif
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmp, ret;
-static struct ctl_table kern_table[];
-static struct ctl_table vm_table[];
-static struct ctl_table fs_table[];
-static struct ctl_table debug_table[];
-static struct ctl_table dev_table[];
-extern struct ctl_table random_table[];
-#ifdef CONFIG_EPOLL
-extern struct ctl_table epoll_table[];
-#endif
+ tmp = sysrq_mask();
-#ifdef CONFIG_FW_LOADER_USER_HELPER
-extern struct ctl_table firmware_config_table[];
-#endif
+ ret = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (ret || !write)
+ return ret;
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
-int sysctl_legacy_va_layout;
+ if (write)
+ sysrq_toggle_support(tmp);
+
+ return 0;
+}
#endif
-/* The default sysctl tables: */
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos,
+ unsigned long convmul, unsigned long convdiv)
+{
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ size_t left;
+ char *p;
-static struct ctl_table sysctl_base_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = kern_table,
- },
- {
- .procname = "vm",
- .mode = 0555,
- .child = vm_table,
- },
- {
- .procname = "fs",
- .mode = 0555,
- .child = fs_table,
- },
- {
- .procname = "debug",
- .mode = 0555,
- .child = debug_table,
- },
- {
- .procname = "dev",
- .mode = 0555,
- .child = dev_table,
- },
- { }
-};
+ if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
-#ifdef CONFIG_COMPACTION
-static int min_extfrag_threshold;
-static int max_extfrag_threshold = 1000;
-#endif
+ if (write) {
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ p = buffer;
+ }
+
+ for (; left && vleft--; i++, first = 0) {
+ unsigned long val;
+
+ if (write) {
+ bool neg;
+
+ left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
+
+ err = proc_get_long(&p, &left, &val, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (neg)
+ continue;
+ val = convmul * val / convdiv;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
+ *i = val;
+ } else {
+ val = convdiv * (*i) / convmul;
+ if (!first)
+ proc_put_char(&buffer, &left, '\t');
+ proc_put_long(&buffer, &left, val, false);
+ }
+ }
+
+ if (!write && !first && left && !err)
+ proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ left -= proc_skip_spaces(&p);
+ if (write && first)
+ return err ? : -EINVAL;
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > INT_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = -(unsigned long)val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+ bool first = 1;
+ size_t left = *lenp;
+ unsigned long bitmap_len = table->maxlen;
+ unsigned long *bitmap = *(unsigned long **) table->data;
+ unsigned long *tmp_bitmap = NULL;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ char *p = buffer;
+ size_t skipped = 0;
+
+ if (left > PAGE_SIZE - 1) {
+ left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
+
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
+ if (!tmp_bitmap)
+ return -ENOMEM;
+ proc_skip_char(&p, &left, '\n');
+ while (!err && left) {
+ unsigned long val_a, val_b;
+ bool neg;
+ size_t saved_left;
+
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
+ sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_a >= bitmap_len || neg) {
+ err = -EINVAL;
+ break;
+ }
+
+ val_b = val_a;
+ if (left) {
+ p++;
+ left--;
+ }
+
+ if (c == '-') {
+ err = proc_get_long(&p, &left, &val_b,
+ &neg, tr_b, sizeof(tr_b),
+ &c);
+ /*
+ * If we consumed all of a truncated buffer or
+ * then stop here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
+ if (err)
+ break;
+ if (val_b >= bitmap_len || neg ||
+ val_a > val_b) {
+ err = -EINVAL;
+ break;
+ }
+ if (left) {
+ p++;
+ left--;
+ }
+ }
+
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ first = 0;
+ proc_skip_char(&p, &left, '\n');
+ }
+ left += skipped;
+ } else {
+ unsigned long bit_a, bit_b = 0;
+
+ while (left) {
+ bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ if (bit_a >= bitmap_len)
+ break;
+ bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ bit_a + 1) - 1;
+
+ if (!first)
+ proc_put_char(&buffer, &left, ',');
+ proc_put_long(&buffer, &left, bit_a, false);
+ if (bit_a != bit_b) {
+ proc_put_char(&buffer, &left, '-');
+ proc_put_long(&buffer, &left, bit_b, false);
+ }
+
+ first = 0; bit_b++;
+ }
+ proc_put_char(&buffer, &left, '\n');
+ }
+
+ if (!err) {
+ if (write) {
+ if (*ppos)
+ bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ else
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ }
+ *lenp -= left;
+ *ppos += *lenp;
+ }
+
+ bitmap_free(tmp_bitmap);
+ return err;
+}
+
+#else /* CONFIG_PROC_SYSCTL */
+
+int proc_dostring(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_SYSCTL */
+
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (val)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ }
+ mutex_unlock(&static_key_mutex);
+ return ret;
+}
static struct ctl_table kern_table[] = {
{
@@ -614,7 +1953,7 @@ static struct ctl_table kern_table[] = {
.procname = "soft-power",
.data = &pwrsw_enabled,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -995,30 +2334,32 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_X86)
+
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
{
- .procname = "panic_on_unrecovered_nmi",
- .data = &panic_on_unrecovered_nmi,
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
+#if defined(CONFIG_X86)
{
- .procname = "panic_on_io_nmi",
- .data = &panic_on_io_nmi,
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
{
- .procname = "panic_on_stackoverflow",
- .data = &sysctl_panic_on_stackoverflow,
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
{
.procname = "bootloader_type",
.data = &bootloader_type,
@@ -1073,7 +2414,7 @@ static struct ctl_table kern_table[] = {
.procname = "ignore-unaligned-usertrap",
.data = &no_unaligned_warning,
.maxlen = sizeof (int),
- .mode = 0644,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
@@ -1245,7 +2586,7 @@ static struct ctl_table kern_table[] = {
.data = &bpf_stats_enabled_key.key,
.maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_do_static_key,
+ .proc_handler = bpf_stats_handler,
},
#endif
#if defined(CONFIG_TREE_RCU)
@@ -1321,7 +2662,7 @@ static struct ctl_table vm_table[] = {
.proc_handler = overcommit_kbytes_handler,
},
{
- .procname = "page-cluster",
+ .procname = "page-cluster",
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
@@ -1492,7 +2833,7 @@ static struct ctl_table vm_table[] = {
.data = &watermark_boost_factor,
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
- .proc_handler = watermark_boost_factor_sysctl_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
@@ -1960,6 +3301,35 @@ static struct ctl_table dev_table[] = {
{ }
};
+static struct ctl_table sysctl_base_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+ { }
+};
+
int __init sysctl_init(void)
{
struct ctl_table_header *hdr;
@@ -1968,1474 +3338,7 @@ int __init sysctl_init(void)
kmemleak_not_leak(hdr);
return 0;
}
-
#endif /* CONFIG_SYSCTL */
-
-/*
- * /proc/sys support
- */
-
-#ifdef CONFIG_PROC_SYSCTL
-
-static int _proc_do_string(char *data, int maxlen, int write,
- char __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- char __user *p;
- char c;
-
- if (!data || !maxlen || !*lenp) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
- /* Only continue writes not past the end of buffer. */
- len = strlen(data);
- if (len > maxlen - 1)
- len = maxlen - 1;
-
- if (*ppos > len)
- return 0;
- len = *ppos;
- } else {
- /* Start writing from beginning of buffer. */
- len = 0;
- }
-
- *ppos += *lenp;
- p = buffer;
- while ((p - buffer) < *lenp && len < maxlen - 1) {
- if (get_user(c, p++))
- return -EFAULT;
- if (c == 0 || c == '\n')
- break;
- data[len++] = c;
- }
- data[len] = 0;
- } else {
- len = strlen(data);
- if (len > maxlen)
- len = maxlen;
-
- if (*ppos > len) {
- *lenp = 0;
- return 0;
- }
-
- data += *ppos;
- len -= *ppos;
-
- if (len > *lenp)
- len = *lenp;
- if (len)
- if (copy_to_user(buffer, data, len))
- return -EFAULT;
- if (len < *lenp) {
- if (put_user('\n', buffer + len))
- return -EFAULT;
- len++;
- }
- *lenp = len;
- *ppos += len;
- }
- return 0;
-}
-
-static void warn_sysctl_write(struct ctl_table *table)
-{
- pr_warn_once("%s wrote to %s when file position was not 0!\n"
- "This will not be supported in the future. To silence this\n"
- "warning, set kernel.sysctl_writes_strict = -1\n",
- current->comm, table->procname);
-}
-
-/**
- * proc_first_pos_non_zero_ignore - check if first position is allowed
- * @ppos: file position
- * @table: the sysctl table
- *
- * Returns true if the first position is non-zero and the sysctl_writes_strict
- * mode indicates this is not allowed for numeric input types. String proc
- * handlers can ignore the return value.
- */
-static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
- struct ctl_table *table)
-{
- if (!*ppos)
- return false;
-
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- return true;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- return false;
- default:
- return false;
- }
-}
-
-/**
- * proc_dostring - read a string sysctl
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes a string from/to the user buffer. If the kernel
- * buffer provided is not large enough to hold the string, the
- * string is truncated. The copied string is %NULL-terminated.
- * If the string is being read by the user process, it is copied
- * and a newline '\n' is added. It is truncated if the buffer is
- * not large enough.
- *
- * Returns 0 on success.
- */
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write)
- proc_first_pos_non_zero_ignore(ppos, table);
-
- return _proc_do_string((char *)(table->data), table->maxlen, write,
- (char __user *)buffer, lenp, ppos);
-}
-
-static size_t proc_skip_spaces(char **buf)
-{
- size_t ret;
- char *tmp = skip_spaces(*buf);
- ret = tmp - *buf;
- *buf = tmp;
- return ret;
-}
-
-static void proc_skip_char(char **buf, size_t *size, const char v)
-{
- while (*size) {
- if (**buf != v)
- break;
- (*size)--;
- (*buf)++;
- }
-}
-
-/**
- * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
- * fail on overflow
- *
- * @cp: kernel buffer containing the string to parse
- * @endp: pointer to store the trailing characters
- * @base: the base to use
- * @res: where the parsed integer will be stored
- *
- * In case of success 0 is returned and @res will contain the parsed integer,
- * @endp will hold any trailing characters.
- * This function will fail the parse on overflow. If there wasn't an overflow
- * the function will defer the decision what characters count as invalid to the
- * caller.
- */
-static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
- unsigned long *res)
-{
- unsigned long long result;
- unsigned int rv;
-
- cp = _parse_integer_fixup_radix(cp, &base);
- rv = _parse_integer(cp, base, &result);
- if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
- return -ERANGE;
-
- cp += rv;
-
- if (endp)
- *endp = (char *)cp;
-
- *res = (unsigned long)result;
- return 0;
-}
-
-#define TMPBUFLEN 22
-/**
- * proc_get_long - reads an ASCII formatted integer from a user buffer
- *
- * @buf: a kernel buffer
- * @size: size of the kernel buffer
- * @val: this is where the number will be stored
- * @neg: set to %TRUE if number is negative
- * @perm_tr: a vector which contains the allowed trailers
- * @perm_tr_len: size of the perm_tr vector
- * @tr: pointer to store the trailer character
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes read. If @tr is non-NULL and a trailing
- * character exists (size is non-zero after returning from this
- * function), @tr is updated with the trailing character.
- */
-static int proc_get_long(char **buf, size_t *size,
- unsigned long *val, bool *neg,
- const char *perm_tr, unsigned perm_tr_len, char *tr)
-{
- int len;
- char *p, tmp[TMPBUFLEN];
-
- if (!*size)
- return -EINVAL;
-
- len = *size;
- if (len > TMPBUFLEN - 1)
- len = TMPBUFLEN - 1;
-
- memcpy(tmp, *buf, len);
-
- tmp[len] = 0;
- p = tmp;
- if (*p == '-' && *size > 1) {
- *neg = true;
- p++;
- } else
- *neg = false;
- if (!isdigit(*p))
- return -EINVAL;
-
- if (strtoul_lenient(p, &p, 0, val))
- return -EINVAL;
-
- len = p - tmp;
-
- /* We don't know if the next char is whitespace thus we may accept
- * invalid integers (e.g. 1234...a) or two integers instead of one
- * (e.g. 123...1). So lets not allow such large numbers. */
- if (len == TMPBUFLEN - 1)
- return -EINVAL;
-
- if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
- return -EINVAL;
-
- if (tr && (len < *size))
- *tr = *p;
-
- *buf += len;
- *size -= len;
-
- return 0;
-}
-
-/**
- * proc_put_long - converts an integer to a decimal ASCII formatted string
- *
- * @buf: the user buffer
- * @size: the size of the user buffer
- * @val: the integer to be converted
- * @neg: sign of the number, %TRUE for negative
- *
- * In case of success %0 is returned and @buf and @size are updated with
- * the amount of bytes written.
- */
-static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
- bool neg)
-{
- int len;
- char tmp[TMPBUFLEN], *p = tmp;
-
- sprintf(p, "%s%lu", neg ? "-" : "", val);
- len = strlen(tmp);
- if (len > *size)
- len = *size;
- if (copy_to_user(*buf, tmp, len))
- return -EFAULT;
- *size -= len;
- *buf += len;
- return 0;
-}
-#undef TMPBUFLEN
-
-static int proc_put_char(void __user **buf, size_t *size, char c)
-{
- if (*size) {
- char __user **buffer = (char __user **)buf;
- if (put_user(c, *buffer))
- return -EFAULT;
- (*size)--, (*buffer)++;
- *buf = *buffer;
- }
- return 0;
-}
-
-static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*negp) {
- if (*lvalp > (unsigned long) INT_MAX + 1)
- return -EINVAL;
- *valp = -*lvalp;
- } else {
- if (*lvalp > (unsigned long) INT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- }
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
- }
- return 0;
-}
-
-static int do_proc_douintvec_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > UINT_MAX)
- return -EINVAL;
- *valp = *lvalp;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long)val;
- }
- return 0;
-}
-
-static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
-
-static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- int *i, vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
- left = *lenp;
-
- if (!conv)
- conv = do_proc_dointvec_conv;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first=0) {
- unsigned long lval;
- bool neg;
-
- if (write) {
- left -= proc_skip_spaces(&p);
-
- if (!left)
- break;
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (conv(&neg, &lval, i, 1, data)) {
- err = -EINVAL;
- break;
- }
- } else {
- if (conv(&neg, &lval, i, 0, data)) {
- err = -EINVAL;
- break;
- }
- if (!first)
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, lval, neg);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err && left)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_dointvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec_w(unsigned int *tbl_data,
- struct ctl_table *table,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
- bool neg;
- char *kbuf = NULL, *p;
-
- left = *lenp;
-
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto bail_early;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return -EINVAL;
-
- left -= proc_skip_spaces(&p);
- if (!left) {
- err = -EINVAL;
- goto out_free;
- }
-
- err = proc_get_long(&p, &left, &lval, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err || neg) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (conv(&lval, tbl_data, 1, data)) {
- err = -EINVAL;
- goto out_free;
- }
-
- if (!err && left)
- left -= proc_skip_spaces(&p);
-
-out_free:
- kfree(kbuf);
- if (err)
- return -EINVAL;
-
- return 0;
-
- /* This is in keeping with old __do_proc_dointvec() */
-bail_early:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned long lval;
- int err = 0;
- size_t left;
-
- left = *lenp;
-
- if (conv(&lval, tbl_data, 0, data)) {
- err = -EINVAL;
- goto out;
- }
-
- err = proc_put_long(&buffer, &left, lval, false);
- if (err || !left)
- goto out;
-
- err = proc_put_char(&buffer, &left, '\n');
-
-out:
- *lenp -= left;
- *ppos += *lenp;
-
- return err;
-}
-
-static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- unsigned int *i, vleft;
-
- if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned int *) tbl_data;
- vleft = table->maxlen / sizeof(*i);
-
- /*
- * Arrays are not supported, keep this simple. *Do not* add
- * support for them.
- */
- if (vleft != 1) {
- *lenp = 0;
- return -EINVAL;
- }
-
- if (!conv)
- conv = do_proc_douintvec_conv;
-
- if (write)
- return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
- conv, data);
- return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
-}
-
-static int do_proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos,
- int (*conv)(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data),
- void *data)
-{
- return __do_proc_douintvec(table->data, table, write,
- buffer, lenp, ppos, conv, data);
-}
-
-/**
- * proc_dointvec - read a vector of integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
-}
-
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
-/**
- * proc_douintvec - read a vector of unsigned integers
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * Returns 0 on success.
- */
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
-}
-
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- int i;
- for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
- if ((tmptaint >> i) & 1)
- add_taint(i, LOCKDEP_STILL_OK);
- }
- }
-
- return err;
-}
-
-#ifdef CONFIG_PRINTK
-static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
-#endif
-
-/**
- * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_dointvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_dointvec_minmax() handler.
- */
-struct do_proc_dointvec_minmax_conv_param {
- int *min;
- int *max;
-};
-
-static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- int tmp, ret;
- struct do_proc_dointvec_minmax_conv_param *param = data;
- /*
- * If writing, first do so via a temporary local int so we can
- * bounds-check it before touching *valp.
- */
- int *ip = write ? &tmp : valp;
-
- ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -EINVAL;
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_dointvec_minmax - read a vector of integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success or -EINVAL on write when the range check fails.
- */
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_dointvec_minmax_conv_param param = {
- .min = (int *) table->extra1,
- .max = (int *) table->extra2,
- };
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_minmax_conv, &param);
-}
-
-/**
- * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
- * @min: pointer to minimum allowable value
- * @max: pointer to maximum allowable value
- *
- * The do_proc_douintvec_minmax_conv_param structure provides the
- * minimum and maximum values for doing range checking for those sysctl
- * parameters that use the proc_douintvec_minmax() handler.
- */
-struct do_proc_douintvec_minmax_conv_param {
- unsigned int *min;
- unsigned int *max;
-};
-
-static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- int ret;
- unsigned int tmp;
- struct do_proc_douintvec_minmax_conv_param *param = data;
- /* write via temporary local uint for bounds-checking */
- unsigned int *up = write ? &tmp : valp;
-
- ret = do_proc_douintvec_conv(lvalp, up, write, data);
- if (ret)
- return ret;
-
- if (write) {
- if ((param->min && *param->min > tmp) ||
- (param->max && *param->max < tmp))
- return -ERANGE;
-
- *valp = tmp;
- }
-
- return 0;
-}
-
-/**
- * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
- * values from/to the user buffer, treated as an ASCII string. Negative
- * strings are not allowed.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max). There is a final sanity
- * check for UINT_MAX to avoid having to support wrap around uses from
- * userspace.
- *
- * Returns 0 on success or -ERANGE on write when the range check fails.
- */
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct do_proc_douintvec_minmax_conv_param param = {
- .min = (unsigned int *) table->extra1,
- .max = (unsigned int *) table->extra2,
- };
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_minmax_conv, &param);
-}
-
-static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
- unsigned int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned int val;
-
- val = round_pipe_size(*lvalp);
- if (val == 0)
- return -EINVAL;
-
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
- }
-
- return 0;
-}
-
-static int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, NULL);
-}
-
-static void validate_coredump_safety(void)
-{
-#ifdef CONFIG_COREDUMP
- if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
- printk(KERN_WARNING
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
- }
-#endif
-}
-
-static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-
-#ifdef CONFIG_COREDUMP
-static int proc_dostring_coredump(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
- if (!error)
- validate_coredump_safety();
- return error;
-}
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- unsigned long *i, *min, *max;
- int vleft, first = 1, err = 0;
- size_t left;
- char *kbuf = NULL, *p;
-
- if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- i = (unsigned long *) data;
- min = (unsigned long *) table->extra1;
- max = (unsigned long *) table->extra2;
- vleft = table->maxlen / sizeof(unsigned long);
- left = *lenp;
-
- if (write) {
- if (proc_first_pos_non_zero_ignore(ppos, table))
- goto out;
-
- if (left > PAGE_SIZE - 1)
- left = PAGE_SIZE - 1;
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- }
-
- for (; left && vleft--; i++, first = 0) {
- unsigned long val;
-
- if (write) {
- bool neg;
-
- left -= proc_skip_spaces(&p);
- if (!left)
- break;
-
- err = proc_get_long(&p, &left, &val, &neg,
- proc_wspace_sep,
- sizeof(proc_wspace_sep), NULL);
- if (err)
- break;
- if (neg)
- continue;
- val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max)) {
- err = -EINVAL;
- break;
- }
- *i = val;
- } else {
- val = convdiv * (*i) / convmul;
- if (!first) {
- err = proc_put_char(&buffer, &left, '\t');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, val, false);
- if (err)
- break;
- }
- }
-
- if (!write && !first && left && !err)
- err = proc_put_char(&buffer, &left, '\n');
- if (write && !err)
- left -= proc_skip_spaces(&p);
- if (write) {
- kfree(kbuf);
- if (first)
- return err ? : -EINVAL;
- }
- *lenp -= left;
-out:
- *ppos += *lenp;
- return err;
-}
-
-static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos,
- unsigned long convmul,
- unsigned long convdiv)
-{
- return __do_proc_doulongvec_minmax(table->data, table, write,
- buffer, lenp, ppos, convmul, convdiv);
-}
-
-/**
- * proc_doulongvec_minmax - read a vector of long integers with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
-}
-
-/**
- * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
- * values from/to the user buffer, treated as an ASCII string. The values
- * are treated as milliseconds, and converted to jiffies when they are stored.
- *
- * This routine will ensure the values are within the range specified by
- * table->extra1 (min) and table->extra2 (max).
- *
- * Returns 0 on success.
- */
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return do_proc_doulongvec_minmax(table, write, buffer,
- lenp, ppos, HZ, 1000l);
-}
-
-
-static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (*lvalp > INT_MAX / HZ)
- return 1;
- *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = lval / HZ;
- }
- return 0;
-}
-
-static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
- return 1;
- *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_clock_t(lval);
- }
- return 0;
-}
-
-static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
-{
- if (write) {
- unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
-
- if (jif > INT_MAX)
- return 1;
- *valp = (int)jif;
- } else {
- int val = *valp;
- unsigned long lval;
- if (val < 0) {
- *negp = true;
- lval = -(unsigned long)val;
- } else {
- *negp = false;
- lval = (unsigned long)val;
- }
- *lvalp = jiffies_to_msecs(lval);
- }
- return 0;
-}
-
-/**
- * proc_dointvec_jiffies - read a vector of integers as seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in seconds, and are converted into
- * jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: pointer to the file position
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/USER_HZ seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- do_proc_dointvec_userhz_jiffies_conv,NULL);
-}
-
-/**
- * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- * @ppos: the current position in the file
- *
- * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
- * values from/to the user buffer, treated as an ASCII string.
- * The values read are assumed to be in 1/1000 seconds, and
- * are converted into jiffies.
- *
- * Returns 0 on success.
- */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_dointvec_ms_jiffies_conv, NULL);
-}
-
-static int proc_do_cad_pid(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
-/**
- * proc_do_large_bitmap - read/write from/to a large bitmap
- * @table: the sysctl table
- * @write: %TRUE if this is a write to the sysctl file
- * @buffer: the user buffer
- * @lenp: the size of the user buffer
- * @ppos: file position
- *
- * The bitmap is stored at table->data and the bitmap length (in bits)
- * in table->maxlen.
- *
- * We use a range comma separated format (e.g. 1,3-4,10-10) so that
- * large bitmaps may be represented in a compact manner. Writing into
- * the file will clear the bitmap then update it with the given input.
- *
- * Returns 0 on success.
- */
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int err = 0;
- bool first = 1;
- size_t left = *lenp;
- unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = *(unsigned long **) table->data;
- unsigned long *tmp_bitmap = NULL;
- char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
-
- if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- char *kbuf, *p;
- size_t skipped = 0;
-
- if (left > PAGE_SIZE - 1) {
- left = PAGE_SIZE - 1;
- /* How much of the buffer we'll skip this pass */
- skipped = *lenp - left;
- }
-
- p = kbuf = memdup_user_nul(buffer, left);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
- if (!tmp_bitmap) {
- kfree(kbuf);
- return -ENOMEM;
- }
- proc_skip_char(&p, &left, '\n');
- while (!err && left) {
- unsigned long val_a, val_b;
- bool neg;
- size_t saved_left;
-
- /* In case we stop parsing mid-number, we can reset */
- saved_left = left;
- err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
- sizeof(tr_a), &c);
- /*
- * If we consumed the entirety of a truncated buffer or
- * only one char is left (may be a "-"), then stop here,
- * reset, & come back for more.
- */
- if ((left <= 1) && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_a >= bitmap_len || neg) {
- err = -EINVAL;
- break;
- }
-
- val_b = val_a;
- if (left) {
- p++;
- left--;
- }
-
- if (c == '-') {
- err = proc_get_long(&p, &left, &val_b,
- &neg, tr_b, sizeof(tr_b),
- &c);
- /*
- * If we consumed all of a truncated buffer or
- * then stop here, reset, & come back for more.
- */
- if (!left && skipped) {
- left = saved_left;
- break;
- }
-
- if (err)
- break;
- if (val_b >= bitmap_len || neg ||
- val_a > val_b) {
- err = -EINVAL;
- break;
- }
- if (left) {
- p++;
- left--;
- }
- }
-
- bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
- first = 0;
- proc_skip_char(&p, &left, '\n');
- }
- kfree(kbuf);
- left += skipped;
- } else {
- unsigned long bit_a, bit_b = 0;
-
- while (left) {
- bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
- if (bit_a >= bitmap_len)
- break;
- bit_b = find_next_zero_bit(bitmap, bitmap_len,
- bit_a + 1) - 1;
-
- if (!first) {
- err = proc_put_char(&buffer, &left, ',');
- if (err)
- break;
- }
- err = proc_put_long(&buffer, &left, bit_a, false);
- if (err)
- break;
- if (bit_a != bit_b) {
- err = proc_put_char(&buffer, &left, '-');
- if (err)
- break;
- err = proc_put_long(&buffer, &left, bit_b, false);
- if (err)
- break;
- }
-
- first = 0; bit_b++;
- }
- if (!err)
- err = proc_put_char(&buffer, &left, '\n');
- }
-
- if (!err) {
- if (write) {
- if (*ppos)
- bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
- else
- bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
- }
- *lenp -= left;
- *ppos += *lenp;
- }
-
- bitmap_free(tmp_bitmap);
- return err;
-}
-
-#else /* CONFIG_PROC_SYSCTL */
-
-int proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_douintvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_do_large_bitmap(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-#endif /* CONFIG_PROC_SYSCTL */
-
-#if defined(CONFIG_SYSCTL)
-int proc_do_static_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- struct static_key *key = (struct static_key *)table->data;
- static DEFINE_MUTEX(static_key_mutex);
- int val, ret;
- struct ctl_table tmp = {
- .data = &val,
- .maxlen = sizeof(val),
- .mode = table->mode,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- };
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&static_key_mutex);
- val = static_key_enabled(key);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
- if (write && !ret) {
- if (val)
- static_key_enable(key);
- else
- static_key_disable(key);
- }
- mutex_unlock(&static_key_mutex);
- return ret;
-}
-#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 53bce347cd50..5d9fc22d836a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int timens_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
int err;
@@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
return -EUSERS;
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
timens_set_vvar_page(current, ns);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5221abb4594..398e6eadb861 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -249,8 +249,7 @@ void timers_update_nohz(void)
}
int timer_migration_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 92ba69b716dc..3744372a24e2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -147,7 +147,7 @@ BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_proto = {
+const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -167,7 +167,7 @@ BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -198,7 +198,7 @@ BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.func = bpf_probe_read_kernel,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -253,7 +253,7 @@ BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.func = bpf_probe_read_kernel_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
@@ -487,6 +490,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+#define MAX_SEQ_PRINTF_VARARGS 12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
+#define MAX_SEQ_PRINTF_STR_LEN 128
+
+struct bpf_seq_printf_buf {
+ char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+ const void *, data, u32, data_len)
+{
+ int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+ int i, buf_used, copy_size, num_args;
+ u64 params[MAX_SEQ_PRINTF_VARARGS];
+ struct bpf_seq_printf_buf *bufs;
+ const u64 *args = data;
+
+ buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+ if (WARN_ON_ONCE(buf_used > 1)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ goto out;
+
+ if (data_len & 7)
+ goto out;
+
+ for (i = 0; i < fmt_size; i++) {
+ if (fmt[i] == '%') {
+ if (fmt[i + 1] == '%')
+ i++;
+ else if (!data || !data_len)
+ goto out;
+ }
+ }
+
+ num_args = data_len / 8;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ /* only printable ascii for now. */
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ if (fmt_cnt >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formating field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 's') {
+ /* try our best to copy */
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt],
+ (void *) (long) args[fmt_cnt],
+ MAX_SEQ_PRINTF_STR_LEN);
+ if (err < 0)
+ bufs->buf[memcpy_cnt][0] = '\0';
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'p') {
+ if (fmt[i + 1] == 0 ||
+ fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x') {
+ /* just kernel pointers */
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
+ err = -EINVAL;
+ goto out;
+ }
+ if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+
+ copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+ err = probe_kernel_read(bufs->buf[memcpy_cnt],
+ (void *) (long) args[fmt_cnt],
+ copy_size);
+ if (err < 0)
+ memset(bufs->buf[memcpy_cnt], 0, copy_size);
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ i += 2;
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ i++;
+ if (fmt[i] == 'l')
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ }
+
+ /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
+ * all of them to seq_printf().
+ */
+ seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+ params[4], params[5], params[6], params[7], params[8],
+ params[9], params[10], params[11]);
+
+ err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+ this_cpu_dec(bpf_seq_printf_buf_used);
+ return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+ .func = bpf_seq_printf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+ return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+ .func = bpf_seq_write,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_write_btf_ids,
+};
+
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@@ -698,7 +907,7 @@ BPF_CALL_0(bpf_get_current_task)
return (long) current;
}
-static const struct bpf_func_proto bpf_get_current_task_proto = {
+const struct bpf_func_proto bpf_get_current_task_proto = {
.func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -827,6 +1036,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
@@ -877,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
default:
return NULL;
}
@@ -1246,7 +1467,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
-static const struct bpf_func_proto *
+const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -1256,6 +1477,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
#endif
+ case BPF_FUNC_seq_printf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_proto :
+ NULL;
+ case BPF_FUNC_seq_write:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_write_proto :
+ NULL;
default:
return raw_tp_prog_func_proto(func_id, prog);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f12e99b387b2..3ab27022c20f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2662,7 +2662,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
@@ -6305,13 +6305,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
__free_page(spd->pages[idx]);
}
-static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = generic_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
@@ -6373,7 +6366,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &tracing_pipe_buf_ops,
+ .ops = &default_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
ssize_t ret;
@@ -7582,9 +7575,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 4d8e99fdbbbe..fb0691b8a88d 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,24 @@
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
+/*
+ * Like trace_hardirqs_on() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_on_prepare(void)
+{
+ if (this_cpu_read(tracing_irq_cpu)) {
+ if (!in_nmi())
+ trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
+ this_cpu_write(tracing_irq_cpu, 0);
+ }
+}
+EXPORT_SYMBOL(trace_hardirqs_on_prepare);
+NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
+
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
@@ -28,11 +46,31 @@ void trace_hardirqs_on(void)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);
+/*
+ * Like trace_hardirqs_off() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_off_prepare(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu)) {
+ this_cpu_write(tracing_irq_cpu, 1);
+ tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
+ if (!in_nmi())
+ trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ }
+
+}
+EXPORT_SYMBOL(trace_hardirqs_off_prepare);
+NOKPROBE_SYMBOL(trace_hardirqs_off_prepare);
+
void trace_hardirqs_off(void)
{
if (!this_cpu_read(tracing_irq_cpu)) {
@@ -56,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
diff --git a/kernel/umh.c b/kernel/umh.c
index 3474d6aa55d8..79f139a7ca03 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -641,7 +641,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8eadadc478f9..87804e0371fe 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns)
put_user_ns(to_user_ns(ns));
}
-static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
struct user_namespace *user_ns = to_user_ns(ns);
struct cred *cred;
@@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- cred = prepare_creds();
+ cred = nsset_cred(nsset);
if (!cred)
- return -ENOMEM;
+ return -EINVAL;
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
- return commit_creds(cred);
+ return 0;
}
struct ns_common *ns_get_owner(struct ns_common *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f0e491193009..e488d0e2ab45 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns)
put_uts_ns(to_uts_ns(ns));
}
-static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct uts_namespace *ns = to_uts_ns(new);
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
get_uts_ns(ns);
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3732c888a949..4ca61d49885b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table)
* to observe. Should this be in kernel/sys.c ????
*/
static int proc_do_uts_string(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b6b1f54a7837..53ff2c81b084 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -661,7 +661,7 @@ static void proc_watchdog_update(void)
* proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
*/
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old, *param = table->data;
@@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog
*/
int proc_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/nmi_watchdog
*/
int proc_nmi_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!nmi_watchdog_available && write)
return -ENOTSUPP;
@@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/soft_watchdog
*/
int proc_soft_watchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
table, write, buffer, lenp, ppos);
@@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
* /proc/sys/kernel/watchdog_thresh
*/
int proc_watchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err, old;
@@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
* been brought online, if desired.
*/
int proc_watchdog_cpumask(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int err;